## Case Study: Finding protein-protein interaction pathways of DNA replication factor CDT1

For more information on DNA replication factor CDT1, reference https://www.ncbi.nlm.nih.gov/protein/NP_112190.2

In [1]:
import numpy as np

DESIRED_PROTEIN_NAME = 'CDT1'

In [2]:
def find_all_elements(arr, target):
    arr = np.array(arr)

    occurrences = []
    for i in range(len(arr)):
        for j in range(len(arr[i])):
            if arr[i][j] == target:
                occurrences.append((i, j))  # Append the indices of the element if found
    indices = np.array(occurrences)
    return arr[indices[:,0],:]

In [3]:
def print_first_n_lines(data, n):
    data = np.array(data)
    print(f"Displaying first {n} rows of {data.shape[0]}:")
    for row in data[:n]:
        if isinstance(row, tuple):
            print(*row)
        else:
            print(row)
    print(f"... {data[:,0].size - n} more rows")

### 1) Import ppi confidence values from HIPPIE dataset (```*.mitab```)

#### Human Integrated Protein-Protein Interaction rEference
Download ```HIPPIE tab format``` from https://cbdm-01.zdv.uni-mainz.de/~mschaefer/hippie/download.php as ```HIPPIE-current.mitab```

In [4]:
import parse_mitab

ppis = parse_mitab.ppis
desired_ppis = find_all_elements(ppis, DESIRED_PROTEIN_NAME)
print_first_n_lines(desired_ppis, n=5)

Selected C:/Users/enoch/OneDrive/Documents/GitHub/ppi-prediction-goa/assets/HIPPIE-current.mitab.txt
Displaying first 5 lines:
['AL1A1' 'AL1A1' '0.76']
['ITA7' 'ACHA' '0.73']
['NEB1' 'ACTG' '0.65']
['SRGN' 'CD44' '0.63']
['GRB7' 'ERBB2' '0.9']
Displaying first 5 rows of 79:
['CDT1' 'ORC2' '0.76']
['CDT1' 'SKP2' '0.9']
['CDT1' 'MCM6' '0.97']
['CDT1' 'ORC1' '0.7']
['CDT1' 'CDC6' '0.89']
... 74 more rows


### 2) Import gene ontology annotations of *Homo sapiens* (```*.gaf```)

Download the following entry:
* Species: Homo sapiens
* Database: EBI Gene Ontology Annotation Database (goa)
* Entity type: protein
* Annotations: 638780

from https://current.geneontology.org/products/pages/downloads.html as ```goa_human.gaf```

In [5]:
import parse_gaf

gaf_data = parse_gaf.gaf_data
desired_cols = [2, 3, 4, 6, 8, 13]
human_prots = [[row[i] for i in desired_cols] for row in gaf_data]

go_curr_protein = find_all_elements(human_prots, DESIRED_PROTEIN_NAME)
print_first_n_lines(go_curr_protein, n=5)

Selected C:/Users/enoch/OneDrive/Documents/GitHub/ppi-prediction-goa/assets/goa_human.gaf
Displaying first 5 lines:
['UniProtKB', 'A0A024RBG1', 'NUDT4B', 'enables', 'GO:0003723', 'GO_REF:0000043', 'IEA', 'UniProtKB-KW:KW-0694', 'F', 'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B', 'NUDT4B', 'protein', 'taxon:9606', '20231122', 'UniProt']
['UniProtKB', 'A0A024RBG1', 'NUDT4B', 'enables', 'GO:0046872', 'GO_REF:0000043', 'IEA', 'UniProtKB-KW:KW-0479', 'F', 'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B', 'NUDT4B', 'protein', 'taxon:9606', '20231122', 'UniProt']
['UniProtKB', 'A0A024RBG1', 'NUDT4B', 'located_in', 'GO:0005829', 'GO_REF:0000052', 'IDA', '', 'C', 'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B', 'NUDT4B', 'protein', 'taxon:9606', '20230619', 'HPA']
['UniProtKB', 'A0A075B6H7', 'IGKV3-7', 'involved_in', 'GO:0002250', 'GO_REF:0000043', 'IEA', 'UniProtKB-KW:KW-1064', 'P', 'Probable non-functional immunoglobulin kappa variable 3-7', 'IGKV3-7', 'protein'

In [6]:
go_terms_for_prot = set(go_curr_protein[:,2])
num_go_terms = len(go_terms_for_prot)

go_terms_for_prot = np.array(list(go_terms_for_prot))
go_terms_for_prot = go_terms_for_prot.reshape(-1, 1)
print_first_n_lines(go_terms_for_prot, n=5)

Displaying first 5 rows of 22:
['GO:0035563']
['GO:0000076']
['GO:0005654']
['GO:0071163']
['GO:0033262']
... 17 more rows


### 3) Import the Gene Ontology database (```*.obo```)

Download ```go-basic.obo``` from https://current.geneontology.org/ontology/go-basic.obo

Then, separately run ```go-basic.obo``` in ```obo_to_txt.py``` and get the output file, ```go-basic.txt```

In [None]:
from conversion import obo_to_txt
import process_GO_terms_txt

go_terms = process_GO_terms_txt.go_terms
go_terms = np.array(go_terms)
print_first_n_lines(go_terms, n=5)

In [None]:
collisions = []

for i in range(go_terms[:,0].size):
    if np.any(go_terms_for_prot == go_terms[i,16]):
        collisions.append(go_terms[i,:])

print_first_n_lines(collisions, n=5)