## Testing network data

### As of 2020 Oct 6, the new IntAct data is available for testing

In [20]:
%%bash
# Download the IntAct interactor-pair dump (2020-10-05 release), store it
# gzip-compressed on disk, then print the number of lines that were fetched.

# Stop at the first failing command. `pipefail` makes a failed curl fail the
# whole `curl | gzip` pipeline, so a broken download is reported instead of
# silently passing with an empty or truncated archive.
set -euo pipefail

source_url='ftp://ftp.ebi.ac.uk/pub/databases/intact/various/ot_graphdb/2020-10-05/data/interactor_pair_interactions.json'
target_file='/Users/dsuveges/project/evidences/2020.10.06.interactor_pair_interactions.json.gz'

# -s hides the progress meter; -S still prints the error message on failure.
curl -sS "${source_url}" | gzip > "${target_file}"

# `gzip -dc` is the portable equivalent of `gzcat`.
gzip -dc "${target_file}" | wc -l

  408127


In [21]:
import json
import pandas as pd
import gzip

intact_file = '/Users/dsuveges/project/evidences/2020.10.06.interactor_pair_interactions.json.gz'

# Columns that together identify one unique interaction (used for de-duplication below).
interaction_key_columns = [
    'int_A_id', 'int_B_id', 'int_A_biological_role', 'int_B_biological_role', 'source', 'causal'
]

parsed_interaction_data = []  # one record per line of the input file
parsed_evidence_data = []     # one record per evidence entry, carrying the fields of its interaction

# Open the file and read it line by line; every line is parsed as one JSON document.
with gzip.open(intact_file) as intact_handle:
    for row in intact_handle:
        data = json.loads(row)

        # Fields describing interactor A and the interaction itself:
        interactor_a = data['interactorA']
        interaction_data = {
            'int_A_id': interactor_a['id'],
            'int_A_source': interactor_a["id_source"],
            'int_A_organism': interactor_a["organism"]['mnemonic'],
            'int_A_biological_role': interactor_a["biological_role"],
            'source': data['source_info']['source_database'],
            'causal': data['interaction']['causal_interaction']
        }

        # Fields describing interactor B. When they cannot be read (missing key or a
        # null value in the path), every B field is set to None so that all records
        # share the same set of keys.
        try:
            interactor_b = data['interactorB']
            interactor_b_fields = {
                'int_B_id': interactor_b['id'],
                'int_B_source': interactor_b["id_source"],
                'int_B_organism': interactor_b["organism"]['mnemonic'],
                'int_B_biological_role': interactor_b["biological_role"],
            }
        except (KeyError, TypeError):
            interactor_b_fields = {
                'int_B_id': None,
                'int_B_source': None,
                'int_B_organism': None,
                'int_B_biological_role': None,
            }
        interaction_data.update(interactor_b_fields)

        # Adding interaction to list:
        parsed_interaction_data.append(interaction_data)

        # Extract evidence data:
        for evidence in data['interaction']['evidence']:
            evidence_data = {
                'pmid': evidence['pubmed_id'],
                'interaction_type': evidence['interaction_type_short_name'],
                'interaction_id': evidence['interaction_identifier'],
                'interaction_detection_method': f"{evidence['interaction_detection_method_short_name']} ({evidence['interaction_detection_method_mi_identifier']})"
            }

            # Adding participant detection methods (only kept when the value is a list):
            for method_key in ('participant_detection_method_A', 'participant_detection_method_B'):
                methods = evidence[method_key]
                if isinstance(methods, list):
                    evidence_data[method_key] = [f'{x["short_name"]} ({x["mi_identifier"]})' for x in methods]

            # Every evidence row also carries the fields of the interaction it supports.
            evidence_data.update(interaction_data)
            parsed_evidence_data.append(evidence_data)

# The frame is built from the evidence records (one row per evidence entry), so the
# "Number of evidence" figure below really counts evidence rather than input lines.
intact_df = pd.DataFrame(parsed_evidence_data)

print(f'Number of evidence: {len(intact_df)}')
print(f'Number of unique interactions: {len(intact_df[interaction_key_columns].drop_duplicates())}')

Number of evidence: 746474
Number of unique interactions: 406311


In [18]:
# Show the evidence-level table: one row per evidence entry, combined with the
# fields of the interaction it belongs to.
pd.DataFrame(parsed_evidence_data)

Unnamed: 0,pmid,interaction_type,interaction_id,interaction_detection_method,participant_detection_method_A,participant_detection_method_B,int_A_id,int_A_source,int_A_organism,int_A_biological_role,source,causal,int_B_id,int_B_source,int_B_organism,int_B_biological_role
0,24835392,direct interaction,EBI-12684777,elisa (MI:0411),[antibody detection (MI:0421)],[antibody detection (MI:0421)],A0A024A2C9,uniprotkb,haeif,unspecified role,intact,,P08603-2,uniprotkb,human,unspecified role
1,32296183,physical association,EBI-24521810,validated two hybrid (MI:1356),[nucleotide sequence (MI:0078)],[nucleotide sequence (MI:0078)],A0A024R0L9,uniprotkb,human,unspecified role,intact,,Q93062-3,uniprotkb,human,unspecified role
2,32296183,physical association,EBI-23426250,two hybrid array (MI:0397),[nucleotide sequence (MI:0078)],[nucleotide sequence (MI:0078)],A0A024R0L9,uniprotkb,human,unspecified role,intact,,Q93062-3,uniprotkb,human,unspecified role
3,32296183,physical association,EBI-23201216,two hybrid prey pooling approach (MI:1112),[nucleotide sequence (MI:0078)],[nucleotide sequence (MI:0078)],A0A024R0L9,uniprotkb,human,unspecified role,intact,,Q93062-3,uniprotkb,human,unspecified role
4,17353931,physical association,EBI-1081478,anti bait coip (MI:0006),[sequence tag (MI:0102)],[sequence tag (MI:0102)],A0A024R493,uniprotkb,human,unspecified role,intact,,Q07283,uniprotkb,human,unspecified role
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746469,32227113,phosphorylation,EBI-25609708,protein kinase assay (MI:0424),"[experimental particp (MI:0661), weight autora...",,Q9Y4K4,uniprotkb,human,putative self,intact,,,,,
746470,19524513,phosphorylation,EBI-2367394,protein kinase assay (MI:0424),[weight autoradiogra (MI:0821)],,Q9Y572,uniprotkb,human,putative self,intact,,,,,
746471,17657516,phosphorylation,EBI-1572954,protein kinase assay (MI:0424),[predetermined (MI:0396)],,Q9Y6E0,uniprotkb,human,putative self,intact,,,,,
746472,32227113,phosphorylation,EBI-25609688,protein kinase assay (MI:0424),"[experimental particp (MI:0661), weight autora...",,Q9Y6E0-2,uniprotkb,human,putative self,intact,,,,,
