In [55]:
from zipfile import ZipFile

import os
import pandas as pd
import networkx as nx

# Extract from zip file

In [2]:
# Root of the local data tree. NOTE: machine-specific absolute path; adjust it
# when running the notebook elsewhere. The trailing slash is kept as-is so any
# code that concatenates onto DATA_DIRECTORY keeps working.
DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data/"

# Derived locations are built with os.path.join so the trailing slash of
# DATA_DIRECTORY does not produce a doubled separator ("data//raw_data").
RAW_DATA_DIRECTORY = os.path.join(DATA_DIRECTORY, "raw_data")
PPI_DIRECTORY = os.path.join(DATA_DIRECTORY, "processed_data", "PPIs")

# BioGRID release to use and where its per-organism archive is stored locally.
VERSION = "3.5.184"
BioGRID_FILENAME = f"BIOGRID-ORGANISM-{VERSION}.tab3.zip"
BioGRID_URL = f"https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-{VERSION}"
BioGRID_FILEPATH = os.path.join(RAW_DATA_DIRECTORY, BioGRID_FILENAME)

In [16]:
# Unpack every member of the BioGRID archive (not just one organism's file)
# into PPI_DIRECTORY. extractall() is the standard-library equivalent of
# extracting each name from namelist() in turn.
with ZipFile(BioGRID_FILEPATH, 'r') as z:
    z.extractall(PPI_DIRECTORY)

# Read as dataframes and build PPI networks

In [17]:
# Short species name -> numeric organism identifier.
# NOTE(review): presumably NCBI taxonomy IDs matching the BioGRID organism
# columns — confirm.
organism_code = dict(cerevisiae=559292, sapiens=9606)

In [67]:
# Build one physical protein-protein interaction network per extracted BioGRID
# file and write it out as an edge list named after the organism.
NETWORK_DIRECTORY = os.path.join(DATA_DIRECTORY, "networks", "PPI")
os.makedirs(NETWORK_DIRECTORY, exist_ok=True)  # write_edgelist does not create it

# os.listdir returns entries in arbitrary order; sorting makes the run (and the
# printed summary) deterministic.
for filename in sorted(os.listdir(PPI_DIRECTORY)):
    name_parts = filename.split('-')
    # Skip hidden files (e.g. .DS_Store) and anything lacking the three
    # dash-separated fields the organism name is read from; previously such
    # entries crashed the loop with an IndexError.
    if filename.startswith('.') or len(name_parts) < 3:
        continue
    organism = name_parts[2]

    raw_df = pd.read_csv(os.path.join(PPI_DIRECTORY, filename),
                         delimiter  = '\t',
                         low_memory = False)

    # The file's own organism is taken to be the most frequent value of
    # 'Organism Interactor A'. A dedicated name is used so the module-level
    # `organism_code` lookup dict is no longer overwritten by this loop.
    # groupby sorts its keys, so ties resolve to the smallest ID, exactly as
    # the earlier max(...) over the groups did.
    main_organism_id = raw_df.groupby('Organism Interactor A').size().idxmax()

    # Keep only physical interactions where both partners belong to that organism.
    is_physical_within_species = (
        (raw_df['Organism Interactor A'] == main_organism_id) &
        (raw_df['Organism Interactor B'] == main_organism_id) &
        (raw_df['Experimental System Type'] == 'physical')
    )
    df = raw_df[is_physical_within_species]

    # from_pandas_edgelist builds an undirected nx.Graph by default, so
    # repeated interactor pairs end up as a single edge.
    G = nx.from_pandas_edgelist(df, source = 'BioGRID ID Interactor A',
                                    target = 'BioGRID ID Interactor B')

    N = G.number_of_nodes()
    E = G.number_of_edges()

    print(N, E, organism)

    nx.write_edgelist(G, os.path.join(NETWORK_DIRECTORY, f"{organism}.txt"))

5990 112911 Saccharomyces_cerevisiae_S288c
2728 4792 Rattus_norvegicus
1261 2079 Escherichia_coli_K12_MG1655
17829 370709 Homo_sapiens
7561 23196 Mus_musculus
913 1296 Candida_albicans_SC5314
8882 52238 Drosophila_melanogaster
10348 48459 Arabidopsis_thaliana_Columbia
1218 2499 Plasmodium_falciparum_3D7
5336 19404 Caenorhabditis_elegans
3412 12234 Schizosaccharomyces_pombe_972h
2044 12800 Escherichia_coli_K12_W3110
951 1042 Xenopus_laevis


In [None]:
# Yeast species among the processed organisms. The names carry the strain
# suffix so they match the `organism` strings derived from the BioGRID file
# names (e.g. 'Saccharomyces_cerevisiae_S288c'); the shorter spellings used
# before would never compare equal to them.
yeast_organisms = {'Candida_albicans_SC5314',
                   'Schizosaccharomyces_pombe_972h',
                   'Saccharomyces_cerevisiae_S288c'}