## load_read_orthoMCLresults.

This notebook reads the output of orthoMCL ortholog files and outputs the results in a format compatabile with downstream processing for L. starkeyi GSM. 

orthoMCL datafile generated from *S. cerevisiae* 288C, *R. toruloides* IFO0880, *Y. lipolytica* CLIB122,*L. starkeyi* NRRL-11557.


orthoMCL files generated Aug 5th, 2023 by Yichao Han. 

import needed libraries. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# load orthoMCL datafile generated from *S. cerevisiae* 288C, *R. toruloides* IFO0880, *Y. lipolytica* CLIB122,*L. starkeyi* NRRL-11557.
ortho = pd.read_table('orthologs.txt',header=None)

In [3]:
ortho.columns = (['org1','org2','score'])
ortho.head(10)

Unnamed: 0,org1,org2,score
0,Lst|G1IA6-16388-MONOMER,Rto|14467,0.347
1,Lst|G1IA6-16390-MONOMER,Rto|10674,0.116
2,Lst|G1IA6-16391-MONOMER,Rto|11380,1.834
3,Lst|G1IA6-16399-MONOMER,Rto|13691,0.585
4,Lst|G1IA6-16404-MONOMER,Rto|16833,1.944
5,Lst|G1IA6-16405-MONOMER,Rto|14225,0.883
6,Lst|G1IA6-16407-MONOMER,Rto|11477,0.284
7,Lst|G1IA6-16408-MONOMER,Rto|16279,0.671
8,Lst|G1IA6-16409-MONOMER,Rto|15873,0.345
9,Lst|G1IA6-16418-MONOMER,Rto|10947,1.944


In [4]:
ortho.shape

(20685, 3)

examine the data structure for one protein. 

In [5]:
ortho[ortho.org1=='Lst|G1IA6-19666-MONOMER']

Unnamed: 0,org1,org2,score
5000,Lst|G1IA6-19666-MONOMER,Sce|YPR172W,0.242
8847,Lst|G1IA6-19666-MONOMER,Yli|G1FV3-24726-MONOMER,0.29


In [6]:
ortho[ortho.org2=='Sce|YPR172W']

Unnamed: 0,org1,org2,score
5000,Lst|G1IA6-19666-MONOMER,Sce|YPR172W,0.242


In [7]:
len(ortho[ortho.org1=='Lst|G1IA6-19666-MONOMER'].org2)

2

In [8]:
temp = set(ortho.org1.to_list())
Lst_genes = [x for x in temp if "Lst" in x]

In [9]:
temp = set(ortho.org1.to_list())
Sce_genes = [x for x in temp if "Sce" in x]

In [10]:
temp = set(ortho.org1.to_list())
Rto_genes = [x for x in temp if "Rto" in x]

In [12]:
# dataframe for storing orthologs to *L. starkeyi*. 
df = pd.DataFrame(index = Lst_genes,columns=['org1','org2','org3','org4','s2','s3','s4'])

# iterate through the *L. starkeyi* genes. 
for i in Lst_genes:
    
    # obtain the orthologs. 
    res = ortho[ortho.org1==i].org2
    score = ortho[ortho.org1==i].score
    
    # if all three species have orthologs with *L. starkeyi*. 
    if len(res)==3:
        df.loc[i,'org2']=ortho[ortho.org1==i].org2.iloc[0]
        df.loc[i,'org3']=ortho[ortho.org1==i].org2.iloc[1]
        df.loc[i,'org4']=ortho[ortho.org1==i].org2.iloc[2]
        df.loc[i,'s2']=ortho[ortho.org1==i].score.iloc[0]
        df.loc[i,'s3']=ortho[ortho.org1==i].score.iloc[1]
        df.loc[i,'s4']=ortho[ortho.org1==i].score.iloc[2]
    
    # store orthologs in proper column. 
    else:
        for v,e in enumerate(res):
            if "Rto" in e:
                df.loc[i,'org2']=e
                df.loc[i,'s2']=score.iloc[v]
            elif "Sce" in e:
                df.loc[i,'org3']=e
                df.loc[i,'s3']=score.iloc[v]                
            elif "Yli" in e:
                df.loc[i,'org4']=e
                df.loc[i,'s4']=score.iloc[v]
            

In [13]:
df.org2.count()


3337

In [14]:
df.org3.count()


3165

In [15]:
df.org4.count()


3855

In [16]:
df.head()

Unnamed: 0,org1,org2,org3,org4,s2,s3,s4
Lst|G1IA6-18760-MONOMER,,Rto|11672,,Yli|G1FV3-29249-MONOMER,0.21,,0.3
Lst|G1IA6-24286-MONOMER,,Rto|12455,,,0.52,,
Lst|G1IA6-22710-MONOMER,,Rto|12880,,,0.668,,
Lst|G1IA6-16893-MONOMER,,Rto|11787,Sce|YOR360C,Yli|G1FV3-30746-MONOMER,0.588,0.409,0.661
Lst|G1IA6-18340-MONOMER,,Rto|12139,Sce|YLR075W,Yli|G1FV3-27651-MONOMER,1.297,1.273,1.148


# next steps.

map into gene name to match GSM. . 

In [17]:
# read in the protein mapper for L. starkeyi NRRL 11557 (maps the JGI gene ID to the CycID). 
Lst_to_norm = pd.read_table('../blastp/lipomyces-protein-map.tab')
Lst_to_norm.head()

Unnamed: 0,LocusTag,CycId
0,Lipst1_1_2452,G1IA6-18788-MONOMER
1,Lipst1_1_66171,G1IA6-23492-MONOMER
2,Lipst1_1_72639,G1IA6-21346-MONOMER
3,Lipst1_1_45489,G1IA6-19613-MONOMER
4,Lipst1_1_2522,G1IA6-18841-MONOMER


In [18]:
# create a mapping dictionary. 
Lst_to_norm_dict = dict(zip(Lst_to_norm.CycId,Lst_to_norm.LocusTag))

In [19]:
Lst_to_norm_dict

{'G1IA6-18788-MONOMER': 'Lipst1_1_2452',
 'G1IA6-23492-MONOMER': 'Lipst1_1_66171',
 'G1IA6-21346-MONOMER': 'Lipst1_1_72639',
 'G1IA6-19613-MONOMER': 'Lipst1_1_45489',
 'G1IA6-18841-MONOMER': 'Lipst1_1_2522',
 'G1IA6-19896-MONOMER': 'Lipst1_1_111440',
 'G1IA6-23493-MONOMER': 'Lipst1_1_6619',
 'G1IA6-19897-MONOMER': 'Lipst1_1_117454',
 'G1IA6-23494-MONOMER': 'Lipst1_1_66199',
 'G1IA6-19898-MONOMER': 'Lipst1_1_117465',
 'G1IA6-23495-MONOMER': 'Lipst1_1_66204',
 'G1IA6-20563-MONOMER': 'Lipst1_1_306050',
 'G1IA6-19311-MONOMER': 'Lipst1_1_260711',
 'G1IA6-19899-MONOMER': 'Lipst1_1_117466',
 'G1IA6-23496-MONOMER': 'Lipst1_1_6621',
 'G1IA6-21347-MONOMER': 'Lipst1_1_72643',
 'G1IA6-19825-MONOMER': 'Lipst1_1_70995',
 'G1IA6-19900-MONOMER': 'Lipst1_1_117584',
 'G1IA6-18843-MONOMER': 'Lipst1_1_2524',
 'G1IA6-23497-MONOMER': 'Lipst1_1_6623',
 'G1IA6-19901-MONOMER': 'Lipst1_1_117633',
 'G1IA6-22250-MONOMER': 'Lipst1_1_74019',
 'G1IA6-23498-MONOMER': 'Lipst1_1_75312',
 'G1IA6-19902-MONOMER': 'Lipst1_

In [20]:
# create a column to map to the lipomyces names (rather than the index).
df.org1=df.index

# replace the organism ID to match the dictionary annotations. 
temp = df.org1.to_list()
df.org1 = [x.replace('Lst|','') for x in temp]


# perform mapping. 
df.org1=df['org1'].map(Lst_to_norm_dict)

In [21]:
# rename columns to keep track of organisms. 
df=df.rename(columns=({'org1':'Lst','org2':'Rto','org3':'Sce','org4':'Yli'}))

In [22]:
# replace rest of organism IDs in the dataframe.

# rto.
temp = df.Rto.to_list()
df.Rto = [x.replace('Rto|','') if x is not np.nan else x for x in temp]

# sce.
temp = df.Sce.to_list()
df.Sce = [x.replace('Sce|','') if x is not np.nan else x for x in temp]

# yli.
temp = df.Yli.to_list()
df.Yli = [x.replace('Yli|','') if x is not np.nan else x for x in temp]

In [23]:
df.head()

Unnamed: 0,Lst,Rto,Sce,Yli,s2,s3,s4
Lst|G1IA6-18760-MONOMER,Lipst1_1_2416,11672,,G1FV3-29249-MONOMER,0.21,,0.3
Lst|G1IA6-24286-MONOMER,Lipst1_1_76503,12455,,,0.52,,
Lst|G1IA6-22710-MONOMER,Lipst1_1_170805,12880,,,0.668,,
Lst|G1IA6-16893-MONOMER,Lipst1_1_60832,11787,YOR360C,G1FV3-30746-MONOMER,0.588,0.409,0.661
Lst|G1IA6-18340-MONOMER,Lipst1_1_49857,12139,YLR075W,G1FV3-27651-MONOMER,1.297,1.273,1.148


In [24]:
df

Unnamed: 0,Lst,Rto,Sce,Yli,s2,s3,s4
Lst|G1IA6-18760-MONOMER,Lipst1_1_2416,11672,,G1FV3-29249-MONOMER,0.21,,0.3
Lst|G1IA6-24286-MONOMER,Lipst1_1_76503,12455,,,0.52,,
Lst|G1IA6-22710-MONOMER,Lipst1_1_170805,12880,,,0.668,,
Lst|G1IA6-16893-MONOMER,Lipst1_1_60832,11787,YOR360C,G1FV3-30746-MONOMER,0.588,0.409,0.661
Lst|G1IA6-18340-MONOMER,Lipst1_1_49857,12139,YLR075W,G1FV3-27651-MONOMER,1.297,1.273,1.148
...,...,...,...,...,...,...,...
Lst|G1IA6-20465-MONOMER,Lipst1_1_111862,15065,,G1FV3-24801-MONOMER,1.552,,1.371
Lst|G1IA6-17848-MONOMER,Lipst1_1_78803,,YGL166W,,,0.205,
Lst|G1IA6-19128-MONOMER,Lipst1_1_69885,,YER051W,G1FV3-25382-MONOMER,,0.797,0.983
Lst|G1IA6-21884-MONOMER,Lipst1_1_106496,,,G1FV3-30933-MONOMER,,,0.509


In [25]:
df.Lst.count()

4398

In [26]:
df.Rto.count()

3337

In [27]:
df.Sce.count()

3165

In [28]:
df.Yli.count()

3855

In [29]:
# change the Yli mappings.
# read in the protein mapper for Lipomyces. 
ylip_to_norm = pd.read_table('../blastp/yarrowia-protein-map.tab')
ylip_to_norm.head()
ylip_to_norm_dict = dict(zip(ylip_to_norm.CycId,ylip_to_norm.LocusTag))

In [30]:
# create a column to map to the lipolytica names (rather than the inplace).
temp_series = df.Yli


# perform mapping. 
df.Yli=df['Yli'].map(ylip_to_norm_dict)

In [31]:
df.Yli.count()

3853

In [205]:
# save file. 
df.to_csv('orthoMCL_orthologs_2023.csv')