In [1]:
import pandas as pd

## Preparing the dataframes

### 1. Homology consensus

In [2]:
# Load the homology_consensus dataframe

hc_df = pd.read_csv("homology_consensus.mjson", 
                                 names = ["Code", "Homology", "Regions", "Content_Fraction", "Content_Count"],
                                 on_bad_lines = 'skip')
hc_df.head()

Unnamed: 0,Code,Homology,Regions,Content_Fraction,Content_Count
0,"{""acc"": ""A0A009HC73""","""homology-conformational_diversity-merge"": {""...",308]],"""content_fraction"": 1.0","""content_count"": 308}}"
1,"{""acc"": ""A0A009I561""","""homology-conformational_diversity-merge"": {""...",308]],"""content_fraction"": 1.0","""content_count"": 308}}"
2,"{""acc"": ""A0A009IUT7""","""homology-conformational_diversity-merge"": {""...",308]],"""content_fraction"": 1.0","""content_count"": 308}}"
3,"{""acc"": ""A0A009KQW8""","""homology-conformational_diversity-merge"": {""...",304]],"""content_fraction"": 0.987","""content_count"": 304}}"
4,"{""acc"": ""A0A009MPB7""","""homology-conformational_diversity-merge"": {""...",308]],"""content_fraction"": 1.0","""content_count"": 308}}"


In [3]:
# Combine two accidentally splitted columns into one

hc_df['Regions'] = hc_df['Homology'] + hc_df['Regions']
hc_df.drop(columns = ['Homology', 'Content_Fraction', 'Content_Count'], inplace = True)
hc_df.head()

Unnamed: 0,Code,Regions
0,"{""acc"": ""A0A009HC73""","""homology-conformational_diversity-merge"": {""..."
1,"{""acc"": ""A0A009I561""","""homology-conformational_diversity-merge"": {""..."
2,"{""acc"": ""A0A009IUT7""","""homology-conformational_diversity-merge"": {""..."
3,"{""acc"": ""A0A009KQW8""","""homology-conformational_diversity-merge"": {""..."
4,"{""acc"": ""A0A009MPB7""","""homology-conformational_diversity-merge"": {""..."


In [4]:
# Remove the unnecessary information step by step
# Extract the access codes

hc_df[['_', 'acc']] = hc_df.Code.str.split(":", expand = True)
hc_df.drop(columns = ['_', 'Code'], inplace = True)
hc_df.head()

Unnamed: 0,Regions,acc
0,"""homology-conformational_diversity-merge"": {""...","""A0A009HC73"""
1,"""homology-conformational_diversity-merge"": {""...","""A0A009I561"""
2,"""homology-conformational_diversity-merge"": {""...","""A0A009IUT7"""
3,"""homology-conformational_diversity-merge"": {""...","""A0A009KQW8"""
4,"""homology-conformational_diversity-merge"": {""...","""A0A009MPB7"""


In [5]:
# Extract the regions from the column

hc_df[['Homology', 'Regions']] = hc_df.Regions.str.split(': {"regions":', expand = True)
hc_df.head()

Unnamed: 0,Regions,acc,Homology
0,[[1 308]],"""A0A009HC73""","""homology-conformational_diversity-merge"""
1,[[1 308]],"""A0A009I561""","""homology-conformational_diversity-merge"""
2,[[1 308]],"""A0A009IUT7""","""homology-conformational_diversity-merge"""
3,[[1 304]],"""A0A009KQW8""","""homology-conformational_diversity-merge"""
4,[[1 308]],"""A0A009MPB7""","""homology-conformational_diversity-merge"""


In [6]:
# Split Regions to start and end positions

hc_df[['start', 'end']] = hc_df.Regions.str.split(expand = True)
hc_df.drop(columns = ['Regions'], inplace = True)
hc_df.head()

Unnamed: 0,acc,Homology,start,end
0,"""A0A009HC73""","""homology-conformational_diversity-merge""",[[1,308]]
1,"""A0A009I561""","""homology-conformational_diversity-merge""",[[1,308]]
2,"""A0A009IUT7""","""homology-conformational_diversity-merge""",[[1,308]]
3,"""A0A009KQW8""","""homology-conformational_diversity-merge""",[[1,304]]
4,"""A0A009MPB7""","""homology-conformational_diversity-merge""",[[1,308]]


In [7]:
# Split Homology to Evidence, Feature and Source columns

hc_df[['evidence', 'feature', 'source']] = hc_df.Homology.str.split('-', expand = True)
hc_df.drop(columns = ['Homology'], inplace = True)
hc_df.head()

Unnamed: 0,acc,start,end,evidence,feature,source
0,"""A0A009HC73""",[[1,308]],"""homology",conformational_diversity,"merge"""
1,"""A0A009I561""",[[1,308]],"""homology",conformational_diversity,"merge"""
2,"""A0A009IUT7""",[[1,308]],"""homology",conformational_diversity,"merge"""
3,"""A0A009KQW8""",[[1,304]],"""homology",conformational_diversity,"merge"""
4,"""A0A009MPB7""",[[1,308]],"""homology",conformational_diversity,"merge"""


In [8]:
# Dataframe preprocessing

hc_df['acc'] = hc_df['acc'].apply(lambda x: x.replace('"','').replace('',''))
hc_df['start'] = hc_df['start'].apply(lambda x: x.replace('[[','').replace('',''))
hc_df['end'] = hc_df['end'].apply(lambda x: x.replace(']]','').replace('',''))
hc_df['evidence']  = hc_df['evidence'].apply(lambda x: x.replace('"','').replace('',''))
hc_df['source'] = hc_df['source'].apply(lambda x: x.replace('"','').replace('',''))
hc_df.head()

Unnamed: 0,acc,start,end,evidence,feature,source
0,A0A009HC73,1,308,homology,conformational_diversity,merge
1,A0A009I561,1,308,homology,conformational_diversity,merge
2,A0A009IUT7,1,308,homology,conformational_diversity,merge
3,A0A009KQW8,1,304,homology,conformational_diversity,merge
4,A0A009MPB7,1,308,homology,conformational_diversity,merge


In [9]:
# Choose only disordered regions

hc_dis = hc_df[hc_df['feature'] == 'disorder']
hc_dis

Unnamed: 0,acc,start,end,evidence,feature,source
121169,A0A010QT17,117,130,homology,disorder,merge
121170,A0A010R0B5,5,701,homology,disorder,merge
121171,A0A010RTY3,225,235,homology,disorder,merge
121172,A0A015L8C2,320,333,homology,disorder,merge
121173,A0A015NFB9,153,170,homology,disorder,merge
...,...,...,...,...,...,...
325800,X8JV07,17,38,homology,disorder,merge
325801,Z4YJH1,75,163,homology,disorder,merge
325802,Z4YJJ5,840,856,homology,disorder,merge
325803,Z4YKX0,359,373,homology,disorder,merge


### 2. Homology

In [10]:
# Homology dataframe

hom = pd.read_csv("homology.mjson", 
                  names = ["Code", "Homology", "Regions", "Content_Fraction", "Content_Count", "Regions_IDs"], 
                  on_bad_lines = 'skip')
hom.head()

Unnamed: 0,Code,Homology,Regions,Content_Fraction,Content_Count,Regions_IDs
0,"{""acc"": ""A0A009HC73""","""homology-conformational_diversity-codnas"": {...",308]],"""content_fraction"": 1.0","""content_count"": 308","""regions_ids"": [""B2I1J3(5D8D_A)""]}}"
1,"{""acc"": ""A0A009I561""","""homology-conformational_diversity-codnas"": {...",308]],"""content_fraction"": 1.0","""content_count"": 308","""regions_ids"": [""B2I1J3(5D8D_A)""]}}"
2,"{""acc"": ""A0A009IUT7""","""homology-conformational_diversity-codnas"": {...",308]],"""content_fraction"": 1.0","""content_count"": 308","""regions_ids"": [""B2I1J3(5D8D_A)""]}}"
3,"{""acc"": ""A0A009KQW8""","""homology-conformational_diversity-codnas"": {...",304]],"""content_fraction"": 0.987","""content_count"": 304","""regions_ids"": [""B2I1J3(5D8D_A)""]}}"
4,"{""acc"": ""A0A009MPB7""","""homology-conformational_diversity-codnas"": {...",308]],"""content_fraction"": 1.0","""content_count"": 308","""regions_ids"": [""B2I1J3(5D8D_A)""]}}"


In [11]:
hom['Regions'] = hom['Homology'] + hom['Regions']
hom.drop(columns = ['Homology', 'Content_Fraction', 'Content_Count', 'Regions_IDs'], inplace = True)
hom.head()

Unnamed: 0,Code,Regions
0,"{""acc"": ""A0A009HC73""","""homology-conformational_diversity-codnas"": {..."
1,"{""acc"": ""A0A009I561""","""homology-conformational_diversity-codnas"": {..."
2,"{""acc"": ""A0A009IUT7""","""homology-conformational_diversity-codnas"": {..."
3,"{""acc"": ""A0A009KQW8""","""homology-conformational_diversity-codnas"": {..."
4,"{""acc"": ""A0A009MPB7""","""homology-conformational_diversity-codnas"": {..."


In [12]:
hom[['_', 'acc']] = hom.Code.str.split(":", expand = True)
hom.drop(columns = ['_', 'Code'], inplace = True)
hom.head()

Unnamed: 0,Regions,acc
0,"""homology-conformational_diversity-codnas"": {...","""A0A009HC73"""
1,"""homology-conformational_diversity-codnas"": {...","""A0A009I561"""
2,"""homology-conformational_diversity-codnas"": {...","""A0A009IUT7"""
3,"""homology-conformational_diversity-codnas"": {...","""A0A009KQW8"""
4,"""homology-conformational_diversity-codnas"": {...","""A0A009MPB7"""


In [13]:
hom[['Homology', 'Regions']] = hom.Regions.str.split(': {"regions":', expand = True)
hom.head()

Unnamed: 0,Regions,acc,Homology
0,[[1 308]],"""A0A009HC73""","""homology-conformational_diversity-codnas"""
1,[[1 308]],"""A0A009I561""","""homology-conformational_diversity-codnas"""
2,[[1 308]],"""A0A009IUT7""","""homology-conformational_diversity-codnas"""
3,[[1 304]],"""A0A009KQW8""","""homology-conformational_diversity-codnas"""
4,[[1 308]],"""A0A009MPB7""","""homology-conformational_diversity-codnas"""


In [14]:
hom[['start', 'end']] = hom.Regions.str.split(expand = True)
hom.drop(columns = ['Regions'], inplace = True)
hom.head()

Unnamed: 0,acc,Homology,start,end
0,"""A0A009HC73""","""homology-conformational_diversity-codnas""",[[1,308]]
1,"""A0A009I561""","""homology-conformational_diversity-codnas""",[[1,308]]
2,"""A0A009IUT7""","""homology-conformational_diversity-codnas""",[[1,308]]
3,"""A0A009KQW8""","""homology-conformational_diversity-codnas""",[[1,304]]
4,"""A0A009MPB7""","""homology-conformational_diversity-codnas""",[[1,308]]


In [15]:
hom[['evidence', 'feature', 'source']] = hom.Homology.str.split('-', expand = True)
hom.drop(columns = ['Homology'], inplace = True)
hom.head()

Unnamed: 0,acc,start,end,evidence,feature,source
0,"""A0A009HC73""",[[1,308]],"""homology",conformational_diversity,"codnas"""
1,"""A0A009I561""",[[1,308]],"""homology",conformational_diversity,"codnas"""
2,"""A0A009IUT7""",[[1,308]],"""homology",conformational_diversity,"codnas"""
3,"""A0A009KQW8""",[[1,304]],"""homology",conformational_diversity,"codnas"""
4,"""A0A009MPB7""",[[1,308]],"""homology",conformational_diversity,"codnas"""


In [16]:
hom['acc'] = hom['acc'].apply(lambda x: x.replace('"','').replace('',''))
hom['start'] = hom['start'].apply(lambda x: x.replace('[[','').replace('',''))
hom['end'] = hom['end'].apply(lambda x: x.replace(']]','').replace('',''))
hom['evidence']  = hom['evidence'].apply(lambda x: x.replace('"','').replace('',''))
hom['source'] = hom['source'].apply(lambda x: x.replace('"','').replace('',''))
hom.head()

Unnamed: 0,acc,start,end,evidence,feature,source
0,A0A009HC73,1,308,homology,conformational_diversity,codnas
1,A0A009I561,1,308,homology,conformational_diversity,codnas
2,A0A009IUT7,1,308,homology,conformational_diversity,codnas
3,A0A009KQW8,1,304,homology,conformational_diversity,codnas
4,A0A009MPB7,1,308,homology,conformational_diversity,codnas


In [17]:
hom_dis = hom[hom['feature'] == 'disorder']
hom_dis

Unnamed: 0,acc,start,end,evidence,feature,source
10,A0A010QT17,117,130,homology,disorder,disprot
12,A0A010R0B5,5,701,homology,disorder,disprot
15,A0A010RTY3,225,235,homology,disorder,disprot
31,A0A015L8C2,320,333,homology,disorder,disprot
33,A0A015NFB9,153,170,homology,disorder,disprot
...,...,...,...,...,...,...
623238,Z4YJH1,75,163,homology,disorder,ideal
623240,Z4YJJ5,840,856,homology,disorder,disprot
623243,Z4YKX0,359,373,homology,disorder,disprot
623245,Z4YNI2,366,388,homology,disorder,disprot


### 3. Homology conservation

In [68]:
hcons = pd.read_csv("homology_conservation.mjson", 
                                 #names = ["Code", "Homology", "Regions", "Content_Fraction", "Content_Count"],
                                 on_bad_lines = 'skip')
hcons.head()

  hcons = pd.read_csv("homology_conservation.mjson",


Unnamed: 0,"{""acc"": ""Q9Y6K1""","""homology-msa_information_content-psiblast"": {""scores"": [4.322663625340791",4.27294936775308,4.227139107017728,4.161343719214743,4.171520449077236,4.117666639132671,4.087438739916795,4.052685731012853,4.052122081893202,...,0.608.1,0.601,0.521.6,0.492.1,0.408.1,0.327,0.303,0.175,0.07]},"""msa_consensus"": ""MPAMPSSGPVDTSSSSPKREDDRKEGEESEEPLNKEEVREPSTPARKVGRPGKKRKSPVVKSYSTPQGKGLVVKIPSRVHGSEPSQALPNGDLESSSGNSSEEGSEESVLKNGAPSDSESSSPPLPEASRSVENGSSTPEEGLGKPSEKSEESSDSSEDSTKSEGSEGRLRGGSGWESSLRQRPSPRSTFQAGDPYEVSKRKREEDLASWKREAEKKAKSSSSSNEVEESSSSGSSQKSEEASDPSVSQPTDPASPTVATTPEPVGSESLDKNSPKKAVDECEYEDGRGFGVGDLVWGKIKGFPWWPGRIVSWSDASRRRAAEGTRWVAWFGDGKFSWVDPSKLKPFSEFFKAFSKQT-KKSGSYRKAIYEALEEASRRAGKGFPC-C----SDPGKSLEYQNKDMLDWALGGFQPSGPKGLKPPEEEPNPYKEVVTESAVEPEAADYLPPPPAKKKRKNTAKKPKVKEIIDERTRERLVYEVRQKKRNIEDICLSCGSQNVVLEHPLFEGGLCKKCKDFYLEGAYSYDDDGYQSYCTWCCEGGELLCCDN-NCCRCFCKKCILVLVGRGAAAPIEEEEPWLCYMCNPKRL-GLLRRRDDWKSRLQSFFVNNVAQEFSPPKVYPP-PAE-RRPIRVLSLFDGIGGGRLALERLGI-VECVFASEIDEDAIRVYRANHGGNIIYVGDIRKITAKDIPEWGPFDLLIGGSPCQDLSIAGPARKGLEDGRGRLFFEFYRILKELRPKE----P-FFLFENVAGMPSHDKGDISRFLEVEPVVLDAKDFGPQHRARLFWGNLPGMNRPLPAPLPDKLELQDCLETGRTAKVEKVRTITTRSNSIPQGKDQLFPVDMNGKPRLLWPRECERLQGFPDHYTDVGNVSRTQRYKLLGNSWSVPVIRHLLSPLKDYFACV""}"
0,"{""acc"": ""S7W634""","""homology-msa_information_content-psiblast"": ...",4.307196,4.290296,4.291665,4.284301,4.281782,4.264646,4.265654,4.25374,...,,,,,,,,,,
1,"{""acc"": ""S6B291""","""homology-msa_information_content-psiblast"": ...",3.193872,3.133234,3.074398,3.142933,2.965978,3.162166,3.176524,3.274977,...,,,,,,,,,,
2,"{""acc"": ""Q9Z2F5""","""homology-msa_information_content-psiblast"": ...",3.800589,3.76465,3.711119,3.88061,3.667926,3.645686,3.623107,3.415641,...,,,,,,,,,,
3,"{""acc"": ""A0A024RAV5""","""homology-msa_information_content-psiblast"": ...",3.031827,3.462032,3.708812,4.020194,3.372382,3.652497,3.550122,3.490848,...,,,,,,,,,,
4,"{""acc"": ""A1L1Q4""","""homology-msa_information_content-psiblast"": ...",4.172194,4.080723,4.025939,3.971284,3.922919,3.873147,3.846846,3.844121,...,,,,,,,,,,


In [69]:
hcons = hcons.iloc[:, :2]
hcons.columns = ['Code', 'Regions']
hcons.head()

Unnamed: 0,Code,Regions
0,"{""acc"": ""S7W634""","""homology-msa_information_content-psiblast"": ..."
1,"{""acc"": ""S6B291""","""homology-msa_information_content-psiblast"": ..."
2,"{""acc"": ""Q9Z2F5""","""homology-msa_information_content-psiblast"": ..."
3,"{""acc"": ""A0A024RAV5""","""homology-msa_information_content-psiblast"": ..."
4,"{""acc"": ""A1L1Q4""","""homology-msa_information_content-psiblast"": ..."


In [70]:
hcons['Regions'].unique()

array([' "homology-msa_information_content-psiblast": {"scores": [4.309477168321852',
       ' "homology-msa_information_content-psiblast": {"scores": [3.9663497468939157',
       ' "homology-msa_information_content-psiblast": {"scores": [4.050572662407841',
       ...,
       ' "homology-msa_information_content-psiblast": {"scores": [4.216868789169694',
       ' "homology-msa_information_content-psiblast": {"scores": [4.211578075823008',
       ' "homology-msa_information_content-psiblast": {"scores": [3.695790536135261'],
      dtype=object)

In [60]:
hcons[['_', 'acc']] = hcons.Code.str.split(":", expand = True)
hcons.drop(columns = ['_', 'Code'], inplace = True)
hcons.head()

Unnamed: 0,Regions,acc
0,"""homology-msa_information_content-psiblast"": ...","""S7W634"""
1,"""homology-msa_information_content-psiblast"": ...","""S6B291"""
2,"""homology-msa_information_content-psiblast"": ...","""Q9Z2F5"""
3,"""homology-msa_information_content-psiblast"": ...","""A0A024RAV5"""
4,"""homology-msa_information_content-psiblast"": ...","""A1L1Q4"""


In [61]:
hcons[['Homology', 'Regions']] = hcons.Regions.str.split(': {"scores":', expand = True)
hcons.drop(columns = ['Regions'], inplace = True)
hcons.head()

Unnamed: 0,acc,Homology
0,"""S7W634""","""homology-msa_information_content-psiblast"""
1,"""S6B291""","""homology-msa_information_content-psiblast"""
2,"""Q9Z2F5""","""homology-msa_information_content-psiblast"""
3,"""A0A024RAV5""","""homology-msa_information_content-psiblast"""
4,"""A1L1Q4""","""homology-msa_information_content-psiblast"""


In [62]:
hcons[['evidence', 'feature', 'source']] = hcons.Homology.str.split('-', expand = True)
hcons.drop(columns = ['Homology'], inplace = True)
hcons.head()

Unnamed: 0,acc,evidence,feature,source
0,"""S7W634""","""homology",msa_information_content,"psiblast"""
1,"""S6B291""","""homology",msa_information_content,"psiblast"""
2,"""Q9Z2F5""","""homology",msa_information_content,"psiblast"""
3,"""A0A024RAV5""","""homology",msa_information_content,"psiblast"""
4,"""A1L1Q4""","""homology",msa_information_content,"psiblast"""


In [63]:
hcons['acc'] = hcons['acc'].apply(lambda x: x.replace('"','').replace('',''))
hcons['evidence']  = hcons['evidence'].apply(lambda x: x.replace('"','').replace('',''))
hcons['source'] = hcons['source'].apply(lambda x: x.replace('"','').replace('',''))
hcons.head()

Unnamed: 0,acc,evidence,feature,source
0,S7W634,homology,msa_information_content,psiblast
1,S6B291,homology,msa_information_content,psiblast
2,Q9Z2F5,homology,msa_information_content,psiblast
3,A0A024RAV5,homology,msa_information_content,psiblast
4,A1L1Q4,homology,msa_information_content,psiblast


In [64]:
hcons_dis = hcons[hcons['feature'] == 'disorder']
hcons_dis

Unnamed: 0,acc,evidence,feature,source
