In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import cluster
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [14]:
# Read the tab-separated BLAST hit table; column names are supplied here
# via `names=` rather than taken from the file.
df = pd.read_csv(
    "EF_vs_GenomesDB.blast",
    sep="\t",
    names=[
        'query',
        'subject',
        'pc_identity',
        'aln_length',
        'mismatches',
        'gaps_opened',
        'query_start',
        'query_end',
        'subject_start',
        'subject_end',
        'e_value',
        'bitscore',
    ],
    engine='python',
)


## Utilizamos la familia de enzimas (EF) como query sobre la base de datos de los genomas.

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   query          423 non-null    object 
 1   subject        423 non-null    object 
 2   pc_identity    423 non-null    float64
 3   aln_length     423 non-null    int64  
 4   mismatches     423 non-null    int64  
 5   gaps_opened    423 non-null    int64  
 6   query_start    423 non-null    int64  
 7   query_end      423 non-null    int64  
 8   subject_start  423 non-null    int64  
 9   subject_end    423 non-null    int64  
 10  e_value        423 non-null    float64
 11  bitscore       423 non-null    float64
dtypes: float64(3), int64(7), object(2)
memory usage: 39.8+ KB


In [16]:
df.head()

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore
0,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.146852.1090|6666666.146852|NC_00293...,79.623,530,108,0,1,530,1,530,0.0,847.0
1,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|749927.13.1727|749927.13|NC_014318.1|D3phos...,59.586,532,213,2,1,530,1,532,0.0,620.0
2,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|1156913.7.6305|1156913.7|NC_021252.1|D3phos...,59.203,527,214,1,1,526,1,527,0.0,613.0
3,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.104540.4265|6666666.104540|AOHO01|D...,59.203,527,214,1,1,526,1,527,0.0,612.0
4,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|446462.15.6083|446462.15|NC_021252.1|D3phos...,60.338,532,209,2,1,530,1,532,0.0,595.0


In [17]:
df.describe()

Unnamed: 0,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore
count,423.0,423.0,423.0,423.0,423.0,423.0,423.0,423.0,423.0,423.0
mean,39.974988,293.378251,156.380615,4.20331,51.527187,337.513002,53.867612,341.193853,5.751065e-11,205.352246
std,12.111629,97.748995,37.400729,2.656707,45.552621,69.829679,51.258211,75.681174,1.044102e-09,181.943323
min,24.409,92.0,40.0,0.0,1.0,279.0,1.0,260.0,0.0,57.0
25%,32.1285,238.0,135.5,2.0,7.0,304.5,10.0,300.0,1.745e-49,92.05
50%,35.196,265.0,154.0,4.0,49.0,313.0,48.0,314.0,2.56e-28,115.0
75%,41.506,324.5,176.5,6.0,74.0,325.0,75.5,346.0,2.5949999999999998e-20,174.5
max,81.6,532.0,280.0,13.0,195.0,530.0,302.0,662.0,2.14e-08,847.0


# Obtener las familias expandidas:

## Los genomas que aparecen en la columna "subject" son los que forman las familias expandidas. 

In [18]:
df.head()

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore
0,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.146852.1090|6666666.146852|NC_00293...,79.623,530,108,0,1,530,1,530,0.0,847.0
1,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|749927.13.1727|749927.13|NC_014318.1|D3phos...,59.586,532,213,2,1,530,1,532,0.0,620.0
2,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|1156913.7.6305|1156913.7|NC_021252.1|D3phos...,59.203,527,214,1,1,526,1,527,0.0,613.0
3,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|6666666.104540.4265|6666666.104540|AOHO01|D...,59.203,527,214,1,1,526,1,527,0.0,612.0
4,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,gi|446462.15.6083|446462.15|NC_021252.1|D3phos...,60.338,532,209,2,1,530,1,532,0.0,595.0


## La información descrita en un query es como sigue:
## ID | familia | función_enzima dentro de la familia | genoma

## La familia 1 tiene 4 enzimas numeradas del 1 al 4, mientras que la familia 2 consta de 3 enzimas numeradas del 1 al 3.

In [19]:
df["query"][0]

'3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_1|Cglu'

## La información descrita en un subject es:
## GenInfo (gi) | Num de identificación del genoma + num de identificación del gen | Num de identificación del genoma | reference sequence | Organismo 

In [20]:
df["subject"][0]

'gi|6666666.146852.1090|6666666.146852|NC_002935.2|D3phosphoglycerate_dehydrogenase_EC_1.1.1.95|CorynebacteriumdiphtheriaeNCTC13129s'

In [21]:
df["query"].value_counts()

3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_1|Cglu    93
3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_2|Mtub    93
3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_3|Mtub    93
3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_4|Scoe    93
3PGA_AMINOACIDS|2|Phosphoserine_aminotransferase_1|none    17
3PGA_AMINOACIDS|2|Phosphoserine_aminotransferase_2|none    17
3PGA_AMINOACIDS|2|Phosphoserine_aminotransferase_3|none    17
Name: query, dtype: int64

In [22]:
df["subject"].value_counts()

gi|6666666.146852.1090|6666666.146852|NC_002935.2|D3phosphoglycerate_dehydrogenase_EC_1.1.1.95|CorynebacteriumdiphtheriaeNCTC13129s                         4
gi|6666666.111158.5905|6666666.111158|NC_0131311|Vancomycin_resistance_protein_VanH__Dlactate_dehydrogenase_EC_1.1.1.28|CatenulisporaacidiphilaDSM44928s    4
gi|749927.13.2117|749927.13|NC_014318.1|D3phosphoglycerate_dehydrogenase_EC_1.1.1.95|AmycolatopsismediterraneiU32s                                          4
gi|6666666.112811.5447|6666666.112811|JOEF01|D3phosphoglycerate_dehydrogenase_EC_1.1.1.95|Allokutzneriaalbatas                                              4
gi|649831.6.2685|649831.6|NC_021191.1|D3phosphoglycerate_dehydrogenase_EC_1.1.1.95|ActinoplanesspN902109s                                                   4
                                                                                                                                                           ..
gi|6666666.104540.2193|6666666.104540|AOHO01|Phospho

# Por cada enzima en cada familia vamos a obtener su correspondiente familia expandida.
## Primero vamos a identificar cada enzima según su familia y su número dentro de esa familia.


In [23]:
df["query"][0]

'3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_1|Cglu'

In [27]:
def extract_fam_seed(data):
    """Tag every BLAST hit with the family and seed-enzyme number of its query.

    Each value in ``data["query"]`` is expected to be a pipe-separated string
    such as ``'3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogenase_1|Cglu'``:
    the second field is the family, and the text after the last underscore of
    the third field is the enzyme number inside that family.

    Parameters
    ----------
    data : pandas.DataFrame
        BLAST hit table with at least a ``query`` column. A new
        ``"(fam,seed)"`` column is added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        A new frame built from ``data`` with ``"(fam,seed)"`` placed right
        after ``query``, followed by the standard BLAST columns. Each
        ``"(fam,seed)"`` cell is a two-item list ``[family, enzyme_number]``
        of strings.

    Raises
    ------
    IndexError
        If a query string has fewer than three pipe-separated fields.
    """
    fam_seed_list = []
    for query_id in data["query"]:
        fields = query_id.split("|")
        family = fields[1]
        # The enzyme number is the suffix after the last "_" in the name.
        enzyme_number = fields[2].split("_")[-1]
        fam_seed_list.append([family, enzyme_number])

    # Kept as an in-place addition: the notebook calls this function without
    # using the return value and may rely on the column being present.
    data["(fam,seed)"] = fam_seed_list

    # Reindex the frame that was passed in, not the notebook-global `df`.
    return data.reindex(columns=['query', '(fam,seed)', 'subject',
           'pc_identity', 'aln_length', 'mismatches', 'gaps_opened',
           'query_start', 'query_end', 'subject_start', 'subject_end',
           'e_value', 'bitscore'])

extract_fam_seed(df)
        

Unnamed: 0,query,"(fam,seed)",subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore
0,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,"[1, 1]",gi|6666666.146852.1090|6666666.146852|NC_00293...,79.623,530,108,0,1,530,1,530,0.000000e+00,847.0
1,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,"[1, 1]",gi|749927.13.1727|749927.13|NC_014318.1|D3phos...,59.586,532,213,2,1,530,1,532,0.000000e+00,620.0
2,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,"[1, 1]",gi|1156913.7.6305|1156913.7|NC_021252.1|D3phos...,59.203,527,214,1,1,526,1,527,0.000000e+00,613.0
3,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,"[1, 1]",gi|6666666.104540.4265|6666666.104540|AOHO01|D...,59.203,527,214,1,1,526,1,527,0.000000e+00,612.0
4,3PGA_AMINOACIDS|1|Phosphoglycerate_dehydrogena...,"[1, 1]",gi|446462.15.6083|446462.15|NC_021252.1|D3phos...,60.338,532,209,2,1,530,1,532,0.000000e+00,595.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,"[2, 3]",gi|6666666.146852.774|6666666.146852|NC_002935...,61.326,362,139,1,1,362,11,371,7.620000e-167,471.0
419,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,"[2, 3]",gi|6666666.146872.438|6666666.146872|NC_017803...,60.055,363,145,0,1,363,12,374,1.280000e-166,471.0
420,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,"[2, 3]",gi|6666666.111572.2478|6666666.111572|NC_00948...,64.641,362,126,2,1,362,302,661,1.790000e-158,460.0
421,3PGA_AMINOACIDS|2|Phosphoserine_aminotransfera...,"[2, 3]",gi|367928.21.407|367928.21|NC_008618.1|Phospho...,57.297,370,151,2,1,363,11,380,8.970000e-152,433.0


In [None]:
# Show the dtype of every column; a DataFrame has no `type` attribute.
df.dtypes