### Data

In [1]:
!pip install biopython polars



In [2]:
!mkdir -p /home/onyxia/work/KEGG_Pipeline/data
!wget -nc https://minio.lab.sspcloud.fr/gamer35/KEGG_db/Prok_proteins.parquet -P /home/onyxia/work/KEGG_Pipeline/data

--2024-02-14 02:04:53--  https://minio.lab.sspcloud.fr/gamer35/KEGG_db/Prok_proteins.parquet
Resolving minio.lab.sspcloud.fr (minio.lab.sspcloud.fr)... 192.168.253.152, 192.168.253.151
Connecting to minio.lab.sspcloud.fr (minio.lab.sspcloud.fr)|192.168.253.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3111842428 (2.9G) [binary/octet-stream]
Saving to: ‘/home/onyxia/work/KEGG_Pipeline/data/Prok_proteins.parquet’


2024-02-14 02:05:19 (116 MB/s) - ‘/home/onyxia/work/KEGG_Pipeline/data/Prok_proteins.parquet’ saved [3111842428/3111842428]



In [1]:
import pandas as pd
import polars as pl
from Bio import SeqIO
import polars.selectors as cs

data_path= "/home/onyxia/work/KEGG_Pipeline/data/"

In [9]:
df = pd.read_parquet(data_path + "Prok_proteins.parquet")

df_polars = pl.read_parquet(data_path + "Prok_proteins.parquet")
df_polars = df_polars.unique()

In [11]:
df_testin = df_polars.with_columns(pl.when(cs.string().str.lengths() >= 700)
                  .then(cs.string().str.slice(0, 700))
                  .otherwise(cs.string())
                  .keep_name()
                )
df_testin = df_testin.to_pandas()

  df_testin = df_polars.with_columns(pl.when(cs.string().str.lengths() >= 700)
  .keep_name()


In [12]:
df

Unnamed: 0,gene_id,K0,AA_seq
0,pfi:PFC_00005,K02035,MRKKLVGILTILVALGMLVSPLLKPVAAEDQKVLKIAMYSATGSLF...
1,pfi:PFC_00010,K02033,MGLGKYLLIRAINALIVLSIVVLVVSALFVKVAEKELESRIQEIVN...
2,pfi:PFC_00015,K02034,MRWVDLKESLSDFWFEFRREKTGIAGLALLIFWIVVAVSAPYVTEP...
3,pfi:PFC_00020,K02031,MAKKILEVKNLKMYYFTSRGPVKAVDDVTFDLEKGEVLGLAGESGC...
4,pfi:PFC_00025,K02032,MAEPILKVENLKKYFPVRRGLLGALRGEPQRFVRAVDGVSFEVYKQ...
...,...,...,...
15550880,vta:P0076,K19166,MHVISKRPFNDAKKNFPTCAEALDAAYKVLRISRAKTPEELKALFP...
15550881,vta:P0079,K18320,MTNPAFKWKHFAPEIILWCLRWYGSTPMSYANLSDMLAERGVSVNR...
15550882,vta:P0083,K18320,MTNPAFKWKHFAPEIILWCLRWYGSTPMSYGNLSDMLAERGVSVNR...
15550883,vta:P0094,K03497,MSNKKDFKALMKNKSKSLLDEEINEVKPGESVVLIPKEEIYSSEQV...


Here I should first cut my proteins in 500 or 600 in order to cluster them.

In [13]:

# Function to write a FASTA file from DataFrame
def write_fasta_from_dataframe(dataframe, output_file):
    with open(output_file, 'w') as fasta_file:
        for index, row in dataframe.iterrows():
            fasta_file.write(f'>{row["gene_id"]}\n{row["AA_seq"]}\n')

# Specify the output file name
output_filename = data_path + 'truncated_proteins.fasta'

# Call the function to write the FASTA file
write_fasta_from_dataframe(df_testin, output_filename)

print(f'FASTA file "{output_filename}" has been created.')
#writes a 5go fasta file in 11 minutes

FASTA file "/home/onyxia/work/KEGG_Pipeline/data/truncated_proteins.fasta" has been created.


Now the path to follow
- `conda install -c conda-forge -c bioconda mmseqs2`to install the software
- `mmseqs createdb prok_genes.fasta DB` to create database usable by MMSeq2
- `mmseqs easy-cluster DB DB_clu tmp --min-seq-id 0.4` to cluster the DB into DB_clu, using tmp as temporary folder with 0.4 sequence identity

## Study of the clusters obtained by MMseqs2

The .tsv file given by clustering with MMseqs2 is composed of two columns. The first contains the representative sequence of each cluster (that will thus be repeated across rows of sequences belonging to the same cluster) and the second contains all the sequences of the data set. To calculate the unmber of clusterss made by MMseqs2, you thus have to compute the number of unique values present in the first column. If two sequences are on the same row, they belong to the same cluster.  

In [50]:
clusters = pl.read_csv(data_path + 'clusterRes_cluster.tsv', separator='\t')\
             .with_columns(pl.col('xbc_ELE36_01515').str.replace('_',':'))\
             .with_columns(pl.col('xbc_ELE36_01515_duplicated_0').str.replace('_',':'))


In [51]:
clusters
repres_seqs = clusters.select('xbc_ELE36_01515').unique().get_column('xbc_ELE36_01515').to_list()
print(len(repres_seqs))

583766


In [52]:
clusters

xbc_ELE36_01515,xbc_ELE36_01515_duplicated_0
str,str
"""xbc:ELE36_0151…","""xeu:XSP_003642…"
"""xbc:ELE36_0151…","""xoz:BE73_21430…"
"""xbc:ELE36_0151…","""slm:BIZ42_0786…"
"""xbc:ELE36_0151…","""thes:FHQ07_079…"
"""xbc:ELE36_0151…","""dcs:ISN74_1750…"
"""xbc:ELE36_0151…","""xci:XCAW_04591…"
"""xbc:ELE36_0151…","""srh:BAY15_3414…"
"""xbc:ELE36_0151…","""lus:E5843_1369…"
"""xbc:ELE36_0151…","""lem:LEN_4555"""
"""xbc:ELE36_0151…","""xfs:D934_00475…"


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import json
import numpy as np

df_polars = pl.read_parquet(data_path + "Prok_proteins.parquet")
df_full_info = df_polars.unique()

repres_seqs = clusters.select('xbc_ELE36_01515').unique().get_column('xbc_ELE36_01515').to_list()
print(len(repres_seqs)) #the number of representatitve sequences is the number of clustersdf_full_info = pl.read_parquet(data_path + 'Prok_proteins.parquet')

print("Reading the cluster .tsv file and creating train, validation and test from it")

repres_seqs = clusters.select('xbc_ELE36_01515').unique().get_column('xbc_ELE36_01515').to_list() #careful, column names depends on the clusters found by MMseqs2 

print(f"Number of clusters found by MMseqs2 : {len(repres_seqs)}")

train_rep_seqs, valid_rep_seqs = train_test_split(repres_seqs, test_size=0.1, train_size=0.9, random_state=1) #here we do 80% - 10% -10% because we have less data


583766
Reading the cluster .tsv file and creating train, validation and test from it
Number of clusters found by MMseqs2 : 583766


In [54]:
train_rep_seqs, test_rep_seqs = train_test_split(train_rep_seqs, test_size=0.1111, train_size=0.8889, random_state=1) # 0.11 x 0.89 = 0.10
training_sequences_cluster = clusters.filter(pl.col('xbc_ELE36_01515').is_in(train_rep_seqs))


In [55]:
train_rep_seqs_df = df_full_info.filter(pl.col('gene_id').is_in(train_rep_seqs)) 


In [56]:
train_rep_seqs_df

gene_id,K0,AA_seq
str,str,str
"""pfi:PFC_01010""","""K07061""","""MYEAIFDLEAITPL…"
"""pfi:PFC_02110""","""K07577""","""MKFDVVGFDWKGKV…"
"""pfi:PFC_08200""","""K09136""","""MRINCHVSNIEVFE…"
"""apg:APA12_1425…","""K01154""","""MEQLLTGKKRLPGF…"
"""yan:AYJ57_1172…","""K14060""","""MTPTTKTVLYVRVS…"
"""yan:AYJ57_2137…","""K19117""","""MTMLRALNDLYGRL…"
"""hyn:F9K07_0516…","""K19338""","""MGRPKATSKAEGGG…"
"""hyn:F9K07_1895…","""K02674""","""MHTPFQRRFATGLL…"
"""hyn:F9K07_2442…","""K01607""","""MTHPSKDISDMTGA…"
"""minf:MESINF_07…","""K02034""","""MRWTDIKDSIRDFW…"


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import json
import numpy as np

df_polars = pl.read_parquet(data_path + "Prok_proteins.parquet")
df_full_info = df_polars.unique()

repres_seqs = clusters.select('xbc_ELE36_01515').unique().get_column('xbc_ELE36_01515').to_list()
print(len(repres_seqs)) #the number of representatitve sequences is the number of clustersdf_full_info = pl.read_parquet(data_path + 'Prok_proteins.parquet')

print("Reading the cluster .tsv file and creating train, validation and test from it")

repres_seqs = clusters.select('xbc_ELE36_01515').unique().get_column('xbc_ELE36_01515').to_list() #careful, column names depends on the clusters found by MMseqs2 

print(f"Number of clusters found by MMseqs2 : {len(repres_seqs)}")

train_rep_seqs, valid_rep_seqs = train_test_split(repres_seqs, test_size=0.1, train_size=0.9, random_state=1) #here we do 80% - 10% -10% because we have less data
train_rep_seqs, test_rep_seqs = train_test_split(train_rep_seqs, test_size=0.1111, train_size=0.8889, random_state=1) # 0.11 x 0.89 = 0.10
training_sequences_cluster = clusters.filter(pl.col('xbc_ELE36_01515').is_in(train_rep_seqs))
valid_sequences_cluster = clusters.filter(pl.col('xbc_ELE36_01515').is_in(valid_rep_seqs))
test_sequences_cluster = clusters.filter(pl.col('xbc_ELE36_01515').is_in(test_rep_seqs))
#Now we want to keep only KOs that are in the training set
train_rep_seqs_df = df_full_info.filter(pl.col('gene_id').is_in(train_rep_seqs)) 
raw_valid_rep_seqs_df = df_full_info.filter(pl.col('gene_id').is_in(valid_rep_seqs)) 
raw_test_rep_seqs_df = df_full_info.filter(pl.col('gene_id').is_in(test_rep_seqs)) 
train_kos = list(set(train_rep_seqs_df.select('K0').to_series().to_list()))
valid_kos = set(raw_valid_rep_seqs_df.select('K0').to_series().to_list())
test_kos = set(raw_test_rep_seqs_df.select('K0').to_series().to_list())
num_labels = len(train_kos)

valid_rep_seqs_df = raw_valid_rep_seqs_df.filter(pl.col('K0').is_in(train_kos)) #final valid set
test_rep_seqs_df = raw_test_rep_seqs_df.filter(pl.col('K0').is_in(train_kos)) #final test set
lost_kos_valid = len(raw_valid_rep_seqs_df) - len(valid_rep_seqs_df)
lost_kos_test = len(raw_test_rep_seqs_df) - len(test_rep_seqs_df)
# Creating the labels from the original KO string
train_kos.sort()
# Converting the labels from strings to vector indices (as proper inputs for pytorch loss functions)
targets = {k: v for v, k in enumerate(train_kos)}
train_label_col = train_rep_seqs_df.select('K0').to_series().map_elements(lambda x: targets.get(x))
valid_label_col = valid_rep_seqs_df.select('K0').to_series().map_elements(lambda x: targets.get(x))
test_label_col = test_rep_seqs_df.select('K0').to_series().map_elements(lambda x: targets.get(x))
train_rep_seqs_df = train_rep_seqs_df.with_columns(pl.Series(name="label", values=train_label_col))
valid_rep_seqs_df = valid_rep_seqs_df.with_columns(pl.Series(name="label", values=valid_label_col))
test_rep_seqs_df = test_rep_seqs_df.with_columns(pl.Series(name="label", values=test_label_col))

#make sure we have no duplicated rows
train_rep_seqs_df, valid_rep_seqs_df, test_rep_seqs_df = train_rep_seqs_df.unique(subset=["gene_id"]), valid_rep_seqs_df.unique(subset=["gene_id"]), test_rep_seqs_df.unique(subset=["gene_id"])

    #write the number of labels and the dictionary in a .txt file
with open('variables_to_pass.txt', 'w') as convert_file: 
    convert_file.write(str(num_labels) + '\n')
    convert_file.write(json.dumps(targets))
print("Labels created, converting to pandas")

583766
Reading the cluster .tsv file and creating train, validation and test from it
Number of clusters found by MMseqs2 : 583766
Labels created, converting to pandas


In [47]:
train_rep_seqs_df

gene_id,K0,AA_seq,label
str,str,str,i64
"""mamo:A6B35_272…","""K00305""","""MDWLGEAGLSAPVA…",221
"""pko:PKOR_12025…","""K14228""","""MLIRIVRMTFKPEK…",6398
"""thig:FE785_006…","""K08982""","""MNWNYWNMHGYDGM…",4540
"""caer:CSV91_066…","""K00817""","""MALARPAGRPNMAE…",547
"""ladl:NCTC12735…","""K07121""","""MQLKSLISQLSILV…",3769
"""plei:Q9312_118…","""K03088""","""MKTANINHSNMDQA…",2086
"""tat:KUM_1381""","""K02806""","""MNQISANLSQDNIL…",1916
"""rho:RHOM_13975…","""K03726""","""MEYIKSIINSIEKV…",2550
"""bmur:ABE28_002…","""K07213""","""MRNENDVNKVLHAL…",3833
"""ater:MW290_249…","""K01953""","""VCGLAGVIADRTLT…",1318


In [49]:
train_rep_seqs_df, valid_rep_seqs_df, test_rep_seqs_df = train_rep_seqs_df.to_pandas(), valid_rep_seqs_df.to_pandas(), test_rep_seqs_df.to_pandas() 
#computing the class weights for the losses
print('Computing class weights')
weights = compute_class_weight(class_weight='balanced', classes=np.sort(train_rep_seqs_df['label'].unique()) , y=train_rep_seqs_df['label'])

print(weights, len(weights))
# Write the data to text 
np.savetxt('weights.txt', weights)
print("Weights saved")
print("Writing train and test")
#writing train and test
train_rep_seqs_df.to_parquet(data_path + 'train_dataset.parquet', index=False)
valid_rep_seqs_df.to_parquet(data_path + 'validation_dataset.parquet', index=False)
test_rep_seqs_df.to_parquet(data_path + 'test_dataset.parquet', index=False)
print("Train, validation and test written")

Computing class weights
[ 0.7225835  11.38069013  0.47918695 ...  0.52933442 15.1742535
 45.5227605 ] 10259
Weights saved
Writing train and test
Train, validation and test written


In [12]:
repres_seqs

['pdx:Psed_1303',
 'bbae:FRD01_11020',
 'rsc:RCFBP_11130',
 'tam:Theam_0428',
 'lcg:L3BBH23_07480',
 'pls:VT03_11000',
 'slit:JQC75_05665',
 'nall:PP769_14850',
 'vas:GT360_06765',
 'dfc:DFI_12930',
 'psel:GM415_17240',
 'ccys:SAMEA4530656_2798',
 'psap:JHX88_01655',
 'crn:CAR_c07270',
 'fbt:D770_25315',
 'gps:C427_0400',
 'rtr:RTCIAT899_PB00400',
 'tea:KUI_1544',
 'brm:Bmur_2337',
 'lagg:B0E33_27700',
 'bcq:BCQ_4950',
 'cok:COCCU_00945',
 'ppsc:EHS13_14275',
 'gba:J421_2110',
 'nwl:NWFMUON74_26060',
 'afus:EYZ66_13470',
 'barr:Bra60_007800',
 'sspa:K0I31_10290',
 'xne:XNC1_3487',
 'ntm:BTDUT50_03040',
 'pom:MED152_13219',
 'wik:H8R18_00680',
 'enn:FRE64_13905',
 'blq:L21SP5_03704',
 'mbra:NMG68_00655',
 'hha:Hhal_0255',
 'pls:VT03_02515',
 'pmao:PMYSY11_0102',
 'bbw:BDW_03585',
 'sgd:ELQ87_17580',
 'mkr:MKOR_14900',
 'jet:L3J17_05400',
 'cko:CKO_03471',
 'spsw:Sps_03623',
 'hyh:D3Y59_15535',
 'cmet:K6K41_10920',
 'mmar:MODMU_4196',
 'tsb:HMY34_07245',
 'lpil:LIP_1011',
 'rtg:NCTC13098

Now we want to split the clusters we have to share them across train/test and validation. However we have a second constraint : the model must be trained on at least one of each label to make relevant predictions.

In [6]:
from sklearn.model_selection import train_test_split

def split_data(data, valid_ratio=0.15, test_ratio=0.15, random_state=None):
    """
    Split data into train, validation, and test sets using scikit-learn's train_test_split function.
    
    Parameters:
        data (list or array-like): The data to be split.
        valid_ratio (float): Ratio of the data to be allocated to the validation set.
        test_ratio (float): Ratio of the data to be allocated to the test set.
        random_state (int or None): Random seed for reproducibility.
        
    Returns:
        tuple: Three lists/arrays containing the train, validation, and test sets, respectively.
    """
    # First, split data into train and test
    train_data, test_data = train_test_split(data, test_size=test_ratio, random_state=random_state)
    
    # Then, split train_data into train and validation
    train_data, valid_data = train_test_split(train_data, test_size=valid_ratio/(1-test_ratio), random_state=random_state)
    
    return train_data, valid_data, test_data

# Example usage:
train_rep_seqs, valid_rep_seqs, test_rep_seqs = split_data(repres_seqs, valid_ratio=0.10, test_ratio=0.10, random_state=1)

#Here we get all the sequences that are part of our training, validation and test clusters
training_sequences_cluster = clusters.filter(pl.col('mbas:ALGA_4158').is_in(train_rep_seqs))
valid_sequences_cluster = clusters.filter(pl.col('mbas:ALGA_4158').is_in(valid_rep_seqs))
test_sequences_cluster = clusters.filter(pl.col('mbas:ALGA_4158').is_in(test_rep_seqs))

In [7]:
len(train_rep_seqs)

466773

In [8]:
train_sequences=training_sequences_cluster.get_column('mbas:ALGA_4158_duplicated_0').to_list() #taking genes of the second column
valid_sequences=valid_sequences_cluster.get_column('mbas:ALGA_4158_duplicated_0').to_list()
test_sequences=test_sequences_cluster.get_column('mbas:ALGA_4158_duplicated_0').to_list()

In [9]:
train_sequences

['cft:CFF04554_0476',
 'cff:CFF8240_0490',
 'cfv:CFVI03293_0458',
 'cft:CFF04554_0454',
 'cfx:CFV97608_0456',
 'cfv:CFVI03293_0456',
 'cfz:CSG_5600',
 'cfz:CSG_5620',
 'camp:CFT03427_0470',
 'camp:CFT03427_0470',
 'camp:CFT03427_0471',
 'camp:CFT03427_0471',
 'cff:CFF8240_0456',
 'cfv:CFVI03293_0460',
 'cff:CFF8240_0464',
 'cft:CFF04554_0942',
 'camp:CFT03427_0934',
 'camp:CFT03427_0934',
 'cff:CFF8240_0941',
 'chyo:CHH_0994',
 'cfp:CR44_04745',
 'cfz:CSG_10500',
 'cfx:CFV97608_1002',
 'cfv:CFVI03293_0813',
 'caj:CIG1485E_0984',
 'camy:CSUIS_1179',
 'clx:CLAN_1093',
 'cgra:CGRAC_0307',
 'cinf:CINF_1431',
 'apoc:APORC_0659',
 'athr:ATH_0662',
 'tnr:Thena_0920',
 'taci:TDSAC_0933',
 'cft:CFF04554_1636',
 'cfz:CSG_17680',
 'camp:CFT03427_1586',
 'camp:CFT03427_1586',
 'cfp:CR44_08060',
 'cfx:CFV97608_1761',
 'cff:CFF8240_1634',
 'cfv:CFVI03293_1658',
 'cinf:CINF_0427',
 'haei:MUN82_01445',
 'hcf:MUN80_11000',
 'chrj:CHRYMOREF3P_1850',
 'hvl:MUN86_05180',
 'fag:K5I29_00370',
 'chrc:QGN23_0

In [11]:
train_rep_seqs_df = df_polars.filter(pl.col('gene_id').is_in(train_rep_seqs)) #seeing if representative sequences cover a lot of the KOs 
train_seqs_df =  df_polars.filter(pl.col('gene_id').is_in(train_sequences)) #seeing if train clustered sequences cover much more ko

raw_valid_rep_seqs_df = df_polars.filter(pl.col('gene_id').is_in(valid_rep_seqs)) #seeing if representative sequences cover a lot of the KOs 
valid_seqs_df =  df_polars.filter(pl.col('gene_id').is_in(valid_sequences)) #seeing if train clustered sequences cover much more ko


raw_test_rep_seqs_df = df_polars.filter(pl.col('gene_id').is_in(test_rep_seqs)) #seeing if representative sequences cover a lot of the KOs 
test_seqs_df =  df_polars.filter(pl.col('gene_id').is_in(test_sequences)) #seeing if train clustered sequences cover much more ko

In [12]:
train_rep_seqs_df.select(('K0')).n_unique()

10274

In [13]:
#This is the numer
counting_train_kos = train_rep_seqs_df.select('K0').to_series().value_counts().sort(by='counts').rename({"counts": "occurrences"})
counting_train_kos.select('occurrences').to_series().value_counts().sort(by='counts') #1514 KOs have only one sequence

occurrences,counts
u32,u32
196,1
209,1
210,1
212,1
216,1
247,1
249,1
254,1
257,1
261,1


In [14]:
## do the same for validation and test but looking at the numbers of common KOs
train_kos = set(train_rep_seqs_df.select('K0').to_series().to_list())
valid_kos = set(raw_valid_rep_seqs_df.select('K0').to_series().to_list())
test_kos = set(raw_test_rep_seqs_df.select('K0').to_series().to_list())


print(f'Number of train KOs : {len(train_kos)}, Number of valid KOs : {len(valid_kos)}, Number of test KOs : {len(test_kos)}') 

def count_unique_values(list1, list2):
    count = 0
    for value in list1:
        if value not in list2:
            count += 1
    return count

lost_kos_valid = count_unique_values(valid_kos, train_kos)
lost_kos_test = count_unique_values(test_kos, train_kos)

valid_rep_seqs_df = raw_valid_rep_seqs_df.filter(pl.col('K0').is_in(train_kos))
test_rep_seqs_df = raw_test_rep_seqs_df.filter(pl.col('K0').is_in(train_kos))
print(f'To see all KOs in training, we had to suppress {lost_kos_valid} KOs in validation set and {lost_kos_test} in test set')

Number of train KOs : 10274, Number of valid KOs : 6804, Number of test KOs : 6787
To have all seen KOs in training, we had to suppress 171 KOs in validation set and 175 in test set


In [15]:
train_rep_seqs_df.write_parquet("rep_train_seqs.parquet")
valid_rep_seqs_df.write_parquet("rep_valid_seqs_df.parquet")
test_rep_seqs_df.write_parquet("rep_test_seqs.parquet")

In [19]:
nb_kos_rep=train_rep_seqs_df.select('K0').n_unique()
nb_kos_seqs=train_seqs_df.select('K0').n_unique()
print(f"With {train_seqs_df.shape[0] - train_rep_seqs_df.shape[0]} more lines in the dataset \
containing all the training sequences, the representative sequences of the clusters have {nb_kos_seqs - nb_kos_rep} less KOs")

print(f'Indeed cluster representative sequences have {nb_kos_rep} KOs, while the total training sequences have {nb_kos_seqs}')

With 11957441 more lines in the dataset containing all the training sequences, the representative sequences of the clusters have 769 less KOs
Indeed cluster representative sequences have 10274 KOs, while the total training sequences have 11043


In [96]:
training_seqs_polars

gene_id,K0,AA_seq
str,str,str
"""pfi:PFC_00005""","""K02035""","""MRKKLVGILTILVA…"
"""pfi:PFC_00010""","""K02033""","""MGLGKYLLIRAINA…"
"""pfi:PFC_00015""","""K02034""","""MRWVDLKESLSDFW…"
"""pfi:PFC_00035""","""K06859""","""MYKEPFGVKVDFET…"
"""pfi:PFC_00040""","""K23264""","""MRWRVRVLVRLKEG…"
"""pfi:PFC_00045""","""K23265""","""MVKFAVVVFPGTNC…"
"""pfi:PFC_00050""","""K23269""","""MFPHEEKLIREKLG…"
"""pfi:PFC_00060""","""K23997""","""MKIEDVYIWDINAK…"
"""pfi:PFC_00065""","""K01681""","""MYLEEKDGIKFYSL…"
"""pfi:PFC_00075""","""K01647""","""MNTEKYLAKGLEDV…"


In [105]:
training_seqs_polars.select('K0').to_series().value_counts().sort(by='counts')

K0,counts
str,u32
"""K26351""",1
"""K16966""",1
"""K20297""",1
"""K20377""",1
"""K07886""",1
"""K10293""",1
"""K27042""",1
"""K09250""",1
"""K18137""",1
"""K05584""",1


In [98]:
# Trying to check which of the KOs qre too much in our dataset

