In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def export_fasta(data, filename):
    """Write every row of ``data`` to ``filename`` as a two-line FASTA record.

    Each record consists of a header line ``>Seq<index> pH<mean_growth_PH>``
    followed by a line holding the row's ``sequence`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``mean_growth_PH`` and ``sequence`` columns; the index
        labels are used to build the ``Seq<index>`` identifiers.
    filename : str
        Destination handed unchanged to ``np.savetxt``.
    """
    # Header text: ">Seq<index> pH<value>" built with vectorised string concatenation.
    headers = ">Seq" + data.index.astype(str) + " pH" + data.mean_growth_PH.astype(str)
    # Embed a newline so that header and sequence form one element per row.
    records = headers + "\n" + data.sequence
    # With fmt="%s", savetxt writes each element followed by its default newline.
    np.savetxt(filename, records, fmt="%s")

In [3]:
# Load the data set from CSV into `data`; later cells read its row index and
# its `mean_growth_PH` and `sequence` columns. The path is relative to the
# notebook's working directory.
data = pd.read_csv("../../data/train_set.csv")

In [4]:
def split_and_export(data, threshold, prefix = ""):
    """Split ``data`` at ``threshold`` on ``mean_growth_PH`` and export both halves.

    Rows with ``mean_growth_PH`` strictly below ``threshold`` are written to
    ``{prefix}seqs_ph_lt_{threshold}.fasta``; rows with a value greater than or
    equal to ``threshold`` go to ``{prefix}seqs_ph_get_{threshold}.fasta``.
    The size of each group is printed afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame accepted by ``export_fasta`` (needs ``mean_growth_PH`` and
        ``sequence`` columns).
    threshold : number
        Cut-off compared against ``mean_growth_PH``; also embedded in the
        output file names.
    prefix : str, optional
        Text prepended to both output file names (may include a directory).
    """
    # Two independent comparisons (not a negation), so a row whose value
    # satisfies neither comparison (e.g. a missing value) lands in neither group.
    below = data.mean_growth_PH < threshold
    at_or_above = data.mean_growth_PH >= threshold

    export_fasta(data[below], f"{prefix}seqs_ph_lt_{threshold}.fasta")
    export_fasta(data[at_or_above], f"{prefix}seqs_ph_get_{threshold}.fasta")

    n_below = below.sum()
    n_at_or_above = at_or_above.sum()
    print(f"Exported for threshold {threshold} \n   {n_below} lower | {n_at_or_above}")

In [10]:
# Export the below / at-or-above FASTA pair for every integer threshold
# from 3 to 11 (inclusive), using the unfiltered `data` frame and no prefix.
for thr in range(3, 12):
    split_and_export(data, threshold=thr)

Exported for threshold 3 
   551 lower | 104450
Exported for threshold 4 
   965 lower | 104036
Exported for threshold 5 
   1825 lower | 103176
Exported for threshold 6 
   3475 lower | 101526
Exported for threshold 7 
   21356 lower | 83645
Exported for threshold 8 
   101747 lower | 3254
Exported for threshold 9 
   104082 lower | 919
Exported for threshold 10 
   104965 lower | 36
Exported for threshold 11 
   104978 lower | 23


In [13]:
# Cluster file location, relative to the notebook's working directory.
CLUSTER_FILE = "clusters/ident90"

# Read the whole file into `lines`, one entry per line (line endings kept).
with open(CLUSTER_FILE, "r") as cluster_fh:
    lines = list(cluster_fh)

FileNotFoundError: [Errno 2] No such file or directory: 'clusters/ident90'

In [14]:
# Take every second line of the cluster file (starting with the first) and
# keep its second whitespace-separated token.
# NOTE(review): presumably that token is the "Seq<index>" ID of each cluster's
# representative sequence -- confirm against the clustering tool's output format.
representative = {entry.split()[1] for entry in lines[::2]}

NameError: name 'lines' is not defined

In [25]:
# Flag each row whose ID ("Seq" + row index, the same ID that export_fasta
# writes in its FASTA header) is contained in `representative`.
# NOTE: this adds the boolean column to `data` in place, so it requires the
# cells defining `data` and `representative` to have run first.
data["representative"] = ("Seq" + data.index.astype(str)).isin(representative)

In [28]:
# Display only the rows flagged as representative.
data.loc[data.representative]

Unnamed: 0,mean_growth_PH,sequence,representative
0,7.0,MKKRAHIISFILILALLFTGCSGNKENTSKEPVKETTEKGTGNIKT...,True
1,7.0,MGKGKRKKRIALYFKRAAVAMLVMVMLLQPIPGTAGSSVKSVEAAV...,True
2,7.0,MKVNNKNSARKLLSLFLGLVLIFSTLSFSNQAAAADKGTWAPNTTY...,True
3,7.0,MRKKVTAVLTALVLTVSSILANPFAYPDAVKADTEGNPAAASNSNG...,True
4,7.0,MKRQSRIISFLVAVIMIATVIMPATVVQANASGVFIRVNQVGYKPS...,True
...,...,...,...
104996,7.8,MPYRSNTYPRANRRSKSATFSTLLTILFIASHFAFGGIPASASMDG...,True
104997,7.8,MRMGKVEVSLVVVVGASMTALAVFGVLPKDPVPPAIAKATLWTPAP...,True
104998,7.8,MTTTLPTPHRFTGLALSAALATTTVLTLSPSAVLAVPAGGYGDLVE...,True
104999,7.8,MRDFQAPGRSAVLATNGMCATSHPLAAQAAIDILKRGGNAMDAAIA...,True


In [34]:
# Same sweep over thresholds 3..11, restricted to rows flagged as
# representative; output files carry the "repr_" prefix.
for thr in range(3, 12):
    split_and_export(data.loc[data.representative], threshold=thr, prefix="repr_")

Exported for threshold 3 
   256 lower | 28085
Exported for threshold 4 
   464 lower | 27877
Exported for threshold 5 
   850 lower | 27491
Exported for threshold 6 
   2058 lower | 26283
Exported for threshold 7 
   8471 lower | 19870
Exported for threshold 8 
   25622 lower | 2719
Exported for threshold 9 
   27538 lower | 803
Exported for threshold 10 
   28305 lower | 36
Exported for threshold 11 
   28318 lower | 23


In [35]:
# Sweep thresholds 3..11 over representative rows whose mean_growth_PH is not
# exactly 7; output files carry the "not7_" prefix.
for thr in range(3, 12):
    split_and_export(
        data.loc[data.representative & data.mean_growth_PH.ne(7)],
        threshold=thr,
        prefix="not7_",
    )

Exported for threshold 3 
   256 lower | 17505
Exported for threshold 4 
   464 lower | 17297
Exported for threshold 5 
   850 lower | 16911
Exported for threshold 6 
   2058 lower | 15703
Exported for threshold 7 
   8471 lower | 9290
Exported for threshold 8 
   15042 lower | 2719
Exported for threshold 9 
   16958 lower | 803
Exported for threshold 10 
   17725 lower | 36
Exported for threshold 11 
   17738 lower | 23


In [8]:
# Load `../train.csv`, using the first CSV column as the row index.
# Later cells read its `representative` and `is7` columns.
train_data = pd.read_csv("../train.csv", index_col=0)
# Bare expression: shows the frame as the cell's output.
train_data

Unnamed: 0,mean_growth_PH,sequence,representative,is7
99973,7.40,MEANHGMNNYIKLAFVFGITTMATSYADTVAPPTLLTAQKLPQLQQ...,True,False
84793,7.00,MEFFKKTALAALVMGFSGAALALPNITILATGGTIAGGGDSATKSN...,False,True
27864,6.50,MSPLGILRRHRVAALLGAALIISPVVVSFAQSANSTGVSKIVATTQ...,True,False
46228,7.80,MKKQYWYVIITYVAMQLSSLVGVPLLAHSGFINASNKDIAISIASG...,False,False
6028,7.00,MSKKKMAITLSAMLSATIIPSFTMDVHAEKKEETKNTKIELENGMT...,False,True
...,...,...,...,...
69002,7.00,MEIIMRNLCFLLTLVATLLLHGRLIAAALPQDEKLITGQLDNGLRY...,False,True
75674,7.00,MSKHPKLLVLALACLACAGRASAAPASDEVARLAQRCAPDVSPLTM...,False,True
50377,7.45,MSRAGSLMLVLGTALWLCGCSGMNSENKRVAPVAEKRPHTMSLHGV...,True,False
87875,7.00,MSAGRLNKKSLGIVMLLSVGLLLAGCSGSKSSDTGTYSGSVYTVKR...,False,True


In [12]:
# Sweep thresholds 3..11 over the rows of `train_data` that are flagged
# representative and not flagged `is7`; files are written under
# ../clustering/splits/ with the "train_repr_" name prefix.
for thr in range(3, 12):
    split_and_export(
        train_data.loc[train_data.representative & ~train_data.is7],
        threshold=thr,
        prefix="../clustering/splits/train_repr_",
    )

Exported for threshold 3 
   205 lower | 14004
Exported for threshold 4 
   378 lower | 13831
Exported for threshold 5 
   689 lower | 13520
Exported for threshold 6 
   1656 lower | 12553
Exported for threshold 7 
   6740 lower | 7469
Exported for threshold 8 
   12028 lower | 2181
Exported for threshold 9 
   13576 lower | 633
Exported for threshold 10 
   14177 lower | 32
Exported for threshold 11 
   14187 lower | 22


In [10]:
# Display the rows of `train_data` that are flagged representative and not flagged `is7`.
train_data.loc[train_data.representative & ~train_data.is7]

Unnamed: 0,mean_growth_PH,sequence,representative,is7
99973,7.40,MEANHGMNNYIKLAFVFGITTMATSYADTVAPPTLLTAQKLPQLQQ...,True,False
27864,6.50,MSPLGILRRHRVAALLGAALIISPVVVSFAQSANSTGVSKIVATTQ...,True,False
22995,6.80,MNSKKFQWTVTSLLSAASLFISGIATVNAETYEVKSGDTLSKIALE...,True,False
56396,6.00,MQPSRKTAGIAASMAGILTATVLAVGVSLPAHAKKTPHDTPTFTDV...,True,False
88370,4.80,MLQILARRFGAGALLFMFLTGACTAAPACGPDKLGTERTLTLSTAG...,True,False
...,...,...,...,...
59028,8.00,MKRKALISLSLALSTAFLPALPTSATTEVMTTDSQGNRVETHTLPK...,True,False
19407,5.90,MRDPFALPRQLRRAALLALLALLAACAPLPPRNPLATWVPSPNHDI...,True,False
103831,6.50,MPIASLILFSRRAPQFVSSFISPTLRSASPGEGAPSGRGLGRAALA...,True,False
102833,9.00,MKYAIQLILMVLTLSVLSACGQPASTEVSPITVLMYHHFHEDAAQE...,True,False


In [None]:
# List the contents of the directory that the "train_repr_" exports were
# written to (IPython `%ls` line magic).
%ls ../clustering/splits/