In [63]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils.ProtParam import ProtParamData
from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
from quantiprot.metrics.basic import average
import difflib
from sklearn.feature_extraction.text import CountVectorizer as CVec

In [5]:
df=pd.read_csv("train.csv", index_col="seq_id")


In [16]:
df["fasta"]=">" + df.index.astype(str) +"\n"+ df["protein_sequence"]

In [25]:
np.savetxt("fasta.txt", df["fasta"].values, fmt='%s')

In [44]:
df_test=pd.read_csv("test.csv", index_col="seq_id")

In [27]:
df_test["fasta"]=">" + df_test.index.astype(str) +"\n"+ df_test["protein_sequence"]

In [28]:
np.savetxt("test.fasta", df_test["fasta"].values, fmt='%s')

In [132]:
analysed_seq = ProteinAnalysis(df.iloc[1]["protein_sequence"])

In [143]:
analysed_seq.flexibility()

[1.0370119047619046,
 1.018095238095238,
 1.0032261904761903,
 1.0253571428571429,
 1.045154761904762,
 1.0449404761904764,
 1.0419404761904763,
 1.0465476190476193,
 1.0300357142857144,
 1.0341666666666667,
 1.0378809523809525,
 1.00875,
 1.0188690476190476,
 1.0077857142857143,
 0.9874404761904763,
 1.0219523809523807,
 1.0007023809523812,
 1.0151309523809524,
 0.9918928571428574,
 1.038714285714286,
 1.0298333333333334,
 1.0542976190476192,
 1.0450000000000002,
 1.067047619047619,
 1.0611071428571428,
 1.0445714285714285,
 1.0412261904761906,
 1.0338095238095237,
 1.0371309523809522,
 1.0293452380952381,
 1.0165119047619047,
 1.0298571428571426,
 1.0139166666666666,
 1.0214642857142857,
 1.0162142857142857,
 1.0135714285714286,
 1.0257023809523809,
 0.9887023809523808,
 1.0262738095238095,
 1.0122619047619048,
 1.0004523809523809,
 0.9910238095238096,
 1.0205357142857143,
 0.9961309523809524,
 1.0144761904761903,
 1.0010357142857143,
 1.0081309523809525,
 1.0023333333333333,
 1.0117

In [2]:
def analize_prot(prot):
    comp={}
    analysed_seq = ProteinAnalysis(prot)
    MW=  analysed_seq.molecular_weight()
    comp["MW"]=MW
    Gravity = analysed_seq.gravy()
    comp["Grav"]=Gravity
    aa_composition = analysed_seq.count_amino_acids()
    comp["aa"]=aa_composition
    aa_percentage=analysed_seq.get_amino_acids_percent()
    comp["aa_per"]=aa_percentage
    HP = analysed_seq.protein_scale(window=7, param_dict=ProtParamData.kd)
    comp["HP"]=HP
    Sec_Str = analysed_seq.secondary_structure_fraction()
    comp["second"]=Sec_Str
    arm = analysed_seq.aromaticity()
    comp["arm"]=arm
    inest = analysed_seq.instability_index()
    comp["inest"]=inest
    flex = analysed_seq.flexibility()
    comp["flex"]= flex
    return comp

In [95]:
analize_prot(df.iloc[1]["protein_sequence"])["second"][2]

0.36713286713286714

In [None]:
msa=[]
for records in SeqIO.parse("all.msa", "fasta"):
    ident=records.id
    sequence=records.seq
    csv_format=str(ident+","+sequence)
with open('all_msa.csv', 'w') as f:
    for item in msa:
        f.write("%s\n" % item)

In [70]:
df.drop(["fasta", "data_source"], axis=1, inplace=True)

In [73]:
df_test.drop(["fasta", "data_source"], axis=1, inplace=True)

In [85]:
df["MW"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["MW"])

In [83]:
df_test["MW"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["MW"])

In [86]:
df["Grav"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["Grav"])

In [87]:
df_test["Grav"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["Grav"])

In [88]:
df["aa_per"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["aa_per"])

In [45]:
df_test["aa_per"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["aa_per"])

In [96]:
df["second_str_0"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["second"][0])
df["second_str_1"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["second"][1])
df["second_str_2"]= df["protein_sequence"].apply(lambda x: analize_prot(x)["second"][2])

In [97]:
df_test["second_str_0"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["second"][0])
df_test["second_str_1"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["second"][1])
df_test["second_str_2"]= df_test["protein_sequence"].apply(lambda x: analize_prot(x)["second"][2])

In [100]:
df.to_csv("training_analized.csv")

In [101]:
df_test.to_csv("test_analized.csv")

In [103]:
df_complete=df.join(pd.json_normalize(df["aa_per"]))

In [46]:
df_test_complete=df_test.join(pd.json_normalize(df_test["aa_per"]))

In [49]:
pd.json_normalize(df_test["aa_per"]).to_csv("aa_prop.csv")

In [106]:
df_test_complete.drop(["aa_per"], axis=1, inplace=True)

In [108]:
df_complete.drop(["aa_per"], axis=1, inplace=True)

In [117]:
def Average(lst):
    try:
        average=sum(lst) / len(lst)
    except:
        average=np.nan
    return average

In [115]:
Average(analize_prot(df.iloc[1]["protein_sequence"])["HP"])

-1.1171428571428568

In [118]:
df_complete["HP_mean"]= df_complete["protein_sequence"].apply(lambda x: Average(analize_prot(x)["HP"]))

In [119]:
df_test_complete["HP_mean"]= df_test_complete["protein_sequence"].apply(lambda x: Average(analize_prot(x)["HP"]))

In [121]:
df_complete.to_csv("training_analized.csv")

In [122]:
df_test_complete.to_csv("test_analized.csv")

In [126]:
enumerate(difflib.ndiff(df.iloc[1]["protein_sequence"], wild_type))

<enumerate at 0x7f7d293aea00>

In [3]:
df_training=pd.read_csv("training_analized.csv", index_col="seq_id")


In [4]:
df_test=pd.read_csv("test_analized.csv", index_col="seq_id")

In [8]:
df_training.head(1)["protein_sequence"]

seq_id
0    AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...
Name: protein_sequence, dtype: object

In [10]:
df_peptides=pd.read_csv("Training_peptides.csv")


In [18]:
df_test_peptides=pd.read_csv("Test_peptides.csv")

In [14]:
df_training_peptides= pd.concat([df_training, df_peptides], axis=1)

In [28]:
df_test_peptides= pd.concat([df_test, df_test_peptides], axis=1)

In [21]:
df_test_peptides["seq_id"]=df_test_peptides["Unnamed: 0"]+31390

In [24]:
df_test_peptides.drop(columns=["Unnamed: 0"], inplace=True)

In [26]:
df_test_peptides.set_index('seq_id', inplace=True)

In [29]:
df_test_peptides

Unnamed: 0_level_0,protein_sequence,pH,MW,Grav,second_str_0,second_str_1,second_str_2,A,C,D,...,VHSE4,VHSE5,VHSE6,VHSE7,VHSE8,Z1,Z2,Z3,Z4,Z5
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23910.2619,-0.771041,0.230769,0.330317,0.180995,,,,...,0.053439,-0.098416,-0.150633,0.236290,0.022127,0.523891,-0.383801,0.038235,-0.445747,0.425928
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23909.3202,-0.772851,0.230769,0.330317,0.176471,,,,...,0.055430,-0.081222,-0.146833,0.239548,0.022624,0.520181,-0.380950,0.027466,-0.425249,0.428462
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,23781.1479,-0.758636,0.231818,0.331818,0.177273,,,,...,0.052045,-0.089045,-0.150545,0.233227,0.022136,0.512136,-0.386727,0.038909,-0.433955,0.429000
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23869.2761,-0.709050,0.235294,0.330317,0.180995,,,,...,0.043620,-0.095068,-0.153665,0.217873,0.017783,0.483891,-0.402443,0.060226,-0.441176,0.417466
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23913.3070,-0.707692,0.239819,0.330317,0.180995,,,,...,0.043846,-0.093937,-0.157828,0.219140,0.017738,0.460995,-0.386109,0.048054,-0.439548,0.426652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,-0.725792,0.239819,0.330317,0.176471,,,,...,0.044208,-0.086380,-0.159231,0.230181,0.020814,0.471765,-0.388190,0.021538,-0.438416,0.426154
33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,-0.728959,0.239819,0.330317,0.180995,,,,...,0.048733,-0.086742,-0.158145,0.231267,0.018597,0.470000,-0.386244,0.022534,-0.437873,0.428778
33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23937.3302,-0.761991,0.235294,0.334842,0.176471,,,,...,0.055475,-0.090226,-0.148100,0.234208,0.017783,0.503167,-0.373032,0.033982,-0.439819,0.432262
33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23920.3428,-0.753394,0.235294,0.334842,0.176471,,,,...,0.052579,-0.087783,-0.158009,0.230045,0.037511,0.481855,-0.379140,0.037602,-0.431448,0.434027


In [60]:
df_training_peptides.to_csv("training_analized_complete.csv")

In [31]:
df_test_peptides.to_csv("test_analized_complete.csv")

In [40]:
for a in df_training_peptides.columns:
    if a in df_test_peptides.columns:
        pass
    else:
        print(a)

tm
Unnamed: 0


In [43]:
df_training_peptides

Unnamed: 0,protein_sequence,pH,tm,MW,Grav,second_str_0,second_str_1,second_str_2,A,C,...,VHSE4,VHSE5,VHSE6,VHSE7,VHSE8,Z1,Z2,Z3,Z4,Z5
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,36320.7244,0.148094,0.316716,0.211144,0.351906,0.131965,0.002933,...,-0.104340,-0.084897,-0.458152,0.356921,-0.042903,-0.035425,-0.902229,-0.435630,-0.426745,0.298328
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,32837.9931,-1.089510,0.213287,0.160839,0.367133,0.097902,0.000000,...,0.050385,-0.200629,-0.145559,0.449720,-0.010769,0.894091,-0.315175,-0.659545,-0.652028,0.271154
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,53428.8034,-0.710463,0.209256,0.267606,0.213280,0.100604,0.018109,...,0.129396,-0.086821,-0.146801,0.353561,-0.056559,0.737706,-0.656640,-0.227183,-0.410584,0.198712
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,29475.5996,-0.507925,0.267925,0.215094,0.298113,0.075472,0.018868,...,-0.102755,-0.245962,-0.195623,0.206415,0.083245,0.353132,-0.460755,-0.172792,-0.596981,0.298906
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,158761.9814,-0.400896,0.271537,0.292901,0.206065,0.059269,0.009649,...,-0.176340,-0.106120,-0.238305,0.096506,0.191275,0.208746,-0.537981,-0.159304,-0.460868,0.345824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,61997.6230,-0.538434,0.287796,0.242259,0.224044,0.060109,0.021858,...,0.055009,-0.073643,-0.126940,0.259508,-0.019199,0.349144,-0.353843,-0.262350,-0.296412,0.197122
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,52637.6897,-0.098934,0.324094,0.217484,0.270789,0.078891,0.010661,...,-0.178188,-0.054606,-0.216162,0.140661,-0.009211,-0.007122,-0.458849,-0.284179,-0.322836,0.258571
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,14203.9225,-0.332813,0.265625,0.250000,0.257812,0.101562,0.007812,...,-0.028516,-0.102891,-0.200234,0.116875,-0.086797,0.104766,-0.459375,-0.057344,-0.331484,0.221641
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,64367.8724,-0.138111,0.288364,0.268128,0.259696,0.079258,0.008432,...,-0.105481,-0.119359,-0.265885,0.189494,-0.009106,0.130219,-0.659848,-0.183626,-0.447218,0.267167


In [54]:
df_training_peptides=pd.read_csv("test_analized_complete_2.csv", index_col="seq_id")

In [55]:
df_training_peptides

Unnamed: 0_level_0,protein_sequence,pH,MW,Grav,second_str_0,second_str_1,second_str_2,A,C,D,...,VHSE4,VHSE5,VHSE6,VHSE7,VHSE8,Z1,Z2,Z3,Z4,Z5
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23910.2619,-0.771041,0.230769,0.330317,0.180995,0.099548,0.018100,0.067873,...,0.053439,-0.098416,-0.150633,0.236290,0.022127,0.523891,-0.383801,0.038235,-0.445747,0.425928
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23909.3202,-0.772851,0.230769,0.330317,0.176471,0.099548,0.018100,0.067873,...,0.055430,-0.081222,-0.146833,0.239548,0.022624,0.520181,-0.380950,0.027466,-0.425249,0.428462
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,23781.1479,-0.758636,0.231818,0.331818,0.177273,0.100000,0.018182,0.068182,...,0.052045,-0.089045,-0.150545,0.233227,0.022136,0.512136,-0.386727,0.038909,-0.433955,429.000000
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23869.2761,-0.709050,0.235294,0.330317,0.180995,0.099548,0.022624,0.067873,...,0.043620,-0.095068,-0.153665,0.217873,0.017783,0.483891,-0.402443,0.060226,-0.441176,0.417466
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23913.3070,-0.707692,0.239819,0.330317,0.180995,0.099548,0.018100,0.067873,...,0.043846,-0.093937,-0.157828,0.219140,0.017738,0.460995,-0.386109,0.048054,-0.439548,0.426652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,-0.725792,0.239819,0.330317,0.176471,0.095023,0.018100,0.067873,...,0.044208,-0.086380,-0.159231,0.230181,0.020814,0.471765,-0.388190,0.021538,-0.438416,0.426154
33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,-0.728959,0.239819,0.330317,0.180995,0.095023,0.018100,0.067873,...,0.048733,-0.086742,-0.158145,0.231267,0.018597,0.470000,-0.386244,0.022534,-0.437873,0.428778
33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23937.3302,-0.761991,0.235294,0.334842,0.176471,0.095023,0.018100,0.067873,...,0.055475,-0.090226,-0.148100,0.234208,0.017783,0.503167,-0.373032,0.033982,-0.439819,0.432262
33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23920.3428,-0.753394,0.235294,0.334842,0.176471,0.095023,0.018100,0.067873,...,0.052579,-0.087783,-0.158009,0.230045,0.037511,0.481855,-0.379140,0.037602,-0.431448,0.434027


In [61]:
df_training_peptides=pd.read_csv("training_analized_complete.csv", index_col="seq_id")

In [66]:
df_training_peptides

Unnamed: 0_level_0,protein_sequence,pH,tm,MW,Grav,second_str_0,second_str_1,second_str_2,A,C,...,VHSE4,VHSE5,VHSE6,VHSE7,VHSE8,Z1,Z2,Z3,Z4,Z5
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,36320.7244,0.148094,0.316716,0.211144,0.351906,0.131965,0.002933,...,-0.104340,-0.084897,-0.458152,0.356921,-0.042903,-0.035425,-0.902229,-0.435630,-0.426745,0.298328
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,32837.9931,-1.089510,0.213287,0.160839,0.367133,0.097902,0.000000,...,0.050385,-0.200629,-0.145559,0.449720,-0.010769,0.894091,-0.315175,-0.659545,-0.652028,0.271154
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,53428.8034,-0.710463,0.209256,0.267606,0.213280,0.100604,0.018109,...,0.129396,-0.086821,-0.146801,0.353561,-0.056559,0.737706,-0.656640,-0.227183,-0.410584,0.198712
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,29475.5996,-0.507925,0.267925,0.215094,0.298113,0.075472,0.018868,...,-0.102755,-0.245962,-0.195623,0.206415,0.083245,0.353132,-0.460755,-0.172792,-0.596981,0.298906
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,158761.9814,-0.400896,0.271537,0.292901,0.206065,0.059269,0.009649,...,-0.176340,-0.106120,-0.238305,0.096506,0.191275,0.208746,-0.537981,-0.159304,-0.460868,0.345824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,61997.6230,-0.538434,0.287796,0.242259,0.224044,0.060109,0.021858,...,0.055009,-0.073643,-0.126940,0.259508,-0.019199,0.349144,-0.353843,-0.262350,-0.296412,0.197122
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,52637.6897,-0.098934,0.324094,0.217484,0.270789,0.078891,0.010661,...,-0.178188,-0.054606,-0.216162,0.140661,-0.009211,-0.007122,-0.458849,-0.284179,-0.322836,0.258571
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,14203.9225,-0.332813,0.265625,0.250000,0.257812,0.101562,0.007812,...,-0.028516,-0.102891,-0.200234,0.116875,-0.086797,0.104766,-0.459375,-0.057344,-0.331484,0.221641
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,64367.8724,-0.138111,0.288364,0.268128,0.259696,0.079258,0.008432,...,-0.105481,-0.119359,-0.265885,0.189494,-0.009106,0.130219,-0.659848,-0.183626,-0.447218,0.267167


In [64]:
countvect=CVec(ngram_range=(3,3), analyzer="char")

In [67]:
train_vectorizer=countvect.fit_transform(df_training_peptides["protein_sequence"])

In [73]:
pd.Series(train_vectorizer.toarray())

Exception: Data must be 1-dimensional

In [77]:
pd.DataFrame(train_vectorizer.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7990,7991,7992,7993,7994,7995,7996,7997,7998,7999
0,3,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,1,1,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
31386,1,0,0,1,0,2,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
31387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
31388,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


In [78]:
df_training_final= pd.concat([df_training_peptides, pd.DataFrame(train_vectorizer.toarray())], axis=1)

In [80]:
df_training_final.to_csv("Training_final.csv")

In [82]:
train_vectorizer=countvect.transform(df_test_peptides["protein_sequence"])

In [83]:
train_vectorizer=pd.DataFrame(train_vectorizer.toarray())

In [86]:
train_vectorizer["seq_id"]=train_vectorizer.index+31390

In [24]:
train_vectorizer.drop(columns=["Unnamed: 0"], inplace=True)

In [87]:
train_vectorizer.set_index('seq_id', inplace=True)

In [88]:
train_vectorizer

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7990,7991,7992,7993,7994,7995,7996,7997,7998,7999
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31390,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31391,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31392,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31393,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31394,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33798,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33799,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33800,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33801,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
df_test_final= pd.concat([df_test_peptides, train_vectorizer], axis=1)

In [94]:
df_test_final.to_csv("Test_final.csv")

In [93]:
df_training_final

Unnamed: 0,protein_sequence,pH,tm,MW,Grav,second_str_0,second_str_1,second_str_2,A,C,...,7990,7991,7992,7993,7994,7995,7996,7997,7998,7999
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,36320.7244,0.148094,0.316716,0.211144,0.351906,0.131965,0.002933,...,0,0,0,0,0,0,0,0,0,0
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,32837.9931,-1.089510,0.213287,0.160839,0.367133,0.097902,0.000000,...,0,0,0,0,0,0,0,0,0,0
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,53428.8034,-0.710463,0.209256,0.267606,0.213280,0.100604,0.018109,...,0,0,0,1,0,0,0,0,0,0
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,29475.5996,-0.507925,0.267925,0.215094,0.298113,0.075472,0.018868,...,0,0,0,0,0,0,0,0,0,0
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,158761.9814,-0.400896,0.271537,0.292901,0.206065,0.059269,0.009649,...,1,0,0,0,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,61997.6230,-0.538434,0.287796,0.242259,0.224044,0.060109,0.021858,...,1,0,0,0,0,0,0,0,0,0
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,52637.6897,-0.098934,0.324094,0.217484,0.270789,0.078891,0.010661,...,0,1,0,0,0,0,0,0,0,0
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,14203.9225,-0.332813,0.265625,0.250000,0.257812,0.101562,0.007812,...,0,0,0,1,0,0,0,0,0,0
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,64367.8724,-0.138111,0.288364,0.268128,0.259696,0.079258,0.008432,...,1,0,0,0,0,1,0,0,0,0
