In [1]:
import pandas as pd
import numpy as np
from protlearn import features
from protlearn import preprocessing
import os
from Bio import SeqIO


In [2]:
fastas = [file for file in os.listdir() if file.endswith(".fasta")]
fastas

['Bri2NNK_all_df.fasta',
 'Bri2NNK_all_df_1.fasta',
 'Bri2NNK_all_df_2.fasta',
 'Bri2NNK_all_df_3.fasta',
 'Bri2NNK_all_df_SupN.fasta']

In [3]:
# Hydropathy index (Kyte-Doolittle, 1982) KYTJ820101
# Mean polarity (Radzicka-Wolfenden, 1988) RADA880108
fastas = ['Bri2NNK_all_df.fasta']

for fasta_file in fastas:
    seqs = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        seqs.append(str(record.seq))    
    aaind, inds = features.aaindex1(seqs)
    aaindex = pd.DataFrame(aaind, columns=inds, index=seqs)
    aaindex.reset_index(inplace=True)
    aaindex["aa_seq"] = aaindex["index"]
    #aaindex["Hydrophobicity (Kite-Doolittle)"] = aaindex["KYTJ820101"]
    #aaindex = aaindex[["sequence", "Hydrophobicity (Kite-Doolittle)"]]
    #aaindex.to_csv(fasta_file.split(".")[0]+"_hydrophobicity.tsv", sep="\t")
    aaindex.to_csv(fasta_file.split(".")[0]+"_aaindex.tsv", sep="\t")


In [4]:
aaindex

Unnamed: 0,index,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,...,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122,aa_seq
0,EASNCFAIRHFENKFAVETLICKTVRCVDSWLGL,4.377941,0.973824,1.102059,1.117941,0.609706,0.751176,0.653235,0.418235,84.935294,...,16.973765,12.147059,20.558824,32.059853,6.284324,23.787941,-0.400500,9.678000,3.276206,EASNCFAIRHFENKFAVETLICKTVRCVDSWLGL
1,EASNCFAIRHFENKFAVETLICKTMSQGFQAECA,4.402353,0.840882,1.069706,1.104706,0.606176,0.743235,0.627941,0.417029,81.811765,...,16.922000,12.323529,20.264706,31.324147,6.407941,23.403618,-0.406706,9.626529,2.769235,EASNCFAIRHFENKFAVETLICKTMSQGFQAECA
2,EASNCFAIRHFENKFAVETLICKTRNASLLVRVL,4.365294,0.931176,1.150294,1.180882,0.647647,0.755588,0.622647,0.418471,85.847059,...,17.134765,12.411765,20.500000,31.560059,6.234412,23.630882,-0.406706,9.627529,3.255618,EASNCFAIRHFENKFAVETLICKTRNASLLVRVL
3,EASNCFAIRHFENKFAVETLICKRYYNKVRYVTR,4.400000,0.994118,0.844118,0.930294,0.591471,0.754118,0.661176,0.427176,90.238235,...,19.286353,13.823529,23.529412,35.265941,6.190294,24.330147,-0.496676,10.040265,3.109441,EASNCFAIRHFENKFAVETLICKRYYNKVRYVTR
4,EASNCFAIRHFENKFAVETLICKSMACTAGR,4.399032,0.861290,1.066452,1.120323,0.610323,0.742581,0.640323,0.416484,81.054839,...,16.780903,12.161290,20.290323,30.452290,6.361290,22.828387,-0.446065,9.493613,2.929290,EASNCFAIRHFENKFAVETLICKSMACTAGR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17947,EASNCFAIRHFENKFAVETLICFLYYDDGFYDCR,4.456176,1.048235,1.018529,1.065588,0.569706,0.758824,0.680882,0.419735,87.123529,...,18.394647,13.441176,22.294118,35.000618,6.334118,24.571853,-0.361824,10.068647,2.813206,EASNCFAIRHFENKFAVETLICFLYYDDGFYDCR
17948,EASNCFAIRHFENKFAVETLICFFRHRAPTACGL,4.409706,0.972059,1.111176,1.155882,0.598529,0.759706,0.640294,0.411676,84.100000,...,17.171235,12.676471,20.323529,31.971206,6.158824,23.064412,-0.452441,9.662794,3.110294,EASNCFAIRHFENKFAVETLICFFRHRAPTACGL
17949,EASNCFAIRHFENKFAVETLICFFHQASRECVVS,4.406765,0.925000,1.059118,1.079706,0.602647,0.759118,0.615588,0.412500,84.382353,...,17.432500,12.617647,20.911765,32.736324,6.392059,23.928500,-0.328118,9.831441,3.179647,EASNCFAIRHFENKFAVETLICFFHQASRECVVS
17950,EASNCFAIRHFENKFAVETLICFFDLACYVYIEG,4.398529,1.084118,1.110588,1.159412,0.612059,0.771471,0.622647,0.411824,86.014706,...,17.249824,12.676471,20.705882,33.147882,6.329706,24.004471,-0.381853,9.820941,2.797265,EASNCFAIRHFENKFAVETLICFFDLACYVYIEG


In [5]:
aaindex_hydrophobicity = aaindex[["index", "KYTJ820101"]]
aaindex_hydrophobicity.rename(columns={"index":"sequence", "KYTJ820101": "Hydrophobicity (Kite-Doolittle"}, inplace=True)
aaindex_hydrophobicity

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aaindex_hydrophobicity.rename(columns={"index":"sequence", "KYTJ820101": "Hydrophobicity (Kite-Doolittle"}, inplace=True)


Unnamed: 0,sequence,Hydrophobicity (Kite-Doolittle
0,EASNCFAIRHFENKFAVETLICKTVRCVDSWLGL,0.264706
1,EASNCFAIRHFENKFAVETLICKTMSQGFQAECA,-0.008824
2,EASNCFAIRHFENKFAVETLICKTRNASLLVRVL,0.261765
3,EASNCFAIRHFENKFAVETLICKRYYNKVRYVTR,-0.464706
4,EASNCFAIRHFENKFAVETLICKSMACTAGR,0.093548
...,...,...
17947,EASNCFAIRHFENKFAVETLICFLYYDDGFYDCR,-0.064706
17948,EASNCFAIRHFENKFAVETLICFFRHRAPTACGL,0.170588
17949,EASNCFAIRHFENKFAVETLICFFHQASRECVVS,0.211765
17950,EASNCFAIRHFENKFAVETLICFFDLACYVYIEG,0.517647


In [6]:
aaindex_hydrophobicity.to_csv("Bri2NNK_all_df_hydrophobicity.tsv", sep="\t")