In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Bio.PDB import *
from os import listdir
import requests
import json
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Per-residue lookup tables keyed by one-letter amino-acid code.
# Every property table also carries a "U" entry that maps to 0.

# Hydrophobicity index
hydrophobicity = {
    "A": 0.62, "C": 0.29, "D": -0.9, "E": -0.74, "F": 1.19,
    "G": 0.48, "H": -0.4, "I": 1.38, "K": -1.5, "L": 1.06,
    "M": 0.64, "N": -0.78, "P": 0.12, "Q": -0.85, "R": -2.53,
    "S": -0.18, "T": -0.05, "V": 1.08, "W": 0.81, "Y": 0.26,
    "U": 0,
}

# Hydrophilicity index
hydrophilicity = {
    "A": -0.5, "C": -1, "D": 3, "E": 3, "F": -2.5,
    "G": 0, "H": -0.5, "I": -1.8, "K": 3, "L": -1.8,
    "M": -1.3, "N": 0.2, "P": 0, "Q": 0.2, "R": 3,
    "S": 0.3, "T": -0.4, "V": -1.5, "W": -3.4, "Y": -2.3,
    "U": 0,
}

# NCI index
# NOTE(review): the "Y" value has one more digit than every other entry -
# verify it against the table these numbers were taken from.
nci = {
    "A": 0.007187, "C": -0.03661, "D": -0.02382, "E": 0.006802, "F": 0.037552,
    "G": 0.179052, "H": -0.01069, "I": 0.021631, "K": 0.017708, "L": 0.051672,
    "M": 0.002683, "N": 0.005392, "P": 0.239531, "Q": 0.049211, "R": 0.043587,
    "S": 0.004627, "T": 0.003352, "V": 0.057004, "W": 0.037977, "Y": 0.0323599,
    "U": 0,
}

# Polarity index
polarity = {
    "A": 8.1, "C": 5.5, "D": 13, "E": 12.3, "F": 5.2,
    "G": 9, "H": 10.4, "I": 5.2, "K": 11.3, "L": 4.9,
    "M": 5.7, "N": 11.6, "P": 8, "Q": 10.5, "R": 10.5,
    "S": 9.2, "T": 8.6, "V": 5.9, "W": 5.4, "Y": 6.2,
    "U": 0,
}

# Polarizability index
polarizability = {
    "A": 0.046, "C": 0.128, "D": 0.105, "E": 0.151, "F": 0.29,
    "G": 0, "H": 0.23, "I": 0.186, "K": 0.219, "L": 0.186,
    "M": 0.221, "N": 0.134, "P": 0.131, "Q": 0.18, "R": 0.291,
    "S": 0.062, "T": 0.108, "V": 0.14, "W": 0.409, "Y": 0.298,
    "U": 0,
}

# SASA index
sasa = {
    "A": 1.181, "C": 1.461, "D": 1.587, "E": 1.862, "F": 2.228,
    "G": 0.881, "H": 2.025, "I": 1.81, "K": 2.258, "L": 1.931,
    "M": 2.034, "N": 1.655, "P": 1.468, "Q": 1.932, "R": 2.56,
    "S": 1.298, "T": 1.525, "V": 1.645, "W": 2.663, "Y": 2.368,
    "U": 0,
}

# V index (presumably a volume: later cells name the derived column "avg_vol" - confirm)
V = {
    "A": 27.5, "C": 44.6, "D": 40, "E": 62, "F": 115.5,
    "G": 0, "H": 79, "I": 93.5, "K": 100, "L": 93.5,
    "M": 94.1, "N": 58.7, "P": 41.9, "Q": 80.7, "R": 105,
    "S": 29.3, "T": 51.3, "V": 71.5, "W": 145.5, "Y": 117.3,
    "U": 0,
}

# Amino-acid letter -> integer code (1..20, in the same letter order as above; no "U" entry).
res_to_num = {
    letter: number
    for number, letter in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}

In [3]:
# Parse every file found in ./data and index the result by the first four
# characters of the file name, which are used as the structure id.
pdb_structures = {}

# The parser does not depend on the file being read, so it is created once
# instead of once per file.
pdb_parser = PDBParser(PERMISSIVE=1)

for file in listdir("./data"):
  pdb_id = file[:4]
  structure = pdb_parser.get_structure(pdb_id, "./data/" + file)
  pdb_structures[pdb_id] = {'structure': structure}

In [None]:
# Load primary_structures.json; the context manager closes the file handle.
with open('primary_structures.json') as f:
  data = json.load(f)

# For every listed entry whose pdbid was parsed above, download its record
# and keep the 'subunits' list next to the parsed structure.
for entry in data['objects']:
  if entry['pdbid'] in pdb_structures:
    response = requests.get(
        'https://lomize-group-opm.herokuapp.com//primary_structures/' + str(entry['id']),
        timeout=30,  # do not hang the notebook on an unresponsive server
    )
    # Report HTTP errors as such instead of as a JSON decoding failure.
    response.raise_for_status()
    pdb_structures[entry['pdbid']]['subunits'] = response.json()['subunits']

In [None]:
# For every parsed structure, pick the chain with the most residues and build
# one feature row per amino-acid residue, flagging the residues whose number
# falls inside one of the segments reported for that chain.
#
# NOTE(review): `amino_acids`, `d3to1` and `charged` are read below but are not
# defined in any cell visible in this notebook; they must exist before this
# cell is run.
for key in pdb_structures:
  # Structures that were not matched in primary_structures.json never received
  # a 'subunits' entry; treat them like structures without segment information.
  subunits = pdb_structures[key].get('subunits', [])

  for model in pdb_structures[key]['structure']:
    # The result is rebuilt for every model, so the last model's rows are kept.

    # First chain with the largest number of residues.
    selected_chain = max(model, key=len)

    # Segment description for the selected chain.
    if not subunits:
      # No segment information: no residue is flagged.
      segments = ''
    else:
      # Pick the subunit whose letter matches the selected chain; when there is
      # none, fall back to the first subunit.
      matching = [su for su in subunits if su['protein_letter'] == selected_chain.id]
      segments = (matching[0] if matching else subunits[0])['segment']

    # Every "(start-end)" group is one inclusive range of residue numbers.
    tm_ranges = []
    for group in re.findall(r'\(.*?\)', segments):
      bounds = group.replace('(', '').replace(')', '').split('-')
      tm_ranges.append((int(bounds[0]), int(bounds[1])))

    seq = []
    letters = []
    # res_index counts every residue of the chain, including the skipped ones.
    for res_index, residue in enumerate(selected_chain):
      if residue.resname not in amino_acids:
        continue

      letter = d3to1[residue.resname]
      letters.append(letter)

      res_number = residue.id[1]
      is_tm_segment = any(start <= res_number <= end for start, end in tm_ranges)

      seq.append([
          letter,
          res_to_num[letter],
          res_index,
          res_number,
          hydrophobicity[letter],
          hydrophilicity[letter],
          nci[letter],
          polarity[letter],
          polarizability[letter],
          sasa[letter],
          V[letter],
          charged[letter],
          int(is_tm_segment),
      ])

    pdb_structures[key]['seq'] = {
        'sequence': ''.join(letters),
        'chain': {selected_chain.id: [seq]},
    }

In [None]:
# Collect the 'seq' entry of every structure, keyed by structure id.
temp_dict = {pdb_id: entry['seq'] for pdb_id, entry in pdb_structures.items()}

# Render as JSON with a 4-space indent and store it in pdb_structures.json.
json_object = json.dumps(temp_dict, indent=4)
with open("pdb_structures.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
# Reload the per-residue data stored in pdb_structures.json.
# Note: this rebinds `pdb_structures` to the plain dictionary read from the file.
# The context manager closes the file even if parsing fails.
with open('pdb_structures.json') as f:
  pdb_structures = json.load(f)

In [None]:
# Column layout of the per-residue table: identifiers first, then one column
# per entry of a residue row.
column_names = [
    "pdb_id", "chain_id", "sequence",
    "residue", "residue_number", "res_index", "pdb_res_index",
    'Hydrophobicity', 'Hydrophilicity', 'NCI', 'Polarity',
    'Polarizability', 'SASA', 'V',
    'is_charged', 'tm_segment',
]

# Start from an empty frame with exactly these columns.
df = pd.DataFrame(columns=column_names)

In [None]:
# Flatten the stored residue rows into one record per residue.
# The records are collected in a list and converted once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
records = []
for key in pdb_structures:
  # Use the first chain id stored for this entry.
  chain = list(pdb_structures[key]['chain'].keys())[0]
  for row in pdb_structures[key]['chain'][chain][0]:
    records.append({
        'pdb_id': key,
        'chain_id': chain,
        'sequence': pdb_structures[key]['sequence'],
        'residue': row[0],
        'residue_number': row[1],
        'res_index': row[2],
        'pdb_res_index': row[3],
        'Hydrophobicity': row[4],
        'Hydrophilicity': row[5],
        'NCI': row[6],
        'Polarity': row[7],
        'Polarizability': row[8],
        'SASA': row[9],
        'V': row[10],
        'is_charged': row[11],
        'tm_segment': row[12],
    })

# `columns=` keeps the declared column order, also when there are no records.
df = pd.DataFrame(records, columns=column_names)

In [None]:
# Display the assembled per-residue table.
df

Unnamed: 0,pdb_id,chain_id,sequence,residue,residue_number,res_index,pdb_res_index,Hydrophobicity,Hydrophilicity,NCI,Polarity,Polarizability,SASA,V,is_charged,tm_segment
0,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,D,3,0,5,-0.90,3,-0.023820,13,0.105,1.587,40,1,0
1,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,L,10,1,6,1.06,-1.8,0.051672,4.9,0.186,1.931,93.5,0,0
2,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,L,10,2,7,1.06,-1.8,0.051672,4.9,0.186,1.931,93.5,0,0
3,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,N,12,3,8,-0.78,0.2,0.005392,11.6,0.134,1.655,58.7,0,0
4,1vgo,A,DLLNDGRPETLWLGIGTLLMLIGTFYFIARGWGVTDKEAREYYAIT...,D,3,4,9,-0.90,3,-0.023820,13,0.105,1.587,40,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161310,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,V,18,211,206,1.08,-1.5,0.057004,5.9,0.140,1.645,71.5,0,0
161311,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,212,207,-1.50,3,0.017708,11.3,0.219,2.258,100,1,0
161312,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,213,208,-1.50,3,0.017708,11.3,0.219,2.258,100,1,0
161313,7b0o,A,LYFQGHMDRLITLVVSYSIAFSIFALATMAVVYGKWLYYFEIDFLN...,K,9,214,209,-1.50,3,0.017708,11.3,0.219,2.258,100,1,0


In [None]:
# Save the per-residue table as proteins_dataframe.csv.
df.to_csv("proteins_dataframe.csv")

### Filtering the DataFrame

In [None]:
# Reload the saved table, taking the first CSV column as the row index.
df = pd.read_csv('proteins_dataframe.csv', index_col=0)

In [None]:
# Add two residues that are missing from the table.
#
# The row keys must match the column names of `df` exactly; otherwise
# pd.concat creates extra, almost entirely empty columns and leaves the real
# feature columns of the inserted row as NaN.
#
# NOTE(review): the insert positions are hard-coded row offsets and are only
# valid for the exact row order of the CSV they were determined on.

def insert_row(frame, position, row):
  """Return `frame` with the one-row DataFrame `row` inserted before row `position`.

  The index of the result is renumbered from 0.
  """
  return pd.concat([frame.iloc[:position], row, frame.iloc[position:]]).reset_index(drop=True)


new_row = pd.DataFrame([{
    'pdb_id': '4p79',
    'chain_id': 'A',
    'sequence': "SEFSVAVETFGFFSALGLLLGLTLSNSYWRVSTNTIFENLWYSCATDSLGVSNCWDFPSLALSGYVQGCRALITAILLGFLGLFLGVGLRATNVGNDLSKKAKLLAIAGTLHILAGACGVAISWYAVNITTDFFNPLYAGTKYELGPALYLGWSASLLSILGGICVFSTAAAS",
    'residue': 'S',
    'residue_number': 16,
    'res_index': 3,
    'pdb_res_index': 1,
    'Hydrophobicity': -0.18,
    'Hydrophilicity': 0.3,
    'NCI': 0.004627,
    'Polarity': 9.2,
    'Polarizability': 0.062,
    'SASA': 1.298,
    'V': 29.3,
    'is_charged': 0,
    'tm_segment': 1,
}])

df = insert_row(df, 84562, new_row)

new_row1 = pd.DataFrame([{
    'pdb_id': '4dve',
    'chain_id': 'A',
    'sequence': "ATNNQKVKTLTYSAFTAFIIILGFLPGIPIGFIPVPIILQNGIAGGLLGPKYGTISVGAFLALALIGLPVLTGGNGGAASFLGPSGGYRIAWLFTPFLIGFFLKKLKITTSQNWFGELIIVLLFGVIFVDFVGAIWLSFQSNIPLLTSLISNLVFIPGDCIKAILTVVIVRRLRKQGGFELYFR",
    'residue': 'A',
    'residue_number': 1,
    'res_index': 1,
    'pdb_res_index': 1,
    'Hydrophobicity': 0.62,
    'Hydrophilicity': -0.5,
    'NCI': 0.007187,
    'Polarity': 8.1,
    'Polarizability': 0.046,
    'SASA': 1.181,
    'V': 27.5,
    'is_charged': 0,
    'tm_segment': 0,
}])

# This offset refers to the table after the first insertion.
df = insert_row(df, 111654, new_row1)

In [None]:
# Remove every sequence in which no residue is flagged as a transmembrane segment.

# Sum of tm_segment per sequence, in order of first appearance.
seq_segm = df.groupby('sequence', sort=False)['tm_segment'].sum().to_dict()

# A total of 0 means that none of the sequence's residues is flagged.
outside_seq = [sequence for sequence, tm_total in seq_segm.items() if tm_total == 0]

df = df[~df['sequence'].isin(outside_seq)]

In [None]:
# Keep only sequences with at least 15 and fewer than 800 residues.
seq_len = df['sequence'].str.len()
df = df[(seq_len < 800) & (seq_len >= 15)]

In [None]:
# Histogram of the lengths of the distinct sequences that remain.
unique_seq = df['sequence'].unique()
len_of_sequences = list(map(len, unique_seq))

plt.hist(len_of_sequences)
plt.title("Length of sequences")
plt.show()

### Create a dataframe with the neighbors' properties

In [None]:
# Create a new dataframe with the columns for the neighbor residues.
# A copy is taken so that `df` keeps its own columns and this cell can be
# re-run: inserting into an alias of `df` fails the second time because the
# columns already exist.
new_df = df.copy()

# One column per residue letter, plus "U".
new_col = list(res_to_num.keys())
new_col.append("U")
# One column per averaged property.
new_col2 = ["avg_hydrophobicity", "avg_hydrophilicity", "avg_nci", "avg_polarity", "avg_polarizability", "avg_sasa", "avg_vol"]

# Plain list concatenation keeps the labels as ordinary Python strings.
columns = new_col + new_col2

# Insert the zero-filled columns one after another, starting at column position 7.
for offset, col in enumerate(columns):
  new_df.insert(7 + offset, col, np.zeros(len(new_df)))

In [None]:
# Empty frame with the same columns as new_df.
final_df = pd.DataFrame(columns = new_df.columns)
final_df

In [None]:
def neighbor_window_features(residues, half_window=5):
  """Compute the neighbor-window feature columns for one protein.

  `residues` holds the one-letter codes of the protein in sequence order. The
  window of a residue spans `half_window` positions on each side and is cut
  off at both ends of the sequence.

  Returns a dict that maps a column name to a float array with one value per
  residue:
    * one column per letter of `res_to_num`: number of neighbors (the residue
      itself excluded) that carry this letter;
    * "U": number of window positions that fall outside the sequence (plus
      neighbors whose letter is "U", if any);
    * "avg_*": mean of the property over the residues present in the window,
      the residue itself included.

  Raises KeyError when a letter is missing from a property table.
  """
  property_tables = {
      "avg_hydrophobicity": hydrophobicity,
      "avg_hydrophilicity": hydrophilicity,
      "avg_nci": nci,
      "avg_polarity": polarity,
      "avg_polarizability": polarizability,
      "avg_sasa": sasa,
      "avg_vol": V,
  }
  letter_columns = list(res_to_num.keys()) + ["U"]

  residues = list(residues)
  count = len(residues)
  letters = np.asarray(residues, dtype=str)

  # window_pos[r, k] is the sequence position at offset k from residue r.
  offsets = np.arange(-half_window, half_window + 1)
  window_pos = np.arange(count)[:, None] + offsets[None, :]
  in_sequence = (window_pos >= 0) & (window_pos < count)
  # Clip so that slots outside the sequence can be indexed; they are masked out below.
  safe_pos = np.clip(window_pos, 0, max(count - 1, 0))
  # Neighbors are the slots inside the sequence other than the residue itself.
  is_neighbor = in_sequence & (offsets != 0)

  features = {}

  window_letters = letters[safe_pos]
  for letter in letter_columns:
    features[letter] = (is_neighbor & (window_letters == letter)).sum(axis=1).astype(float)
  features["U"] = features["U"] + (~in_sequence).sum(axis=1)

  # Number of residues actually present in each window (always >= 1).
  present = in_sequence.sum(axis=1)
  for column, table in property_tables.items():
    values = np.array([table[letter] for letter in residues], dtype=float)
    window_values = np.where(in_sequence, values[safe_pos], 0.0)
    features[column] = window_values.sum(axis=1) / present

  return features


pdbs = new_df['pdb_id'].unique()

# Compute the features protein by protein so that windows never cross from one
# protein into the next. The rows of one pdb_id are taken to be in sequence order.
frames = []
for pdb in pdbs:
  temp_df = new_df.loc[new_df['pdb_id'] == pdb].copy()

  # Whole-column assignment; element-wise chained assignment (frame[col][row] = x)
  # is not guaranteed to write into the frame.
  for column, values in neighbor_window_features(temp_df['residue'].tolist()).items():
    temp_df[column] = values

  frames.append(temp_df)

# Concatenate once at the end instead of once per protein.
final_df = pd.concat(frames, axis=0) if frames else pd.DataFrame(columns=new_df.columns)

In [None]:
# Save the table with the neighbor columns as final_proteins_dataframe.csv.
final_df.to_csv("final_proteins_dataframe.csv")