<a target="_blank" href="https://colab.research.google.com/github/AndreiSokolovskii/Protein_ddG_workshop/blob/main/aggregation_propensity/Aggregation_propensity_estimator.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# @title ## Enter data
# @markdown Enter your amino acid sequence in the form below.
import os
import glob
# One-time environment setup. The INIT_STEP marker file makes later runs of this
# cell skip the whole block.
if not os.path.isfile("INIT_STEP"):
  # -O pins the output name, so a retry overwrites a broken download instead of
  # keeping it and saving the fresh copy as lib.tar.xz.1.
  if os.system('wget -O lib.tar.xz https://github.com/AndreiSokolovskii/Protein_ddG_workshop/raw/refs/heads/main/aggregation_propensity/lib.tar.xz') != 0:
    raise RuntimeError('Could not download lib.tar.xz; check the network connection and re-run this cell.')
  os.makedirs('tmp_fold', exist_ok=True)
  os.system('cp lib.tar.xz tmp_fold')
  if os.system('tar xJf tmp_fold/lib.tar.xz -C tmp_fold') != 0:
    raise RuntimeError('Could not extract tmp_fold/lib.tar.xz; the download may be corrupted. Re-run this cell.')
  # NOTE(review): run.sh is copied from the working directory; where it comes from
  # is not visible in this notebook, so this step stays best-effort -- confirm.
  os.system('cp run.sh tmp_fold')
  os.system('chmod u+x tmp_fold/agregation_estimator')
  os.system('chmod u+x tmp_fold/run.sh')
  # Written last, so it is only reached when download and extraction succeeded.
  os.system('touch INIT_STEP')

def get_hash(x):
  """Return the hexadecimal SHA-1 digest of the string ``x``.

  ``x`` is encoded with ``str.encode()`` defaults before hashing.
  """
  from hashlib import sha1
  hasher = sha1()
  hasher.update(x.encode())
  return hasher.hexdigest()

# @markdown ---
# @markdown ### Enter a sequence:
# Single input sequence; it is written to <jobname>/seq.fasta when
# FILE_with_SEQUENCES is False.
seq = "SGSDEDQSREQMASDVANNKSSLEDGCLSCGRKNPVSFHPLFEGGLCQTCRDRFLELFYMYDDDGYQSYCTVCCEGRELLLCSNTSCCRCFCVECLEVLVGTGTAAEAKLQEPWSCYMCLPQRCHGVLRRRKDWNVRLQAFFTSDT" # @param {type:"string", placeholder:"input sequence"}

# @markdown ---
# When True, a file is uploaded and used as input instead of `seq`.
FILE_with_SEQUENCES = False #@param {type:"boolean"}
# Prefix of the working directory; a 5-character hash of the input is appended below.
jobname = "test"  #@param {type:"string"}
# @markdown ---
# @markdown ### Default values:
# The three values below are substituted into line 8 of the job's run.sh.
pH = 7.0 # @param {type:"number"}
temperature_in_Kelvin = 298 # @param {type:"number"}
ionic_strength_in_M = 0.02 # @param {type:"number"}
# Build the job directory and the per-sequence FASTA files, either from an
# uploaded file or from the single `seq` form field.
if FILE_with_SEQUENCES:
  from google.colab import files
  uploaded = files.upload()
  # Every uploaded file is renamed to sequences.fa, so if several files are
  # uploaded only the last one survives.
  for fn in uploaded.keys():
    os.rename(fn, 'sequences.fa')

  import sys
  import glob
  import shutil

  fastas = 'sequences.fa'

  # Context manager so the file handle is closed after reading.
  with open(fastas, 'r') as fasta_file:
    lines = fasta_file.readlines()
  # The job name suffix is a hash of the file content with newlines removed.
  tmp_l = "".join("".join(lines).split('\n'))
  jobname += "_" + get_hash(tmp_l)[:5]
  os.makedirs(jobname, exist_ok=True)
  # Copy without a shell: jobname is free text typed by the user.
  shutil.copy(fastas, jobname)

  # Assumes a strict two-line FASTA layout (header line, then one sequence line).
  # NOTE(review): as in the original code, the first record (lines 0-1) is NOT
  # written to its own file -- confirm whether that is intended.
  # Stopping at len(lines) - 1 skips a trailing header that has no sequence line
  # instead of raising IndexError.
  for i in range(2, len(lines) - 1, 2):
    record_name = lines[i].strip()[1:]
    with open(jobname + '/' + record_name + '.fasta', 'w') as new_fa:
      new_fa.write(lines[i])
      new_fa.write(lines[i+1])
else:
  jobname += "_"+get_hash(seq)[:5]
  os.makedirs(jobname, exist_ok=True)
  # Single-sequence job: the job name doubles as the FASTA header.
  with open(jobname+'/seq.fasta', 'w') as new_fa:
    new_fa.write('>'+jobname+'\n')
    new_fa.write(seq)

import shlex

# jobname comes from a free-text form field; quote it once so spaces or shell
# metacharacters cannot break (or inject into) the commands below.
job_dir = shlex.quote(jobname)

# Copy the contents of tmp_fold into the job directory.
os.system(f'cp tmp_fold/* {job_dir}')

# `&&` instead of `;` so run.sh is not started from the wrong directory when cd fails.
cmd = 'cd ' + job_dir + ' && ' + './run.sh'
# Fill in the " Temp ", " pH " and " io " placeholders on line 8 of the job's run.sh.
os.system(f'''sed -i '8s/ Temp / {int(temperature_in_Kelvin)} /' {job_dir}/run.sh''')
# float() gives the same text as before for float input ("7.0") and also accepts
# an integer pH such as 7, which previously raised IndexError on split('.')[1].
os.system(f'''sed -i '8s/ pH / {float(pH)} /' {job_dir}/run.sh''')
os.system(f'''sed -i '8s/ io / {str(ionic_strength_in_M)} /' {job_dir}/run.sh''')

# Run the job and keep its standard output.
out_ae = os.popen(cmd).read()
import pandas as pd

# Load the two whitespace-separated summary tables and join them on "Sequence".
# Raw strings keep the regex separator free of invalid-escape warnings.
agg = pd.read_csv(f"{jobname}/tango_aggregation.txt", header=0, sep=r"\s+")
agg_index = pd.read_csv(f"{jobname}/tango_index.txt", header=None, names=["description", "Sequence"], sep=r"\s+")
daggregation = pd.merge(agg, agg_index, on="Sequence")
df = daggregation
# The column name (including its spelling) is kept as is: the plotting cell and
# the Excel export read it under this exact name.
df['Aggregation_Postion'] = None
for file in glob.glob(f'{glob.escape(jobname)}/*.txt'):
  name = file.split('/')[-1].split('.')[0]
  # Only files whose stem is an integer are per-sequence outputs; the summary
  # tables read above are skipped here.
  try:
    num = int(name)
  except ValueError:
    continue
  matches = df.index[df['Sequence'] == num]
  if len(matches) == 0:
    continue
  row = matches[0]
  try:
    df.at[row, 'Aggregation_Postion'] = pd.read_csv(f'{jobname}/{name}.txt', header=0, sep=r"\s+")['Aggregation'].tolist()
  except Exception as err:
    # Still best-effort, but report the problem instead of hiding it.
    print(f'Skipping {file}: {err}')
    continue

In [None]:
# @title ## Plot results
def plot_aggregation_vs_res(df, num, jobname):
  """Plot the per-residue aggregation values of one row and save the figure.

  Args:
    df: table with an 'Aggregation_Postion' column (list of per-residue values,
      or None when no data was loaded) and a 'description' column.
    num: key of the row to plot, used as ``df[column][num]``.
    jobname: directory the PNG is written to.

  The figure is saved as ``<jobname>/<description>.png`` (any '/' in the
  description is replaced by '_'), shown, and then closed. Rows without
  per-residue data are reported and skipped. The legend shows the sum of the
  values, rounded to two decimals.
  """
  import numpy as np
  import matplotlib.pyplot as plt

  plot_title = df['description'][num]
  values = df['Aggregation_Postion'][num]
  if values is None or len(values) == 0:
    print(f'No per-residue aggregation data for {plot_title!r}; skipping plot.')
    return

  y = np.asarray(values, dtype=float)
  x = np.arange(len(y))
  total = float(y.sum())
  # Raw string: '\s' and '\_' are not valid Python escapes. Fixed-point
  # formatting also works for totals that print without a '.' (e.g. integers).
  legend = r'$\sum_{i}^{} AP\_res_{i}$ = ' + f'{total:.2f}'

  # Explicit figure/axes so consecutive calls never draw onto the same figure.
  fig, ax = plt.subplots()
  ax.plot(x, y, '-o')
  ax.set_xlabel('Res num')
  ax.set_ylabel('Aggregation')
  ax.legend([legend], loc='best')
  ax.set_title(plot_title)
  # A '/' in the description would otherwise be read as a directory separator.
  safe_name = str(plot_title).replace('/', '_')
  fig.savefig(f'{jobname}/{safe_name}.png')
  plt.show()
  plt.close(fig)
# Draw and save one figure per row of the results table.
row_count = len(df)
for row_number in range(row_count):
  plot_aggregation_vs_res(df, row_number, jobname)

In [None]:
#@title Download results

#@markdown Once this cell has been executed, the resulting files

#@markdown will be automatically downloaded to your computer, along with the input sequence(s).
import glob
import os
import zipfile

from google.colab import files

# Build <jobname>.zip from scratch and download it. The archive always contains
# results.xlsx; the checkboxes below add further files from the job directory.
archive_path = f'{jobname}.zip'
if os.path.exists(archive_path):
  os.remove(archive_path)
df.to_excel(f'{jobname}/results.xlsx')
archive_patterns = ['results.xlsx']
ADD_FASTA_FILES = False #@param {type:"boolean"}
if ADD_FASTA_FILES:
  archive_patterns.append('*.fasta')
ADD_plot_FILES = False #@param {type:"boolean"}
if ADD_plot_FILES:
  archive_patterns.append('*.png')

ADD_tango_res_FILES = False #@param {type:"boolean"}
if ADD_tango_res_FILES:
  archive_patterns.append('*.txt')

# zipfile instead of shelling out to rm/zip: jobname is free text typed by the
# user and was previously interpolated into shell commands unquoted.
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
  for pattern in archive_patterns:
    for path in sorted(glob.glob(os.path.join(glob.escape(jobname), pattern))):
      if os.path.isfile(path):
        # Store files at the archive root, as `cd <jobname>; zip ...` did.
        archive.write(path, arcname=os.path.basename(path))

files.download(archive_path)