## TDP43 preprocessing example

**Reference** : Bolognesi, B., Faure, A. J., Seuma, M., Schmiedel, J. M., Tartaglia, G. G., & Lehner, B. (2019, September 13). The mutational landscape of a prion-like domain. _Nature Communications_. Springer Science and Business Media LLC. [https://doi.org/10.1038/s41467-019-12101-z](https://doi.org/10.1038/s41467-019-12101-z)

In [1]:
# import modules
import os
import urllib
import urllib.request

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from openpyxl import Workbook

In [2]:
# Download the raw TDP-43 workbook from the mavenn GitHub repository.
url = "https://github.com/jbkinney/mavenn/blob/master/mavenn/examples/datasets/raw/tdp-43_raw.xlsx?raw=true"
excel_loc = "Data/rawdata.xlsx"        # where the downloaded workbook is stored
save_excel_loc = "Data/data.xlsx"      # where the processed workbook is written later

# urlretrieve does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(excel_loc), exist_ok=True)

# Kept as the last expression so the cell still displays (path, headers).
urllib.request.urlretrieve(url, excel_loc)

('Data/rawdata.xlsx', <http.client.HTTPMessage at 0x274706b3880>)

In [3]:
# Reference amino-acid sequence that every mutation code is applied to.
seq = "GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQREPNQAFGSGNNS"
# Unpack into a list of single residues so individual positions can be replaced.
ref = [*seq]

In [4]:
# Read each sheet of the raw workbook, drop the rows flagged as stop codons
# (STOP is not False) and keep only the measurement and mutation-code columns.
single_mut_df = (
    pd.read_excel(excel_loc, sheet_name="1 AA change")
    .loc[lambda df: df["STOP"].eq(False), ["toxicity", "sigma", "mut_code"]]
)
double_mut_df = (
    pd.read_excel(excel_loc, sheet_name="2 AA change")
    .loc[
        lambda df: df["STOP"].eq(False),
        ["toxicity_cond", "sigma_cond", "mut_code1", "mut_code2"],
    ]
)

In [5]:
# Preview the first five single-mutant rows to check the selected columns.
single_mut_df.head(n=5)

Unnamed: 0,toxicity,sigma,mut_code
0,0.03221,0.037438,G290N
1,-0.009898,0.038981,G290T
2,-0.010471,0.005176,G290R
3,0.030803,0.005341,G290S
4,-0.054716,0.035752,G290I


In [6]:
# Build one full-length mutant sequence per single amino-acid substitution.
# A mutation code reads <reference residue><position><mutant residue>
# (e.g. "G290N"); positions are numbered so that residue 290 is ref[0].
data_list = []  # (mutation code, toxicity, sigma) for every variant
seq_list = []   # mutant sequence for every variant, same order as data_list
for _, y, dy, mstr in single_mut_df.itertuples():
    pos = int(mstr[1:-1]) - 290  # 0-based index into ref

    # Reject out-of-range positions explicitly: a negative index would
    # otherwise silently wrap around to the end of the list.
    if not 0 <= pos < len(ref):
        raise ValueError(f"{mstr}: position is outside the reference sequence")

    # The residue named in the code must match the reference. Raise instead
    # of calling exit(), which shuts the notebook kernel down without saying
    # which row was inconsistent.
    if ref[pos] != mstr[0]:
        raise ValueError(
            f"{mstr}: code names residue {mstr[0]!r} but the reference has {ref[pos]!r}"
        )

    mutant = ref[:]  # copy so the reference itself is never modified
    mutant[pos] = mstr[-1]
    seq_list.append(''.join(mutant))
    data_list.append((mstr, y, dy))

In [7]:
# Preview the first five double-mutant rows to check the selected columns.
double_mut_df.head(n=5)

Unnamed: 0,toxicity_cond,sigma_cond,mut_code1,mut_code2
4458,0.045753,0.025523,G290A,N291D
4459,-0.010535,0.022647,G290A,N291H
4460,0.003815,0.019993,G290A,N291I
4461,0.029509,0.025855,G290A,N291K
4462,0.011349,0.016745,G290A,N291S


In [8]:
# Build one full-length mutant sequence per double amino-acid substitution and
# append it to the existing seq_list / data_list. Those lists must already
# exist when this cell runs, and re-running this cell appends the double
# mutants a second time.
# Mutation codes read <reference residue><position><mutant residue>, with
# residue 290 stored at ref[0].
for _, y, dy, mstr, mstr1 in double_mut_df.itertuples():
    mutant = ref[:]  # copy so the reference itself is never modified
    for code in (mstr, mstr1):
        pos = int(code[1:-1]) - 290  # 0-based index into ref

        # Reject out-of-range positions explicitly: a negative index would
        # otherwise silently wrap around to the end of the list.
        if not 0 <= pos < len(ref):
            raise ValueError(
                f"{mstr},{mstr1}: position in {code} is outside the reference sequence"
            )

        # Always validate against the untouched reference. Raise instead of
        # calling exit(), which shuts the notebook kernel down without saying
        # which row was inconsistent.
        if ref[pos] != code[0]:
            raise ValueError(
                f"{mstr},{mstr1}: {code} names residue {code[0]!r} "
                f"but the reference has {ref[pos]!r}"
            )

        mutant[pos] = code[-1]

    seq_list.append(''.join(mutant))
    # Record BOTH mutation codes. Storing only the first one gave every double
    # mutant that shares a first mutation the same label, which also collided
    # with the matching single mutant.
    # NOTE(review): ':' is an assumed separator - confirm it against whatever
    # consumes the exported labels.
    data_list.append((f"{mstr}:{mstr1}", y, dy))

In [9]:
# Export the collected (label, toxicity, sigma) records to a new workbook:
# a header in row 1, then one record per row starting at row 2.
wb = Workbook()
ws = wb.active

for col, title in enumerate(("aaMutations", "toxicity", "dtoxicity"), start=1):
    ws.cell(row=1, column=col).value = title

for row, record in enumerate(data_list, start=2):
    for col, field in enumerate(record, start=1):
        ws.cell(row=row, column=col).value = field

wb.save(filename=save_excel_loc)

In [None]:
# Wrap every mutant sequence in a SeqRecord whose id is the label stored at
# the same position in data_list. A comprehension keeps the loop variables
# local, so the notebook-level reference `seq` is no longer overwritten by
# the last mutant sequence.
seqrecord_list = [
    SeqRecord(seq=Seq(mutant_seq), id=label, name="", description="")
    for mutant_seq, (label, _y, _dy) in zip(seq_list, data_list)
]

with open("Data/data.fasta", "w") as f:
    SeqIO.write(seqrecord_list, f, "fasta")

# 290 is the residue number that the mutation codes map to ref[0].
# NOTE(review): presumably read downstream to convert positions - confirm.
with open("Data/index.txt", "w") as f:
    f.write(str(290))