# AAWindow - small labelling application for amino acid sequence datasets

In [1]:
# standard imports 
import pandas as pd
import numpy as np
import AA_window as aaw
from StandardConfig import find_folderpath

In [2]:
# Resolve the project folder and the path separator once; both are echoed below for inspection.
folder_and_sep = find_folderpath()
path, sep = folder_and_sep
(path, sep)

('/home/freiherr/PycharmProjects/AAwindow', '/')

In [3]:
# Load the annotation table (sheet "Preds") and index its rows by the "entry" column.
annotations_raw = pd.read_excel("all_Annots_N_out.xlsx", sheet_name="Preds")
example_data = annotations_raw.set_index("entry")
example_data.head(5)

Unnamed: 0_level_0,annotation_software,name,gene_name,organism,dataset,len_sequence,protein_domains,protein_category,len_signal,len_ecto,len_TMD,len_endo,start_pos_TMD,stop_pos_TMD,jmd_n,tmd,jmd_c,top_prediction (Annot. AI / DB),subcellular_location
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
P48551,UniProt,INAR2_HUMAN,IFNAR2,HUMAN,SUBEXPERT,515.0,"['S', 'o', 'M', 'i']",ssTMD N-out (signal peptide),26.0,217.0,21.0,251.0,244.0,264.0,QESESAESAK,IGGIITVFLIALVLTSTIVTL,KWIGYICLR,2/8,Membrane;PM;Secreted
P48551,Phobius,INAR2_HUMAN,IFNAR2,HUMAN,SUBEXPERT,515.0,"['S', 'o', 'M', 'i']",ssTMD N-out (signal peptide),,,25.0,,244.0,268.0,QESESAESAK,IGGIITVFLIALVLTSTIVTLKWIG,YICLRNSLP,,
P48551,TMHMM,INAR2_HUMAN,IFNAR2,HUMAN,SUBEXPERT,515.0,"['S', 'o', 'M', 'i']",ssTMD N-out (signal peptide),,,23.0,,7.0,29.0,MLLSQN,AFIFRSLNLVLMVYISLVFGISY,DSPDYTDES,,
P48551,PolyPhobius,INAR2_HUMAN,IFNAR2,HUMAN,SUBEXPERT,515.0,"['S', 'o', 'M', 'i']",ssTMD N-out (signal peptide),26.0,216.0,22.0,251.0,243.0,265.0,GQESESAESA,KIGGIITVFLIALVLTSTIVTLK,WIGYICLRNS,,
P48551,SPOCTOPUS,INAR2_HUMAN,IFNAR2,HUMAN,SUBEXPERT,515.0,"['S', 'o', 'M', 'i']",ssTMD N-out (signal peptide),30.0,214.0,21.0,250.0,245.0,266.0,ESESAESAKI,GGIITVFLIALVLTSTIVTLKW,IGYICLRNSL,,


## Generate the base label version

### For this, we pick one specific annotation (here the "arithmetic mean") from which the positive and negative labels are generated

In [4]:
# Keep only the "arithmetic mean" annotation rows, reduced to the TMD start/stop position columns.
is_arithmetic_mean = example_data["annotation_software"] == "arithmetic mean"
example_data_am = example_data.loc[is_arithmetic_mean, ["start_pos_TMD", "stop_pos_TMD"]]
example_data_am.head(5)

Unnamed: 0_level_0,start_pos_TMD,stop_pos_TMD
entry,Unnamed: 1_level_1,Unnamed: 2_level_1
P48551,244.0,266.0
Q86YL7,129.0,152.0
P35590,762.0,785.0
P09803,711.0,733.0
P19022,724.0,745.0


In [5]:
# The labelling algorithm also needs the IDs ("name" column) and the sequences,
# so load them from the second workbook, indexed by "entry" as well.
uniprot_table = pd.read_excel("UniProt_all_Nout_filtered.xlsx")
get_sequences = uniprot_table.set_index("entry")[["name", "sequence"]]
get_sequences.head(5)

Unnamed: 0_level_0,name,sequence
entry,Unnamed: 1_level_1,Unnamed: 2_level_1
Q03157,APLP1_MOUSE,MGPTSPAARGQGRRWRPPPLPLLLPLSLLLLRAQLAVGNLAVGSPS...
Q06481,APLP2_HUMAN,MAATGTAAAAATGRLLLLLLVGLTAPALALAGYIEALAANAGTGFA...
P05067,A4_HUMAN,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
P30530,UFO_HUMAN,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...
P35613,BASI_HUMAN,MAAALFVLLGFALLGTHGASGAAGFVQAPLSQQRWVGGSVELHCEA...


In [6]:
# Place both frames side by side (axis=1), aligned on their shared "entry" index,
# then drop every row that is left with a missing value in any column.
frames_to_fuse = [example_data_am, get_sequences]
example_data_am_seq = pd.concat(frames_to_fuse, axis=1).dropna()
example_data_am_seq.head(5)

Unnamed: 0_level_0,start_pos_TMD,stop_pos_TMD,name,sequence
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P48551,244.0,266.0,INAR2_HUMAN,MLLSQNAFIFRSLNLVLMVYISLVFGISYDSPDYTDESCTFKISLR...
Q86YL7,129.0,152.0,PDPN_HUMAN,MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA...
P35590,762.0,785.0,TIE1_HUMAN,MVWRVPPFLLPILFLASHVGAAVDLTLLANLRLTDPQRFFLTCVSG...
P09803,711.0,733.0,CADH1_MOUSE,MGARCRSFSALLLLLQVSSWLCQELEPESCSPGFSSEVYTFPVPER...
P19022,724.0,745.0,CADH2_HUMAN,MCRIAGALRTLLPLLAALLQASVEASGEIALCKTGFPEDVYSAVLS...


### generate the labels using AA_window.get_aa_window_df()

In [7]:
# Build the base label table from the "start_pos_TMD" annotation, using a window size of 4.
test_labels = aaw.get_aa_window_df(
    window_size=4,
    df=example_data_am_seq,
    column_id="name",
    column_seq="sequence",
    column_aa_position="start_pos_TMD",
    start_pos=True,
)
test_labels.head(21)

Unnamed: 0_level_0,window_left,window_right,label,start_pos_TMD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INAR2_HUMAN__0,ESAK,IGGI,1,244
INAR2_HUMAN__-1,AESA,KIGG,0,243
INAR2_HUMAN__1,SAKI,GGII,0,245
INAR2_HUMAN__-2,SAES,AKIG,0,242
INAR2_HUMAN__2,AKIG,GIIT,0,246
INAR2_HUMAN__-3,ESAE,SAKI,0,241
INAR2_HUMAN__3,KIGG,IITV,0,247
PDPN_HUMAN__0,GLST,VTLV,1,129
PDPN_HUMAN__-1,DGLS,TVTL,0,128
PDPN_HUMAN__1,LSTV,TLVG,0,130


### Now only the unshifted position (ID suffix `__0`) has a positive label (=1); any window shifted away from the original annotation gets a negative label (=0)

### Using the original DataFrame (which holds all annotations), the set of accepted positive labels can now be expanded

In [8]:
# Re-label the windows by comparing them against the full annotation table, with a threshold of 2.
test_labels_modified = aaw.modify_label_by_ident_column(
    df_label=test_labels,
    df_compare=example_data,
    column_id="name",
    threshold=2,
)
test_labels_modified

Unnamed: 0_level_0,window_left,window_right,label,start_pos_TMD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INAR2_HUMAN__0,ESAK,IGGI,1,244
INAR2_HUMAN__-1,AESA,KIGG,1,243
INAR2_HUMAN__1,SAKI,GGII,0,245
INAR2_HUMAN__-2,SAES,AKIG,0,242
INAR2_HUMAN__2,AKIG,GIIT,0,246
...,...,...,...,...
KLOTB_HUMAN__1,KKPL,IFLG,0,998
KLOTB_HUMAN__-2,LVQK,KPLI,0,995
KLOTB_HUMAN__2,KPLI,FLGC,0,999
KLOTB_HUMAN__-3,FLVQ,KKPL,0,994


### Here is the label slice of a single protein (TIE1_HUMAN): with the threshold set to 2, the shifted positions "-1", "1" and "2" were also accepted as positive labels

In [9]:
# Rows 14-20 by position: the seven windows of the third protein in the table.
test_labels_modified.iloc[14:21]

Unnamed: 0_level_0,window_left,window_right,label,start_pos_TMD
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TIE1_HUMAN__0,DQQL,ILAV,1,762
TIE1_HUMAN__-1,LDQQ,LILA,1,761
TIE1_HUMAN__1,QQLI,LAVV,1,763
TIE1_HUMAN__-2,GLDQ,QLIL,0,760
TIE1_HUMAN__2,QLIL,AVVG,1,764
TIE1_HUMAN__-3,EGLD,QQLI,0,759
TIE1_HUMAN__3,LILA,VVGS,0,765


### Export DataFrame as Excel file

In [10]:
# Optional export of the modified labels to Excel.
# Off by default so that "Restart & Run All" never overwrites a previously exported file;
# set EXPORT_LABELS = True to write the workbook into the "Output" folder.
EXPORT_LABELS = False
if EXPORT_LABELS:
    test_labels_modified.to_excel(f"Output{sep}start_pos_labels_annot_thresh_2_ssTMD_N_out.xlsx")

### Analyze label DataFrames (positive labels)

In [11]:
# Positive-label summary for the base labels (test_labels).
# NOTE(review): this reads a `describe` attribute from the function object itself instead of
# computing anything from `test_labels` — presumably it is set as a side effect of the
# get_aa_window_df() call above; confirm in AA_window, since the value would then depend on
# which call ran last.
aaw.get_aa_window_df.describe

average_positive    1.0 / 7
min                   1 / 7
max                   1 / 7
ID_count               2918
dtype: object

In [12]:
# Positive-label summary for the expanded labels (test_labels_modified).
# NOTE(review): this reads a `describe` attribute from the function object itself instead of
# computing anything from `test_labels_modified` — presumably it is set as a side effect of the
# modify_label_by_ident_column() call above; confirm in AA_window, since the value would then
# depend on which call ran last.
aaw.modify_label_by_ident_column.describe

average_positive    1.94 / 7
min                    1 / 7
max                    4 / 7
ID_count                2918
dtype: object