In [1]:
# Necessary for Google Colab
# Uncomment before first run, and then comment all again

# Potrebno za Google Colab
# Odkomentarisati komande ispod pre prvog pokretanja, a zatim ponovo zakomentarisati


#! git clone https://github.com/AAnzel/Master_rad.git
#! mkdir ../data
#! mv Master_rad/data/UniProt\ cist ../data/
#! pip install biopython
#! mkdir ../data/Rezultati_Uniprot_cist


In [2]:
import os
import sys
import re
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis


#### Short introduction for data used in this thesis
* All data was downloaded from the official UniProt website (database)
* Data represents FASTA-formatted proteins for the organism Arabidopsis thaliana
* Downloaded data is represented in 2 different .csv files: 1. Positive (Pozitivni) contains all proteins with experimentally verified glycosylation existence, 2. Negative (Negativni) proteins with experimentally verified glycosylation non-existence
* That was done using website query created with 'Advanced' button (next to search box)


In [3]:
#######################################################################################################################
#### THIS CELL IS NOT USED! IT SHOULD ONLY BE USED IF IT IS IMPORTANT TO FIND THE EXACT POSITION OF GLYCOSYLATION #####

# Ova celija je namenjena za koriscenje samo onda kada je potrebno masinskim ucenjem naci poziciju unutar proteina
# gde ce se potencijalno desiti glikozilacija, ili vratiti da takva pozicija ne postoji
# Samim tim, ova celija za sad NIJE POTREBNA, jer mi je za sada cilj samo napraviti model koji ce odredjivati
# da ce se mozda desiti glikozilacija na proteinu ili ne, tj. resavam klasifikacioni problem.
# Prethodni problem sa nalazenjem pozicije predstavlja regresioni problem
#######################################################################################################################

# Walks through the positive proteins, extracts the glycosylation positions from the
# third column and saves a new file in which that column is replaced by a string of
# the form: position[, position]*
def _izvuci_pozicije (niska):
    """Return the positions found in one annotation string as 'p1, p2, ...'.

    The string is split on ';'. From every part the first whitespace-separated
    token that consists only of digits is taken as a position. Parts without any
    such token are skipped, so the result never contains empty entries.
    """
    pozicije = []

    for deo in niska.split (';'):
        brojevi = [int (rec) for rec in deo.split () if rec.isdigit ()]

        # Some parts of an entry carry no number at all; they contribute nothing
        if brojevi:
            pozicije.append (brojevi[0])

    return ', '.join (str (pozicija) for pozicija in pozicije)


def sredi_pozicije_glikozilacije():
    """Rewrite the positive-set table so its third column holds only positions.

    Reads 'EXCEL.csv' (tab-separated) from the 'Pozitivni' data folder, converts
    the third column into 'position[, position]*' strings and writes the columns
    'Entry', 'Entry name' and 'Glycosylation positions' to 'POZ_EXCEL_izmenjen.csv'
    in the same folder. Returns None.
    """
    putanja = os.path.join ('..', '..', 'data', 'UniProt cist', 'Pozitivni')

    df_poz = pd.read_csv (os.path.join (putanja, 'EXCEL.csv'), sep = '\t')
    kolone = df_poz.columns.values

    # The original code iterated over an undefined name here. The surrounding
    # comments state that the positions live in the third column, so that column
    # is used. Missing cells are treated as "no positions".
    niz_niski = df_poz[kolone[2]].fillna ('').astype (str)

    # One 'position[, position]*' string per protein, in row order
    lista_pozicija = [_izvuci_pozicije (niska) for niska in niz_niski]

    # Combine the first two identifying columns with the new positions column
    df_novi_poz = df_poz[['Entry', 'Entry name']].copy ()
    df_novi_poz['Glycosylation positions'] = lista_pozicija

    df_novi_poz.to_csv (os.path.join (putanja, 'POZ_EXCEL_izmenjen.csv'), sep = '\t')


# 1. Data curation
## 1.1 Eliminating all proteins that contain unknown (undefined) amino-acid (by traversing all FASTA sequences)

In [4]:
# Undefined amino-acids (or positions where any amino-acid may occur) are represented
# with the 'X' character. Sequences containing them have to be removed, together with
# the matching rows of the .csv table. In this project that only happens in the
# 'negative' set, but the function is generic and handles both sets.
def eliminisi_nepoznate (putanja, klasa):
    """Remove proteins whose sequence contains 'X' from the FASTA and CSV files.

    Parameters:
        putanja: directory that holds the input files and receives the outputs.
        klasa:   1 selects the 'POZ_*' files, any other value the 'NEG_*' files.

    The FASTA file is read once: records without 'X' are copied to
    'OB_<name>.fasta' ('OB' stands for "obradjeno", i.e. processed), while the
    zero-based ordinal numbers of the records with 'X' are collected and printed.
    The rows with those ordinal numbers are then dropped from the tab-separated
    '<name>_EXCEL.csv' and the result is saved as 'OB_<name>_EXCEL.csv'.

    Raises:
        ValueError: if the CSV table and the FASTA file do not contain the same
            number of proteins, because the positional drop would be meaningless.
    """
    prefiks = 'POZ' if klasa == 1 else 'NEG'

    ulaz_fasta = os.path.join (putanja, prefiks + '_FASTA.fasta')
    izlaz_fasta = os.path.join (putanja, 'OB_' + prefiks + '_FASTA.fasta')

    lista_rednih_br = []
    broj_zapisa = 0

    with open (ulaz_fasta, 'r') as fajl_ulaz, open (izlaz_fasta, 'w') as fajl_izlaz:
        for redni_br, fasta_niska in enumerate (SeqIO.parse (fajl_ulaz, 'fasta')):
            broj_zapisa = redni_br + 1

            # A sequence with an unknown amino-acid is remembered and not written out
            if 'X' in str (fasta_niska.seq):
                lista_rednih_br.append (redni_br)
                continue

            if SeqIO.write (fasta_niska, fajl_izlaz, 'fasta') != 1:
                # str() is required here: concatenating the int directly raised TypeError
                print ('Greska pri upisivanju sekvence ' + fasta_niska.id + ' na poziciji ' + str (broj_zapisa))

    # Printing the list for debugging purposes
    print (lista_rednih_br)

    # The same rows have to be removed from the .csv table
    df_obradjen = pd.read_csv (os.path.join (putanja, prefiks + '_EXCEL.csv'), sep = '\t')

    # Rows are matched to FASTA records purely by position, so both files must
    # describe the same number of proteins.
    # NOTE(review): this also assumes both files list the proteins in the same
    # order - confirm against how the files were exported.
    if len (df_obradjen) != broj_zapisa:
        raise ValueError (
            'CSV table has ' + str (len (df_obradjen)) + ' rows but the FASTA file has '
            + str (broj_zapisa) + ' records; cannot match them by position'
        )

    df_obradjen = df_obradjen.drop (lista_rednih_br).reset_index (drop = True)

    # Saving the processed DataFrame
    df_obradjen.to_csv (os.path.join (putanja, 'OB_' + prefiks + '_EXCEL.csv'), sep = '\t')


In [5]:
# Run the unknown-residue filter on both raw data sets (Positive and Negative).
# Each call prints the ordinal numbers of the removed proteins; two proteins of the
# Negative data set are expected to be removed.
poz_putanja, neg_putanja = (
    os.path.join ('..', '..', 'data', 'UniProt cist', podskup)
    for podskup in ('Pozitivni', 'Negativni')
)

for putanja_skupa, oznaka_klase in ((poz_putanja, 1), (neg_putanja, 0)):
    eliminisi_nepoznate (putanja_skupa, oznaka_klase)


[]
[8277, 8625]


## 1.2 Generating different physico-chemical attributes for each and every protein
BioPython is used to generate the first 7 attributes (molecular weight through secondary-structure fraction). The rest are generated following the manual at: https://github.com/SBRG/Protein_ML

In [6]:
# Physico-chemical features are computed from the whole FASTA sequence of every
# protein. One record (dict) is produced per protein and all records are then
# combined into a single DataFrame that represents one data set (Positive or
# Negative). The first goal is only to classify whether a protein is glycosylated;
# locating the exact positions (windows) is possible further work.

# N-linked glycosylation motif N-X-S/T, where X is any amino-acid except proline,
# followed by a residue that is not proline either. The whole pattern sits inside a
# zero-width lookahead so that a match does not consume the residues: adjacent or
# overlapping motifs (e.g. 'NGSNGTA') are all counted.
_MOTIV_GLIKOZILACIJE = re.compile (r'(?=N[^P][ST][^P])')

# Column name -> amino-acid letters whose shares are summed for that column
_GRUPE_AMINO_KISELINA = (
    ('Udeo alifatickih ak.', 'AGILPV'),
    ('Udeo nenaelektrisanih polarnih ak.', 'STNQ'),
    ('Udeo polarnih ak.', 'QNHSTYCMW'),
    ('Udeo hidrofobnih ak.', 'AGILPVF'),
    ('Udeo pozitivnih ak.', 'HKR'),
    ('Udeo sumpornih ak.', 'CM'),
    ('Udeo negativnih ak.', 'DE'),
    ('Udeo amidnih ak.', 'NQ'),
    ('Udeo alkoholnih ak.', 'ST'),
)


def _osobine_jedne_sekvence (sekvenca):
    """Return an ordered dict (column name -> value) of features for one sequence."""
    analiza_sekvence = ProteinAnalysis (sekvenca)
    osobine = {}

    osobine['Molekularna tezina'] = analiza_sekvence.molecular_weight ()
    osobine['Gravy'] = analiza_sekvence.gravy ()
    osobine['Aromaticnost'] = analiza_sekvence.aromaticity ()
    osobine['Indeks nestabilnosti'] = analiza_sekvence.instability_index ()

    # The amino-acid shares come back as a map, so each amino-acid gets its own column.
    # NOTE(review): newer Biopython releases reportedly deprecate this method in favour
    # of an attribute that uses a different scale - confirm before upgrading Biopython.
    udeo_amino_kiselina = analiza_sekvence.get_amino_acids_percent ()
    for ak, udeo in udeo_amino_kiselina.items ():
        osobine['Udeo "' + str (ak) + '" u proteinu'] = udeo

    osobine['Izoelektricna tacka'] = analiza_sekvence.isoelectric_point ()

    # The secondary structure fraction is a triple, so it is split into three columns
    trojka = analiza_sekvence.secondary_structure_fraction ()
    osobine['Udeo ak. u heliksu'] = trojka[0]
    osobine['Udeo ak. u zavoju'] = trojka[1]
    osobine['Udeo ak. u ravni'] = trojka[2]

    # Net charge: (K + R) minus (D + E) residue counts
    naelektrisanje = sekvenca.count ('K') + sekvenca.count ('R') - sekvenca.count ('D') - sekvenca.count ('E')
    osobine['Naelektrisanje'] = naelektrisanje
    osobine['Apsolutno naelektrisanje'] = abs (naelektrisanje)
    osobine['Prosecno naelektrisanje'] = naelektrisanje / len (sekvenca)
    osobine['Apsolutno prosecno naelektrisanje'] = abs (naelektrisanje / len (sekvenca))

    for ime_kolone, slova in _GRUPE_AMINO_KISELINA:
        osobine[ime_kolone] = sum ([udeo_amino_kiselina[ak] for ak in slova])

    # The motif is searched for in both classes: its presence does not imply that the
    # protein is glycosylated, negative proteins can contain it as well.
    broj_motiva = len (_MOTIV_GLIKOZILACIJE.findall (sekvenca))
    osobine['Postojanje glikozilacije'] = int (broj_motiva > 0)   # values 0 or 1
    osobine['Broj mesta glikozilacije'] = broj_motiva             # values 0, 1, 2, 3...

    return osobine


def napravi_osobine (putanja, indeksi, klasa):
    """Compute the feature table for one processed data set and save it as .csv.

    Parameters:
        putanja: directory containing 'OB_POZ_FASTA.fasta' / 'OB_NEG_FASTA.fasta';
                 the output file is written there as well.
        indeksi: row labels of the resulting DataFrame, one per FASTA record and
                 in the same order.
        klasa:   1 selects the 'POZ' files, any other value the 'NEG' files; the
                 value is also stored in the 'Klasa' column of every row.

    Returns:
        The DataFrame that was written (tab-separated) to 'POZ_FINALNO.csv' or
        'NEG_FINALNO.csv'.

    Raises:
        ValueError: if the FASTA file contains no records (also raised by pandas
            when the number of records differs from len(indeksi)).
    """
    ostatak_putanje = 'OB_POZ_FASTA.fasta' if klasa == 1 else 'OB_NEG_FASTA.fasta'

    with open (os.path.join (putanja, ostatak_putanje), 'r') as fajl_ulaz:
        zapisi = [
            _osobine_jedne_sekvence (str (fasta_niska.seq))
            for fasta_niska in SeqIO.parse (fajl_ulaz, 'fasta')
        ]

    if not zapisi:
        raise ValueError ('FASTA file ' + ostatak_putanje + ' contains no sequences')

    # Columns follow the order of the first record; every record must provide them all
    mapa_finalna = {
        kolona: [zapis[kolona] for zapis in zapisi]
        for kolona in list (zapisi[0].keys ())
    }
    mapa_finalna['Klasa'] = [klasa] * len (zapisi)

    # Creating the DataFrame
    df_finalni = pd.DataFrame (mapa_finalna, index = indeksi)

    # Saving the DataFrame in .csv format
    ime_izlaza = 'POZ_FINALNO.csv' if klasa == 1 else 'NEG_FINALNO.csv'
    df_finalni.to_csv (os.path.join (putanja, ime_izlaza), sep = '\t')

    return df_finalni


In [7]:
# Build the feature tables for both processed data sets.
# The 'Entry' column of each processed .csv table supplies the row labels.
poz_klasa, neg_klasa = 1, 0

poz_putanja = os.path.join('..', '..', 'data', 'UniProt cist', 'Pozitivni')
neg_putanja = os.path.join('..', '..', 'data', 'UniProt cist', 'Negativni')

df_poz = pd.read_csv(os.path.join(poz_putanja, 'OB_POZ_EXCEL.csv'), sep='\t')
df_neg = pd.read_csv(os.path.join(neg_putanja, 'OB_NEG_EXCEL.csv'), sep='\t')

poz_indeksi = df_poz['Entry'].values
neg_indeksi = df_neg['Entry'].values

# Creating the features for both the positive and the negative set
df_poz_finalni = napravi_osobine(poz_putanja, poz_indeksi, poz_klasa)
df_neg_finalni = napravi_osobine(neg_putanja, neg_indeksi, neg_klasa)


In [8]:
# Result for the positive data set: the cell's last expression displays the
# feature table returned by napravi_osobine for the positive proteins
df_poz_finalni

Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Postojanje glikozilacije,Broj mesta glikozilacije,Klasa
Q9SSS9,25668.2531,-0.049145,0.051282,42.751709,0.098291,0.000000,0.047009,0.055556,0.038462,0.034188,...,0.316239,0.465812,0.119658,0.021368,0.102564,0.119658,0.158120,1,1,1
O04151,48526.7256,-1.009412,0.105882,31.455082,0.072941,0.007059,0.120000,0.122353,0.037647,0.051765,...,0.258824,0.367059,0.148235,0.016471,0.242353,0.051765,0.105882,1,3,1
Q9LV16,77740.2127,-0.460206,0.091043,42.948752,0.046990,0.017621,0.048458,0.080764,0.042584,0.058737,...,0.320117,0.409692,0.168869,0.041116,0.129222,0.064611,0.138032,1,1,1
Q9SE50,60458.5342,-0.509659,0.140152,26.184129,0.053030,0.015152,0.070076,0.051136,0.058712,0.089015,...,0.329545,0.433712,0.157197,0.030303,0.121212,0.077652,0.098485,1,3,1
Q9SYQ8,107596.6100,0.019184,0.077551,30.612449,0.047959,0.014286,0.039796,0.057143,0.046939,0.087755,...,0.319388,0.497959,0.114286,0.035714,0.096939,0.089796,0.134694,1,16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9SU13,43447.8938,-0.076179,0.071960,38.811166,0.091811,0.007444,0.074442,0.029777,0.039702,0.059553,...,0.342432,0.459057,0.116625,0.029777,0.104218,0.076923,0.181141,1,4,1
Q3E9A4,53221.7554,-0.192275,0.100858,62.873433,0.060086,0.010730,0.036481,0.055794,0.049356,0.040773,...,0.334764,0.459227,0.156652,0.038627,0.092275,0.079399,0.122318,1,3,1
O80522,40476.1001,0.070000,0.105405,31.570270,0.062162,0.027027,0.045946,0.029730,0.059459,0.097297,...,0.345946,0.497297,0.108108,0.054054,0.075676,0.091892,0.127027,1,2,1
F4HTM3,97633.0234,-0.361620,0.119718,40.480423,0.036385,0.004695,0.061033,0.056338,0.049296,0.079812,...,0.330986,0.434272,0.143192,0.026995,0.117371,0.077465,0.130282,1,6,1


In [9]:
# Result for the negative data set: the cell's last expression displays the
# feature table returned by napravi_osobine for the negative proteins
df_neg_finalni

Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Postojanje glikozilacije,Broj mesta glikozilacije,Klasa
F4I443,79689.1581,-0.586975,0.061625,48.451849,0.070028,0.036415,0.061625,0.071429,0.025210,0.047619,...,0.359944,0.380952,0.156863,0.054622,0.133053,0.091036,0.147059,1,3,0
Q8RXD3,34806.9132,-0.550968,0.064516,54.291645,0.061290,0.029032,0.048387,0.116129,0.029032,0.051613,...,0.319355,0.406452,0.135484,0.048387,0.164516,0.083871,0.125806,0,0,0
P04778,28240.8071,0.020974,0.108614,24.637828,0.123596,0.003745,0.041199,0.056180,0.059925,0.123596,...,0.254682,0.565543,0.093633,0.029963,0.097378,0.052434,0.112360,0,0,0
Q9FLD5,60346.9487,-0.847276,0.095331,51.946342,0.042802,0.007782,0.062257,0.128405,0.042802,0.046693,...,0.284047,0.342412,0.200389,0.031128,0.190661,0.062257,0.120623,1,3,0
Q9M022,28049.1504,-0.364463,0.099174,52.350455,0.057851,0.041322,0.061983,0.070248,0.033058,0.045455,...,0.314050,0.404959,0.169421,0.061983,0.132231,0.057851,0.107438,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q8RXF8,72321.5750,-0.249228,0.081790,52.203410,0.086420,0.020062,0.066358,0.075617,0.046296,0.050926,...,0.280864,0.461420,0.126543,0.040123,0.141975,0.070988,0.123457,1,1,0
P93282,13248.7403,-0.405785,0.049587,44.333884,0.057851,0.016529,0.049587,0.049587,0.016529,0.016529,...,0.438017,0.371901,0.107438,0.041322,0.099174,0.066116,0.280992,1,1,0
Q9LJK3,34698.4268,0.412500,0.175676,31.229764,0.081081,0.020270,0.054054,0.043919,0.077703,0.040541,...,0.314189,0.489865,0.121622,0.057432,0.097973,0.030405,0.104730,0,0,0
Q9ASU8,38178.9715,0.116570,0.084302,43.015116,0.061047,0.031977,0.037791,0.063953,0.029070,0.072674,...,0.337209,0.479651,0.101744,0.072674,0.101744,0.063953,0.125000,0,0,0


In [10]:
# It can be clearly seen that there are many more proteins in the negative than in the positive data set (~7 times more)
# This is known as the problem of imbalanced classes

# Primetivo je da su klase nebalansirane, tj. imamo skoro 7 puta vise proteina koji su negativni
# Sa time se treba izboriti u delu treniranja modela masinskog ucenja


# Sledece treba vizualizovati podatke. Probati sa matricom korelacije
# Zatim se treba izboriti sa neizbalansiranoscu
# Na kraju natrenirati model i proveriti uspesnost


## 1.3 Visualizing data and attributes

In [11]:
# Load the saved feature tables of both data sets as the starting point for
# visualization; the first column of each file becomes the index.
koren_podataka = os.path.join('..', '..', 'data', 'UniProt cist')

poz_putanja_ulaza = os.path.join(koren_podataka, 'Pozitivni', 'POZ_FINALNO.csv')
neg_putanja_ulaza = os.path.join(koren_podataka, 'Negativni', 'NEG_FINALNO.csv')

df_poz, df_neg = (
    pd.read_csv(putanja_ulaza, sep='\t', index_col=0)
    for putanja_ulaza in (poz_putanja_ulaza, neg_putanja_ulaza)
)


In [12]:
# Stack the positive and the negative proteins into a single DataFrame.
# The original index labels are kept, and verify_integrity makes concat raise on
# duplicate labels - so a successful run also shows that the positive and the
# negative data sets do not overlap.
df_sve = pd.concat((df_poz, df_neg), verify_integrity=True)
df_sve


Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Postojanje glikozilacije,Broj mesta glikozilacije,Klasa
Q9SSS9,25668.2531,-0.049145,0.051282,42.751709,0.098291,0.000000,0.047009,0.055556,0.038462,0.034188,...,0.316239,0.465812,0.119658,0.021368,0.102564,0.119658,0.158120,1,1,1
O04151,48526.7256,-1.009412,0.105882,31.455082,0.072941,0.007059,0.120000,0.122353,0.037647,0.051765,...,0.258824,0.367059,0.148235,0.016471,0.242353,0.051765,0.105882,1,3,1
Q9LV16,77740.2127,-0.460206,0.091043,42.948752,0.046990,0.017621,0.048458,0.080764,0.042584,0.058737,...,0.320117,0.409692,0.168869,0.041116,0.129222,0.064611,0.138032,1,1,1
Q9SE50,60458.5342,-0.509659,0.140152,26.184129,0.053030,0.015152,0.070076,0.051136,0.058712,0.089015,...,0.329545,0.433712,0.157197,0.030303,0.121212,0.077652,0.098485,1,3,1
Q9SYQ8,107596.6100,0.019184,0.077551,30.612449,0.047959,0.014286,0.039796,0.057143,0.046939,0.087755,...,0.319388,0.497959,0.114286,0.035714,0.096939,0.089796,0.134694,1,16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q8RXF8,72321.5750,-0.249228,0.081790,52.203410,0.086420,0.020062,0.066358,0.075617,0.046296,0.050926,...,0.280864,0.461420,0.126543,0.040123,0.141975,0.070988,0.123457,1,1,0
P93282,13248.7403,-0.405785,0.049587,44.333884,0.057851,0.016529,0.049587,0.049587,0.016529,0.016529,...,0.438017,0.371901,0.107438,0.041322,0.099174,0.066116,0.280992,1,1,0
Q9LJK3,34698.4268,0.412500,0.175676,31.229764,0.081081,0.020270,0.054054,0.043919,0.077703,0.040541,...,0.314189,0.489865,0.121622,0.057432,0.097973,0.030405,0.104730,0,0,0
Q9ASU8,38178.9715,0.116570,0.084302,43.015116,0.061047,0.031977,0.037791,0.063953,0.029070,0.072674,...,0.337209,0.479651,0.101744,0.072674,0.101744,0.063953,0.125000,0,0,0


In [13]:
# Shuffle the rows so that positive and negative proteins are interleaved instead of
# all positives coming first. The effect is visible in the last column (Klasa).
# A fixed random_state makes the row order - and therefore the file saved later -
# identical on every Restart & Run All.
df_sve = df_sve.sample (frac = 1, random_state = 42)
df_sve


Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Postojanje glikozilacije,Broj mesta glikozilacije,Klasa
Q8GXB1,23245.1620,-0.469268,0.121951,33.818049,0.053659,0.009756,0.039024,0.068293,0.058537,0.087805,...,0.321951,0.434146,0.175610,0.024390,0.107317,0.073171,0.121951,1,1,0
Q9MBH1,18618.6711,-0.424848,0.096970,24.372121,0.048485,0.006061,0.072727,0.090909,0.030303,0.066667,...,0.315152,0.412121,0.133333,0.036364,0.163636,0.066667,0.121212,0,0,0
Q38954,61417.8054,0.541567,0.093697,27.341414,0.134583,0.013629,0.017036,0.025554,0.056218,0.095400,...,0.320273,0.574106,0.083475,0.040886,0.042589,0.057922,0.163543,1,2,0
P46286,27858.8027,-0.468992,0.058140,25.317054,0.108527,0.011628,0.031008,0.031008,0.031008,0.127907,...,0.255814,0.507752,0.228682,0.031008,0.062016,0.058140,0.085271,0,0,0
Q9LZ41,14331.0560,-0.728455,0.040650,37.244715,0.089431,0.000000,0.024390,0.056911,0.016260,0.016260,...,0.227642,0.398374,0.308943,0.016260,0.081301,0.073171,0.097561,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9ZUC1,40985.7353,0.043523,0.067358,30.304922,0.113990,0.007772,0.054404,0.056995,0.031088,0.072539,...,0.238342,0.531088,0.126943,0.015544,0.111399,0.051813,0.126943,1,2,0
Q9SQZ1,16035.5179,-0.572143,0.050000,49.127143,0.064286,0.000000,0.050000,0.078571,0.028571,0.057143,...,0.271429,0.400000,0.214286,0.050000,0.128571,0.064286,0.121429,0,0,0
Q9SGA0,48741.7777,-0.337413,0.057737,35.917136,0.055427,0.013857,0.073903,0.071594,0.025404,0.043880,...,0.327945,0.399538,0.152425,0.039261,0.145497,0.069284,0.161663,1,2,0
Q9SJQ1,73538.1400,-0.181101,0.056548,43.434970,0.058036,0.014881,0.049107,0.071429,0.029762,0.080357,...,0.291667,0.476190,0.133929,0.031250,0.120536,0.066964,0.144345,1,6,1


## Removal of proteins that don't contain N-glycosylation motif
### We are sure that N-glycosylation won't happen on those proteins

In [14]:
# Sanity check: the positive data set should not contain proteins without the
# aforementioned motif, i.e. no positive protein should have 0 in the
# 'Postojanje glikozilacije' column. A hit would mean the source set is not good.
#
# The column VALUES are compared explicitly: `0 in series` tests the index labels
# (here the protein identifiers), so it could never detect such a protein.
if (df_poz['Postojanje glikozilacije'] == 0).any ():
    print ('Pozitivan skup SADRZI protein koji nema mesto glikozilacije')
else:
    print ('Pozitivan skup NE SADRZI protein koji nema mesto glikozilacije')

# NOTE(review): re-run this cell and re-check the printed conclusion, because the
# previous version of the check always took the "NE SADRZI" branch.


Pozitivan skup NE SADRZI protein koji nema mesto glikozilacije


In [15]:
# Actual removal: keep only the proteins that have at least one glycosylation motif
# and display what is left. This is the core of the 2nd phase.
ima_motiv = df_sve['Postojanje glikozilacije'].ne(0)
df_sve = df_sve.loc[ima_motiv]
df_sve


Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Postojanje glikozilacije,Broj mesta glikozilacije,Klasa
Q8GXB1,23245.1620,-0.469268,0.121951,33.818049,0.053659,0.009756,0.039024,0.068293,0.058537,0.087805,...,0.321951,0.434146,0.175610,0.024390,0.107317,0.073171,0.121951,1,1,0
Q38954,61417.8054,0.541567,0.093697,27.341414,0.134583,0.013629,0.017036,0.025554,0.056218,0.095400,...,0.320273,0.574106,0.083475,0.040886,0.042589,0.057922,0.163543,1,2,0
Q9FF46,123340.1856,-0.164833,0.088368,40.256276,0.063120,0.033363,0.046889,0.049594,0.045086,0.074842,...,0.334536,0.448151,0.136159,0.061317,0.096483,0.083859,0.130748,1,10,0
Q66GR6,29727.5066,-0.245149,0.119403,39.953022,0.070896,0.003731,0.037313,0.055970,0.082090,0.070896,...,0.294776,0.485075,0.145522,0.011194,0.093284,0.070896,0.156716,1,1,0
Q9LUS2,119916.3526,-0.584022,0.053260,45.376584,0.071625,0.004591,0.068871,0.090909,0.027548,0.069789,...,0.308540,0.416896,0.134068,0.026630,0.159780,0.094582,0.142332,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9MAC8,87591.9120,-0.418561,0.087121,43.066540,0.078283,0.012626,0.064394,0.056818,0.030303,0.071970,...,0.329545,0.435606,0.142677,0.035354,0.121212,0.065657,0.142677,1,3,0
Q9ZUC1,40985.7353,0.043523,0.067358,30.304922,0.113990,0.007772,0.054404,0.056995,0.031088,0.072539,...,0.238342,0.531088,0.126943,0.015544,0.111399,0.051813,0.126943,1,2,0
Q9SGA0,48741.7777,-0.337413,0.057737,35.917136,0.055427,0.013857,0.073903,0.071594,0.025404,0.043880,...,0.327945,0.399538,0.152425,0.039261,0.145497,0.069284,0.161663,1,2,0
Q9SJQ1,73538.1400,-0.181101,0.056548,43.434970,0.058036,0.014881,0.049107,0.071429,0.029762,0.080357,...,0.291667,0.476190,0.133929,0.031250,0.120536,0.066964,0.144345,1,6,1


In [16]:
# Checking that the removal worked: no remaining row may have 0 in the
# 'Postojanje glikozilacije' column. The values are compared explicitly because
# `0 in series` would test the index labels instead of the column values.
if (df_sve['Postojanje glikozilacije'] == 0).any ():
    print ('Radnja NIJE uspesno izvrsena')
else:
    print ('Radnja JESTE uspesno izvrsena')

# The motif-existence column is now the same for every row, so it is dropped
df_sve = df_sve.drop (columns = ['Postojanje glikozilacije'])


Radnja JESTE uspesno izvrsena


In [17]:
# Final DataFrame to be used by the machine learning models: save it as a
# tab-separated file and display it.
sve_putanja_izlaza = os.path.join('..', '..', 'data', 'UniProt cist', 'SVE_FINALNO.csv')
df_sve.to_csv(sve_putanja_izlaza, sep='\t')
df_sve

Unnamed: 0,Molekularna tezina,Gravy,Aromaticnost,Indeks nestabilnosti,"Udeo ""A"" u proteinu","Udeo ""C"" u proteinu","Udeo ""D"" u proteinu","Udeo ""E"" u proteinu","Udeo ""F"" u proteinu","Udeo ""G"" u proteinu",...,Udeo nenaelektrisanih polarnih ak.,Udeo polarnih ak.,Udeo hidrofobnih ak.,Udeo pozitivnih ak.,Udeo sumpornih ak.,Udeo negativnih ak.,Udeo amidnih ak.,Udeo alkoholnih ak.,Broj mesta glikozilacije,Klasa
Q8GXB1,23245.1620,-0.469268,0.121951,33.818049,0.053659,0.009756,0.039024,0.068293,0.058537,0.087805,...,0.195122,0.321951,0.434146,0.175610,0.024390,0.107317,0.073171,0.121951,1,0
Q38954,61417.8054,0.541567,0.093697,27.341414,0.134583,0.013629,0.017036,0.025554,0.056218,0.095400,...,0.221465,0.320273,0.574106,0.083475,0.040886,0.042589,0.057922,0.163543,2,0
Q9FF46,123340.1856,-0.164833,0.088368,40.256276,0.063120,0.033363,0.046889,0.049594,0.045086,0.074842,...,0.214608,0.334536,0.448151,0.136159,0.061317,0.096483,0.083859,0.130748,10,0
Q66GR6,29727.5066,-0.245149,0.119403,39.953022,0.070896,0.003731,0.037313,0.055970,0.082090,0.070896,...,0.227612,0.294776,0.485075,0.145522,0.011194,0.093284,0.070896,0.156716,1,0
Q9LUS2,119916.3526,-0.584022,0.053260,45.376584,0.071625,0.004591,0.068871,0.090909,0.027548,0.069789,...,0.236915,0.308540,0.416896,0.134068,0.026630,0.159780,0.094582,0.142332,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9MAC8,87591.9120,-0.418561,0.087121,43.066540,0.078283,0.012626,0.064394,0.056818,0.030303,0.071970,...,0.208333,0.329545,0.435606,0.142677,0.035354,0.121212,0.065657,0.142677,3,0
Q9ZUC1,40985.7353,0.043523,0.067358,30.304922,0.113990,0.007772,0.054404,0.056995,0.031088,0.072539,...,0.178756,0.238342,0.531088,0.126943,0.015544,0.111399,0.051813,0.126943,2,0
Q9SGA0,48741.7777,-0.337413,0.057737,35.917136,0.055427,0.013857,0.073903,0.071594,0.025404,0.043880,...,0.230947,0.327945,0.399538,0.152425,0.039261,0.145497,0.069284,0.161663,2,0
Q9SJQ1,73538.1400,-0.181101,0.056548,43.434970,0.058036,0.014881,0.049107,0.071429,0.029762,0.080357,...,0.211310,0.291667,0.476190,0.133929,0.031250,0.120536,0.066964,0.144345,6,1
