# DebruijnExtend Dataset Analysis
This python notebook can be used to perform an analysis of the the datasets being used to train and test.

In [1]:
#imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# in-house
import sys
import os
sys.path.append(os.path.dirname("../../src"))
from src.debruijnextend.csvtohash import ProteinHash
import pickle

In [2]:
# PATHS
dataset = "../../data/primary2secondary.csv"
csv_column_names = ['sequence length', 'PDB name', 'Proten Sequence', '8 char', '3 char', '1 char']
seq_length_column = 0
pdb_name_column = 1
protein_column = 2
secondary_column = 4 


In [3]:
df = pd.read_csv(dataset, header=None, usecols=[seq_length_column,
                                                pdb_name_column, 
                                                protein_column, 
                                                secondary_column])
df.columns = ['sequence length', 'PDB name', 'Proten Sequence', '3 char']

In [4]:
# assort by length, drop duplicates proteins/chains (by PDB name)
df_sorted = df.sort_values(by = 'sequence length', ascending = False)
df_unique = df_sorted.drop_duplicates(subset = ["PDB name"])
df_unique = df_unique.drop_duplicates()
df_unique.duplicated(keep=False).sum()


0

In [5]:
print("Before filtering:")
df["sequence length"].describe()

Before filtering:


count    393732.000000
mean        260.212634
std         196.864409
min           3.000000
25%         131.000000
50%         223.000000
75%         336.000000
max        5037.000000
Name: sequence length, dtype: float64

In [6]:
print("After filtering:")
df_unique["PDB name"]
df_unique["sequence length"].describe()

After filtering:


count    139496.000000
mean        297.846863
std         209.231296
min           5.000000
25%         157.000000
50%         261.000000
75%         372.000000
max        5037.000000
Name: sequence length, dtype: float64

In [40]:
# output sequences into a fasta file.
def csv_to_fasta(panda_df, fasta_out):
    """
    Turns a pandas DF into a fasta file with
    sequences. 
    """
    with open(fasta_out, "w") as fasta_file:
        for index, row in panda_df.iterrows():
            sequence = row['Proten Sequence'].strip("'")
            name = row['PDB name'].strip("'")
            fasta_file.write(f">{name}\n{sequence}\n")

# create output file
csv_to_fasta(df_unique, "unique_proteins.fa")


## use CD-HIT to create clusters based on identities. 

Make sure to only run the following cell once as it takes time to run CD-HIT. This clusters the testing/training set based on different identity thresholds. 

In [52]:
#!cd-hit -i unique_proteins.fa -o unique_100.fa -c 1
# !cd-hit -i unique_proteins.fa -o unique_95.fa -c 0.95
# !cd-hit -i unique_proteins.fa -o unique_90.fa -c 0.9
# !cd-hit -i unique_proteins.fa -o unique_85.fa -c 0.85
# !cd-hit -i unique_proteins.fa -o unique_80.fa -c 0.80
# !cd-hit -i unique_proteins.fa -o unique_75.fa -c 0.75
# !cd-hit -i unique_proteins.fa -o unique_70.fa -c 0.70
# !cd-hit -i unique_proteins.fa -o unique_65.fa -c 0.65

Program: CD-HIT, V4.8.1, Apr 07 2021, 02:35:32
Command: cd-hit -i unique_proteins.fa -o unique_100.fa -c 1

Started: Tue Jan 11 23:45:40 2022
                            Output                              
----------------------------------------------------------------
Discarding invalid sequence or sequence without identifier and description!

>4UWE
MGDGGEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNRLCFLEPTSNAQNVPPDLAICCFTLEQSLSVRALQEMLANTVEAGVESSQGGGHRTLLYGHAILLRHAHSRMYLSCLTTSRSMTDKLAFDVGLQEDATGEACWWTMHPASKQRSEGEKVRVGDDLILVSVSSERYLHLSTASGELQVDASFMQTLWNMNPICSCCEEGYVTGGHVLRLFHGHMDECLTISAADSDDQRRLVYYEGGAVCTHARSLWRLEPLRISWSGSHLRWGQPLRIRHVTTGRYLALTEDQGLVVVDACKAHTKATSFCFRVSKEKLDTAPKRDVEGMGPPEIKYGESLCFVQHVASGLWLTYAAPDPKALRLGVLKKKAILHQEGHMDDALFLTRCQQEESQAARMIHSTAGLYNQFIKGLDSFSGKPRGSGPPAGPALPIEAVILSLQDLIGYFEPPSEELQHEEKQSKLRSLRNRQSLFQEEGMLSLVLNCIDRLNVYTTAAHFAEYAGEEAAESWKEIVNLLYELLASLIRGN****************************************************************************************************************

Discarding invalid sequence or sequence without identifier and description!

>5B16
MAKEPEETMPDKNEEEEEELLKPVWIRCTHSENYYSSDPMDQVGDSTVVGTSRLRDLYDKFEEELGSRQEK***************************************************************LWYNDPGQMNDGPLCKCSAKARRTGIRHSIYPGEEAIKPCRPMTNNAGRLFHYRITVSPPTNFLTDRPTVIEYDDHEYIFEGFSMFAHAPLTNIPLCKVIRFNIDYTIHFIEEMMPENFCVKGLELFSLFLFRDILELYDWNLKGPLFEDSPPCCPRFHFMPRFVRFLPDGGKEVLSMHQILLYLLRCSK******************************************************************************************************************************************TGIRSDVCQHAMMLPVLTHHIRYHQCLMHLDKLIGYTFQDRCLLQLAMTHPSHHLNFGMNPDHARNSLSNCGIRQPKYGDRKVHHMHMRKKGINTLINIMSRLGQDDPTPSRINHNERLEFLGDAVVEFLTSVHLYYLFPSLEEGGLATYRTAIVQNQHLAMLAKKLELDRFMLYAHGPDLCRESDLRHAMANCFQALIGAVYLEGSLEEAKQLFGRLLFNDPDLREVWLNYPLHPLQLQEPNTDRQLIETSPVLQKLTEFEEAIGVIFTHVRLLARAFTLRTVGFNHLTLGHNQRMEFLGDSIMQLVATEYLFIHFPDHHEGHLTLLRSSLVNNRTQAKVAEELGMQEYAITNDKTKRPVALRTKTLADLLQSFIAALYIDKDLEYVHTFMNVCFFPRLKEFILNQDWNDPKSQLQQCCLTLRTEGKEPDIPLYKTLQTVGPSHARTYTVA

Discarding invalid sequence or sequence without identifier and description!

>5I87
**************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
Discarding invalid sequence or sequence without identifier and description!

>2IV2
MKKVVTVCPYCASGCKINLVVDNGKIVRAEAAQGKTNQGTLCLKGYYGWDFINDTQILTPRLKTPMIRRQRGGKLEPVSWDEALNYVAERLSAIKEKYGPDAIQ

Discarding invalid sequence or sequence without identifier and description!

>5A8R
MDEKKLFLKALKKKFEGEDPDEKYTNFYCFGGWEQSARKKEFTEYAKKAAEKRGGIPFYNPDIGVPLGQRKLMAYRVSGTDAYVEGDDLHFVNNAAIQQMVDDIKRTVIVGMDTAHAVLEKRLGVEVTPETINEYMEAINHALPGGAVVQEHMVEVHPGLVEDCYAKIFTGDDNLADELDKRILIDINKEFPEEQAEQLKSYIGNRTYQVNRVPTIVVRTCDGGTVSRWSAMQIGMSFISAYKLCAGEAAIADFSYAAKHADVIEMGTIMPARRARGPNEPGGVAFGTFADIVQTSRVSDDPANVSLEVIAGAAALYDQVWLGSYMSGGVGFTQYATAAYTDDILDDFVYYGMEYVDDKYGICGTKPTMDVVRDISTEVTLYSLEQYEEYPTLLEDHFGGSQRAAVAAAAAGCSTAFATGNSNAGINGWYLSQILHKEAHSRLGFYGYDLQ*QCGASNSLSIRSDEGLIHELRGPNYPNYAMNVGHQPEYAGIAQAPHAARGDAFCTNPLIKVAFADKDLAFDFTSPRKSIAAGALREFMPEGERDLIIPAGK
Discarding invalid sequence or sequence without identifier and description!

>5G0R
MADKLFINALKKKFEESPEEKKTTFYTLGGWKQSERKTEFVNAGKEVAAKRGIPQYNPDIGTPLGQRVLMPYQVSTTDTYVEGDDLHFVNNAAMQQMWDDIRRTVIVGLNHAHAVIEKRLGKEVTPETITHYLETVNHAMPGAAVVQEHMVETHPALVADSYVKVFTGNDEIADEIDPAFVIDINKQFPEDQAETLKAEVGDGIWQVVRIPTIVSRTCDGATTSRWSAMQIGMSMISAYKQAAGEAATGDFAYAAKHAEVIHMGTYLPVRRAR

Discarding invalid sequence or sequence without identifier and description!

>1BLL
*TKGLVLGIYSKEKEEDEPQFTSAGENFNKLVSGKLREILNISGPPLKAGKTRTFYGLHEDFPSVVVVGLGKKTAGIDEQENWHEGKENIRAAVAAGCRQIQDLEIPSVEVDPCGDAQAAAEGAVLGLYEYDDLKQKRKVVVSAKLHGSEDQEAWQRGVLFASGQNLARRLMETPANEMTPTKFAEIVEENLKSASIKTDVFIRPKSWIEEQEMGSFLSVAKGSEEPPVFLEIHYKGSPNASEPPLVFVGKGITFDSGGISIKAAANMDLMRADMGGAATICSAIVSAAKLDLPINIVGLAPLCENMPSGKANKPGDVVRARNGKTIQVDNTDAEGRLILADALCYAHTFNPKVIINAATLTGAMDIALGSGATGVFTNSSWLWNKLFEASIETGDRVWRMPLFEHYTRQVIDCQLADVNNIGKYRSAGACTAAAFLKEFVTHPKWAHLDIAGVMTNKDEVPYLRKGMAGRPTRTLIEFLFRFSQDSA
Discarding invalid sequence or sequence without identifier and description!

>3J1O
QEQFVKRRRDMLEHINLAMNESSLALEFVSLLLSSVKESTGMSSMSPFLRKVVKPSSLNSDKIPYVAPTKKEYIELDILNKGWKLQSLNESKDLLRASFNKLSSILQNEHDYWNKIMQSISNKDVIFKIRDRTSGQKLLAIKYGYEDSGSTYKHDRGIANIRNNIESQNLDLIPHSSSVFKGTDFVHSVKKFLRVRIFTKIESEDDYILSGESVMDRDSESEEAETKDIRKQIQLLKKIIFEKELMYQIKKECALLISYGVSIENENKVIIELPNEKFEIELLSLDDDSIVNHEQDLPKINDKRANLMLVMLRLLLVVIFKKTLRSRISSPHGLINLN

Discarding invalid sequence or sequence without identifier and description!

>4KUF
MGSSHHHHHHSSGLVPRGSHMPFVNKQFNYKDPVNGVDIAYIKIPNAGQMQPVKAFKIHNKIWVIPERDTFTNPEEGDLNPPPEAKQVPVSYYDSTYLSTDNEKDNYLKGVTKLFERIYSTDLGRMLLTSIVRGIPFWGGSTIDTELKVIDTNSINVIQPDGSYRSEELNLVIIGPSADIIQFE*KSFGHEVLNLTRNGYGSTQYIRFSPDFTFGFEESLEVDTNPLLGAGKFATDPAVTLAHELIHAGHRLYGIAINPNRVFKVNTNAYYEMSGLEVSFEELRTFGGHDAKFIDSLQENEFRLYYYNKFKDIASTLNKAKSIVGTTASLQYMKNVFKEKYLLSEDTSGKFSVDKLKFDKLYKMLTEIYTEDNFVKFFKVLNRKTYLNFDKAVFKINIVPKVNYTIYDGFNLRNTNLAANFNGQNTEINNMNFTKLKNFTGLFEF
Discarding invalid sequence or sequence without identifier and description!

>4ELC
MGSSHHHHHHSSGLVPRGSHMPFVNKQFNYKDPVNGVDIAYIKIPNAGQMQPVKAFKIHNKIWVIPERDTFTNPEEGDLNPPPEAKQVPVSYYDSTYLSTDNEKDNYLKGVTKLFERIYSTDLGRMLLTSIVRGIPFWGGSTIDTELKVIDTNSINVIQPDGSYRSEELNLVIIGPSADIIQFE*KSFGHEVLNLTRNGYGSTQYIRFSPDFTFGFEESLEVDTNPLLGAGKFATDPAVTLAHELIHAGHRLYGIAINPNRVFKVNTNAYYEMSGLEVSFEELRTFGGHDAKFIDSLQENEFRLYYYNKFKDIASTLNKAKSIVGTTASLQYMKNVFKEKYLLSEDTSGKFSVDKLKFDKLYKMLTEIYTEDNFVKFFKVL

Discarding invalid sequence or sequence without identifier and description!

>3J9S
*MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEFQTGGIGNLPIRNWNFDFGLLGTTLLNLDANYVETARNTIDYFVDFVDNVCMDEMVRESQRNGIAPQSDSLIKLSGIKFKRINFDNSSEYIENWNLQNRRQRTGFTFHKPNIFPYSASFTLNRSQPAHDNLMGTMWLNAGSEIQVAGFDYSCAINAPANTQQFEHIVQLRRVLTTATITLLPDAERFSFPRVITSADGATTWYFNPVILRPNNVEIEFLLNGQIINTYQARFGTIIARNFDTIRLSFQLMRPPNMTPAVAALFPNAQPFEHHATVGLTLRIESAVCESVLADASETMLANVTSVRQEYAIPVGPVFPPGMNWTDLITNYSPSREDNLQRVFTVASIRSMLVK
Discarding invalid sequence or sequence without identifier and description!

>1QHD
*MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEFQTGGIGNLPIRNWNFDFGLLGTTLLNLDANYVETARNTIDYFVDFVDNVCMDEMVRESQRNGIAPQSDSLIKLSGIKFKRINFDNSSEYIENWNLQNRRQRTGFTFHKPNIFPYSASFTLNRSQPAHDNLMGTMWLNAGSEIQVAGFDYSCAINAPANTQQFEHIVQLRRVLTTATITLLPDAERFSFPRVITSADGATTWYFNPVILRPNNVEIEFLLNGQIINTYQARFGTIIARNFDTIRLSFQLMRPPNMTPAVAALFPNAQPFEHHATVGLTLRIESAVCESVLADASETMLANVTSVRQEYAIPVGPVFPPGMNWTDLITNYSPSREDNLQRVFTVASIRSMLVK
Discarding invalid sequence 

Discarding invalid sequence or sequence without identifier and description!

>2NN6
MGSSHHHHHHSQDPNSHMKETPLSNCERRFLLRAIEEKKRLDGRQTYDYRNIRISFGTDYGCCIVELGKTRVLGQVSCELVSPKLNRATEGILFFNLELSQMAAPAFEPGRQSDLLVKLNRLMERCLRNSKCIDTESLCVVAGEKVWQIRVDLHLLNHDGNIIDAASIAAIVALCHFRRPDVSVQGDEVTLYTPEERDPVPLSIHHMPICVSFAFFQQGTYLLVDPNEREERVMDGLLVIAMNKHREICTIQSSGGIMLLKDQVLRCSKIAGVKVAEITELILKALENDQKVRKEGGKFGFAESIANQRITAFKMEKAP**************************************
Discarding invalid sequence or sequence without identifier and description!

>5FEX
MWSHPQFEKASTGREILEKLERREFTREVLKEALSINDRGFNEALFKLADEIRRKYVGDEVHIRAIIEFSNVCRKNCLYCGLRRDNKNLKRYRMTPEEIVERARLAVQFGAKTIVLQSGEDPY*MPDVISDIVKEIKKMGVAVTLSLGEWPREYYEKWKEAGADRYLLRHETANPVLHRKLRPDTSFENRLNCLLTLKELGYETGAGSMVGLPGQTIDDLVDDLLFLKEHDFDMVGIGPFIPHPDTPLANEKKGDFTLTLKMVALTRILLPDSNIPATTAMGTIVPGGREITLRCGANVIMPNWTPSPYRQLYQLYPGKISVFEKDTASIPSVMKMIELLGRKPGRDWGGRKRVFETV
Discarding invalid sequence or sequence without identifier and description!

>5FES
MWSHPQFEKASTGREILEKLERR

Discarding invalid sequence or sequence without identifier and description!

>3LZB
GEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMRKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQG**************************************
Discarding invalid sequence or sequence without identifier and description!

>6FK6
*MCGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSCFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKN
Discarding invalid sequence or sequence without identifier and description!

>2XO2
*AQVLRGTVTDFPGFDERADAETLRKA*KGLGTDEESILTLLTSRSNAQRQEISAAFKTLFGRDLLDDLKSELTGKFEKLIVAL*KPSRLYDAYEL

Discarding invalid sequence or sequence without identifier and description!

>1UN1
*ETAAFAALRKPVDVAFGRNYVPTWAFDHIKYFNGGNEIQLHLDKYTGTGFQSKGSYLFGHFSMQMKLVPGDSAGTVTAFYLSSQNSEHDEIDFEFLGNRTGQPYILQTNVFTGGKGDREQRIYLWFDPTKEFHYYSVLWNMYMIVFLVDDVPIRVFKNCKDLGVKFPFNQPMKIYSSLWNADDWATRGGLEKTDWSKAPFIASYRSFHIDGCEASVEAKFCATQGARWWDQKEFQDLDAFQYRRLSWVRQKYTIYNYCTDRSRYPSMPPECKRDRDI
Discarding invalid sequence or sequence without identifier and description!

>1UMZ
*ETAAFAALRKPVDVAFGRNYVPTWAFDHIKYFNGGNEIQLHLDKYTGTGFQSKGSYLFGHFSMQMKLVPGDSAGTVTAFYLSSQNSEHDEIDFEFLGNRTGQPYILQTNVFTGGKGDREQRIYLWFDPTKEFHYYSVLWNMYMIVFLVDDVPIRVFKNCKDLGVKFPFNQPMKIYSSLWNADDWATRGGLEKTDWSKAPFIASYRSFHIDGCEASVEAKFCATQGARWWDQKEFQDLDAFQYRRLSWVRQKYTIYNYCTDRSRYPSMPPECKRDRDI
Discarding invalid sequence or sequence without identifier and description!

>5WRK
QIGWRREGIKYRRNELFLDVLESVNLLMSPQGQVLSAHVSGRVVMKSYLSGMPECKFGMNDKIVIE*************************************RSISFIPPDGEFELMRYRTTKDIILPFRVIPLVREVGRTKLEVKVVIKSNFKPSLLAQKIEVRIPTPLNTSGVQVICMK

Discarding invalid sequence or sequence without identifier and description!

>5AXV
*ADVAGTSNRDFKGREQRLFNSEQYNYNNSLNGEVSVWVYAYYSDGSVLVINKNSQYKVGISETFKALKEYREGQHNDSYDEYEVNQSIYYPNGGDARKFHSNAKPRAIQIIFSPSVNVRTIKMAKGNAVSVPDEYLQRSHPWEATGIKYRKIKRDGEIVGYSHYFELPHEYNSISLAVSGVHKNPSSYNVGSAHNVMDVFQSCDLALRFCNRYWAELELVNHYISPNAYPYLDINNHSYGVALSNRQ
Discarding invalid sequence or sequence without identifier and description!

>5AXU
*ADVAGTSNRDFAGREQRLFNSEQYNYNNSLNGEVSVWVYAYYSDGSVLVINKNSQYKVGISETFKALKEYREGQHNDSYDEYEVNQSIYYPNGGDARKFHSNAKPRAIQIIFSPSVNVRTIKMAKGNAVSVPDEYLQRSHPWEATGIKYRKIKRDGEIVGYSHYFELPHEYNSISLAVSGVHKNPSSYNVGSAHNVMDVFQSCDLALRFCNRYWAELELVNHYISPNAYPYLDINNHSYGVALSNRQ
Discarding invalid sequence or sequence without identifier and description!

>5A99
*AYHGAPHEIRNRYQHDRALEILDRQYSRDSYIYAHLVLYMKDSSLQIIRAQNPRIISRSYNWDQLVLPNYRINDEKYYGRSELRHLRDGLLSDNGGRSQHDKGMNEPVSFQFIVQGDVDLGSVWFRVNKYNNISSSSFAMEAVSERAENYIGPLMRPIRYFDREMAWSYVGKFDGILFPCHPVISFAVQRANRDGAGLYNGENIYKTLIRLNDSPDLYAHYDDEETSVANYWTRFQYLYRT

Discarding invalid sequence or sequence without identifier and description!

>1TNV
*****************************************************************************************************************************************************************************************************************
Discarding invalid sequence or sequence without identifier and description!

>1O0E
LPEQIDWRKKGAVTPVKNQGSCGSCWAFSTVSTVESINQIRTGNLISLSEQELVDCDKKNHGCLGGAFVFAYQYIINNGGIDTQANYPYKAVQGPCQAASKVVSIDGYNGVPFCNE*ALKQAVAVQPSTVAIDASSAQFQQYSSGIFSGPCGTKLNHGVTIVGYQANYWIVRNSWGRYWGEKGYIRMLRVGGCGLCGIARLPYYPTKA
Discarding invalid sequence or sequence without identifier and description!

>2NA0
*GNIMDGKSVEELSSTECHQWYKKFMTECPSGQLTLYEFRQFFGLKNLSPWASQYVEQMFETFDFNKDGYIDFMEYEAALSLVLKGKVEQKLRWYFKLYDVDGNGCIDRDELLTIIRAIRAINPCSDSTMTAEEFTDTVFSKIDVNGDGELSLEEFMEGVQKDQMLLDTLTRSLDLTRIVRRLQNGEQDEEGASGRETEAAEADG
Discarding invalid sequence or sequence without identifier and description!

>3NXN
PQITLWKRPLVTIRIGGQLKEALLDTGA

Discarding invalid sequence or sequence without identifier and description!

>1HLM
*GATQSFQSVGDLTPAEKDLIRSTWDQLMTHRTGFVADVFIRIFHNDPTAQRKFPQMAGLSPAELRTSRQMHAHAIRVSALMTTYIDEMDTEVLPELLATLTRTHDKNHVGKKNYDLFGKVLMEAIKAELGVGFTKQVHDAWAKTFAIVQGVLITKHAS
Discarding invalid sequence or sequence without identifier and description!

>5EIL
MSKLGEMLI*AVLIGSKEAVKVLLDLGADPNASDEDGLTPLHAAAMAGHKEIVKLLLSKGADPNAKDSDGRTPLHYAAENGHKEIVKLLLSKGADPNAKDSDGRTPLHYAAENGHKEIVKLLLSKGADPNTSDSDGRTPLDLAREHGNEEIVKLLEKQG
Discarding invalid sequence or sequence without identifier and description!

>4BKC
GVFN*ETETTSVIPAARLFKAFILDGDNLFPKVAPQAISSVENIEGNGGPGTIKKISFPEGFPFKYVKDRVDEVDHTNFKYNYSVIEGGPIGDTLEKISNEIKIVATPDGGSILKISNKYHTKGDHEVKAEQVKASKEMGETLLRAVESYLLAHSDAYN
Discarding invalid sequence or sequence without identifier and description!

>4BKD
GVFN*ETETTSVIPAARLFKAFILDGDNLFPKVAPQAISSVENIEGNGGPGTIKKISFPEGFPFKYVKDRVDEVDHTNFKYNYSVIEGGPIGDTLEKISNEIKIVATPDGGSILKISNKYHTKGDHEVKAEQVKASKEMGETLLRAVESYLLAHSDAYN
Discarding i

Discarding invalid sequence or sequence without identifier and description!

>1GMZ
DLWQFGKMILKETGKLPFPYYVTYGCYCGVGGRGGPKDATDRCCFVHDCCYGKLTSCKPKTDRYSYSRKDGTIVCGENDPCRKEICECDKAAAVCFRENLDTYNKKYMSYLKSLCKK*ADDC
Discarding invalid sequence or sequence without identifier and description!

>2HYQ
*SLTHRKFGGSGGSPFSGLSSIAVRSGSYLDAIIIDGVHHGGSGGNLSPTFTFGSGEYISNMTIRSGDYIDNISFETNMGRRFGPYGGSGGSANTLSNVKVIQINGSAGDYLDSLDIYYEQY
Discarding invalid sequence or sequence without identifier and description!

>2HYR
*SLTHRKFGGSGGSPFSGLSSIAVRSGSYLDAIIIDGVHHGGSGGNLSPTFTFGSGEYISNMTIRSGDYIDNISFETNMGRRFGPYGGSGGSANTLSNVKVIQINGSAGDYLDSLDIYYEQY
Discarding invalid sequence or sequence without identifier and description!

>2GTY
*SLTHRKFGGSGGSPFSGLSSIAVRSGSYLDAIIIDGVHHGGSGGNLSPTFTFGSGEYISNMTIRSGDYIDNISFETNMGRRFGPYGGSGGSANTLSNVKVIQINGSAGDYLDSLDIYYEQY
Discarding invalid sequence or sequence without identifier and description!

>2GUC
*SLTHRKFGGSGGSPFSGLSSIAVRSGSYLDAIIIDGVHHGGSGGNLSPTFTFGSGEYISNMTIRSGDYIDNIS

Discarding invalid sequence or sequence without identifier and description!

>2BEQ
*LGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIK
Discarding invalid sequence or sequence without identifier and description!

>2R3C
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier and description!

>2R5B
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier and description!

>2R5D
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier and description!

>3MGN
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier and description!

>3L35
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier and description!

>3L36
*RMKQIEDKIEEIESKQKKIENEIARIKKLLQLTVWGIKQLQARIL
Discarding invalid sequence or sequence without identifier an

Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 57M
Buffer          : 1 X 13M = 13M
Table           : 1 X 67M = 67M
Miscellaneous   : 1M
Total           : 140M

Table limit with the given memory limit:
Max number of representatives: 1422088
Max number of word counting entries: 82487722

comparing sequences from          0  to     137894
..........    10000  finished       4112  clusters
..........    20000  finished       8673  clusters
..........    30000  finished      13274  clusters
..........    40000  finished      17578  clusters
..........    50000  finished      21855  clusters
..........    60000  finished      26025  clusters
..........    70000  finished      30204  clusters
..........    80000  finished      34535  clusters
..........    90000  finished      39700  clusters
..........   100000  finished      45158  clusters
..........   110000  finished      50309  clusters
..........   120000  finished      55410  clusters
..........

# use clustered file to filter dataframe

In [32]:
# parse clustered sequences to obtain PDB IDs for consensus sequences.
# use the parsed PDB IDs to filter the pandas dataframe.
from tqdm import tqdm

def get_fasta_names(fasta):
    """
    retrieves a list of names from a input fasta file
    """
    name_array = {}
    with open(fasta) as fasta_file:
        fasta_file = fasta_file.readlines()
        for line in tqdm(fasta_file):
            if line[0] == ">":
                name = line.strip("\n").strip(">").strip("'").strip(" ")
                name = f"'{name}'"
            name_array[name] = 1
        return name_array

def filter_df(panda_df, name_dict):
    """
    returns DF with only rows that contain array info
    """
    for index, row in tqdm(panda_df.iterrows()):
        name = row['PDB name'].strip("'")
        if name not in name_dict.keys():
            panda_df.drop(index)
    return panda_df

# Use clustered proteins to create the testing/training dataset

In [53]:
name_dict = list(get_fasta_names('unique_100.fa').keys())
#print(name_dict)
filtered_df = df_unique[df_unique["PDB name"].isin(name_dict)]
df_unique["PDB name"]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 132716/132716 [00:00<00:00, 1532625.87it/s]


393731    '5J8V'
393725    '4UWE'
393720    '4UWA'
393719    '5NUG'
393709    '3KGV'
           ...  
530       '2P7R'
454       '1PLW'
455       '1PLX'
675       '4OLR'
689       '4ONK'
Name: PDB name, Length: 139496, dtype: object

In [54]:
filtered_df["sequence length"].describe()

count    66358.000000
mean       279.303641
std        197.416905
min         11.000000
25%        144.000000
50%        241.000000
75%        360.000000
max       5037.000000
Name: sequence length, dtype: float64

In [55]:
DATASET = filtered_df

train, test = train_test_split(DATASET, test_size=0.005)

In [56]:
train.describe()

Unnamed: 0,sequence length
count,66026.0
mean,279.214885
std,197.042572
min,11.0
25%,144.0
50%,241.0
75%,360.0
max,5037.0


# Save testing and training
The below saves the testing and training to the following formats:
* Training - pickle
* Training - CSV
* Testing - Fasta
* Testing - CSV

In [57]:
train.to_csv(f'TRAINING_100.csv', index=False)
test.to_csv(f'TESTING_100.csv', index=False)
# turn testing into fasta
csv_to_fasta(test, "TESTING_100.fa")

In [16]:
# turn training into pickle
kmer_size = 10
prothashOBJ = ProteinHash(f'TRAINING.csv', kmer_size)
prothashtable = prothashOBJ.construct_hash()
outfile = open(f'TRAINING.pickle','wb')
pickle.dump(prothashtable, outfile)
outfile.close()

609it [00:00, 3059.68it/s]

WORKING


138799it [00:56, 2457.66it/s]


# K-fold cross validation splits (not used)

In [16]:
# split the dataset into testing and training
### 1.A. create CSV outputs for all - testing_[N].csv, training_[N].csv
### 1.B. create hash table for training - training_[N].p
# Parameters
NUMBER_OF_FOLDS = 5
kmer_size = 3

kf = KFold(n_splits = NUMBER_OF_FOLDS) #, shuffle = True, random_state = 2)
for K, fold in enumerate(kf.split(filtered_df)):
    print(f"\n working on fold number: {K}")
    train, test = 0, 0
    train = df_unique.iloc[fold[0]]
    test =  df_unique.iloc[fold[1]]
    print(f" Creating training and testing CSVs..")
    train.to_csv(f'training_{K+1}.csv', index=False)
    test.to_csv(f'testing_{K+1}.csv', index=False)
    csv_to_fasta(test, f'testing_{K+1}.fa')
    # print(train["PDB name"].duplicated().sum())
    # print(test["PDB name"].duplicated().sum())
    # print(train["PDB name"].count() + test["PDB name"].count())
    # print(test["PDB name"].count())
    # print(df_unique["PDB name"].count())
    print(f"Creating a hash tables for the training CSV..")
    prothashOBJ = ProteinHash(f'training_{K+1}.csv', kmer_size)
    prothashtable = prothashOBJ.construct_hash()
    outfile = open(f'testing_{K+1}.pickle','wb')
    pickle.dump(prothashtable, outfile)
    


 working on fold number: 0
 Creating training and testing CSVs..
Creating a hash tables for the training CSV..
WORKING


31214it [00:13, 2231.88it/s]



 working on fold number: 1
 Creating training and testing CSVs..
Creating a hash tables for the training CSV..
WORKING


31214it [00:17, 1827.84it/s]



 working on fold number: 2
 Creating training and testing CSVs..
Creating a hash tables for the training CSV..
WORKING


31215it [00:17, 1817.00it/s]



 working on fold number: 3
 Creating training and testing CSVs..
Creating a hash tables for the training CSV..
WORKING


31215it [00:17, 1784.06it/s]



 working on fold number: 4
 Creating training and testing CSVs..
Creating a hash tables for the training CSV..
WORKING


31215it [00:17, 1786.25it/s]


In [149]:
# TODO:
## 1. turn the splits into K sets of testing/training
### 1.A. create CSV outputs for all - testing_[N].csv, training_[N].csv
### 1.B. create hash table for training - training_[N].p
### 1.C. create fasta file input for the testing - testing_[N].fasta

In [148]:
# TODO (Benchmarking):
## 1. create a testing/training split.
### 1.A. Create a training set (N=?)
### 1.B. Create a testing set that does not overlap training (N=100)
## 2. Benchmarking the tools.
### 2.A. Download several tools, ensure each tool can be downloaded/installed with a button push.
### 2.B. Automate the benchmarking with a BASH script or python script with subcalls.

Unnamed: 0,sequence length,PDB name,Proten Sequence,3 char
0,3,'1A30','EDL','CEC'
1,3,'1B05','KCK','CEC'
2,3,'1B0H','KAK','CEC'
4,3,'1B2H','KAK','CEC'
5,3,'1B32','KMK','CEC'
...,...,...,...,...
139491,166,'1G2I','MKVLFLTANEFEDVELIYPYHRLKEEGHEVYIASFERGTITGKHG...,'CEEEEECCCCECHHHHHHHHHHHHHHCCEEEEEECCCEEEECCCC...
139492,166,'1G5M','MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDDVEENRTEAPE...,'CCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCC...
139493,166,'1GJH','MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDDVEENRTEAPE...,'CCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCC...
139494,166,'1GNP','MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVV...,'CEEEEEEEECCCCCCHHHHHHHHHHCCCCCCCCCCCEEEEEEEEE...


In [None]:
k=4 : CCCCECCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHCEEEEECCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHCHHHHHHHHHHHHHHHHHHHCCCHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHCC
k=7 : CCCCECCCCCCCCCCCEECCEHHHHCCCHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHCCCCCCCCECCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHCCCCCECCCCCCCCCCCCCCCCHHHHCCCEEEEECCCCCEEEEECCCCCCECCCCCCCCCHHHHCCCECCCCECCCCCECHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHCCCCCCCCCCCCHHHHHHHHHHHHHCCCCCCCCCCCHHHCCCCHHHCCHHHHHHHHHHHHHHHHHHHCCCCCCCCEEECCCCCCCCCCEEECCCCEEEEEECCCCCEEEHHHHCCHHHHCEECHHHCCCCCCCHHHHHHHHHHHHHHHHC
k=21: CCCCECCCCCCCCCCCEECCEHHHHCCCHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHCCCCCCCCECCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHCCCCCECCCCCCCCCCCCCCCCHHHHCCCEEEEECCCCCEEEEECCCCCCECCCCCCCCCHHHHCCCECCCCECCCCCECHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHCCCCCCCCCCCCHHHHHHHHHHHHHCCCCCCCCCCCHHHCCCCHHHCCHHHHHHHHHHHHHHHHHHHCCCCCCCCEEECCCCCCCCCCEEECCCCEEEEEECCCCCEEEHHHHCCHHHHCEECHHHCCCCCCCHHHHHHHHHHHHHHHHC
k=31: CCCCECCCCCCCCCCCEECCEHHHHCCCHHHHHHHHHHHHHHHHCCCCCCCCHHHHHHHHCCCCCCCCECCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCHHHHHHHHHCCCCCECCCCCCCCCCCCCCCCHHHHCCCEEEEECCCCCEEEEECCCCCCECCCCCCCCCHHHHCCCECCCCECCCCCECHHHHHHHHHHHHHHHHHHHHHHHHHCCCHHHHHCCCCCCCCCCCCHHHHHHHHHHHHHCCCCCCCCCCCHHHCCCCHHHCCHHHHHHHHHHHHHHHHHHHCCCCCCCCEEECCCCCCCCCCEEECCCCEEEEEECCCCCEEEHHHHCCHHHHCEECHHHCCCCCCCHHHHHHHHHHHHHHHHC