# DebruijnExtend Dataset Analysis
This python notebook can be used to perform an analysis of the the datasets being used to train and test.

In [1]:
#imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# PATHS
dataset = "../data/primary2secondary.csv"
csv_column_names = ['sequence length', 'PDB name', 'Proten Sequence', '8 char', '3 char', '1 char']
seq_length_column = 0
pdb_name_column = 1
protein_column = 2
secondary_column = 4 


In [3]:
df = pd.read_csv(dataset, header=None, usecols=[seq_length_column,
                                                pdb_name_column, 
                                                protein_column, 
                                                secondary_column])
df.columns = ['sequence length', 'PDB name', 'Proten Sequence', '3 char']

In [4]:
# assort by length, drop duplicates proteins/chains (by PDB name)
df_sorted = df.sort_values(by = 'sequence length', ascending = False)
df_unique = df_sorted.drop_duplicates(subset = ["PDB name"])
df_unique = df_unique.drop_duplicates()
df_unique.duplicated(keep=False).sum()


0

In [5]:
print("Before filtering:")
df["sequence length"].describe()

Before filtering:


count    393732.000000
mean        260.212634
std         196.864409
min           3.000000
25%         131.000000
50%         223.000000
75%         336.000000
max        5037.000000
Name: sequence length, dtype: float64

In [6]:
print("After filtering:")
df_unique["sequence length"].describe()

After filtering:


count    139496.000000
mean        297.846863
std         209.231296
min           5.000000
25%         157.000000
50%         261.000000
75%         372.000000
max        5037.000000
Name: sequence length, dtype: float64

In [7]:
train, test = train_test_split(df_unique, test_size=0.2)

In [8]:
train.describe()

Unnamed: 0,sequence length
count,111596.0
mean,297.601025
std,208.861733
min,5.0
25%,157.0
50%,261.0
75%,373.0
max,5037.0


In [9]:
df_unique.to_csv(f'testingDS_{K+1}.csv', index=False)

NameError: name 'K' is not defined

In [17]:
# split the dataset into testing and training
### 1.A. create CSV outputs for all - testing_[N].csv, training_[N].csv
### 1.B. create hash table for training - training_[N].p
import sys
sys.path.append("/Users/dreyceyalbin/Dropbox/Fall2020-classes/Algorithms/project/DebruijnExtend/py_scripts")
from csvtohash import ProteinHash
import pickle
# Parameters
NUMBER_OF_FOLDS = 5
kmer_size = 3

kf = KFold(n_splits = NUMBER_OF_FOLDS) #, shuffle = True, random_state = 2)
for K, fold in enumerate(kf.split(df_unique)):
    print(f"\n working on fold number: {K}")
    train, test = 0, 0
    train = df_unique.iloc[fold[0]]
    test =  df_unique.iloc[fold[1]]
    print(f" Creating training and testing CSVs..")
    train.to_csv(f'training_{K+1}.csv', index=False)
    test.to_csv(f'testing_{K+1}.csv', index=False)
    # print(train["PDB name"].duplicated().sum())
    # print(test["PDB name"].duplicated().sum())
    # print(train["PDB name"].count() + test["PDB name"].count())
    # print(test["PDB name"].count())
    # print(df_unique["PDB name"].count())
    print(f"Creating a hash tables for the training CSV..")
    prothashOBJ = ProteinHash(f'training_{K+1}.csv', kmer_size)
    prothashtable = prothashOBJ.construct_hash()
    outfile = open(f'testing_{K+1}.pickle','wb')
    pickle.dump(prothashtable, outfile)
    

working on fold number: 0
 

 Creating training and testing CSVs..


0it [00:00, ?it/s]

 

 Creating a hash tables for the training CSV..


111597it [00:08, 13396.81it/s]


working on fold number: 1
 

 Creating training and testing CSVs..


0it [00:00, ?it/s]

 

 Creating a hash tables for the training CSV..


111598it [00:08, 13377.75it/s]


working on fold number: 2
 

 Creating training and testing CSVs..
 

 Creating a hash tables for the training CSV..


79256it [00:05, 13881.04it/s]

In [149]:
# TODO:
## 1. turn the splits into K sets of testing/training
### 1.A. create CSV outputs for all - testing_[N].csv, training_[N].csv
### 1.B. create hash table for training - training_[N].p
### 1.C. create fasta file input for the testing - testing_[N].fasta

In [148]:
# TODO (Benchmarking):
## 1. create a testing/training split.
### 1.A. Create a training set (N=?)
### 1.B. Create a testing set that does not overlap training (N=100)
## 2. Benchmarking the tools.
### 2.A. Download several tools, ensure each tool can be downloaded/installed with a button push.
### 2.B. Automate the benchmarking with a BASH script or python script with subcalls.

Unnamed: 0,sequence length,PDB name,Proten Sequence,3 char
0,3,'1A30','EDL','CEC'
1,3,'1B05','KCK','CEC'
2,3,'1B0H','KAK','CEC'
4,3,'1B2H','KAK','CEC'
5,3,'1B32','KMK','CEC'
...,...,...,...,...
139491,166,'1G2I','MKVLFLTANEFEDVELIYPYHRLKEEGHEVYIASFERGTITGKHG...,'CEEEEECCCCECHHHHHHHHHHHHHHCCEEEEEECCCEEEECCCC...
139492,166,'1G5M','MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDDVEENRTEAPE...,'CCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCC...
139493,166,'1GJH','MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDDVEENRTEAPE...,'CCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCCCCC...
139494,166,'1GNP','MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVV...,'CEEEEEEEECCCCCCHHHHHHHHHHCCCCCCCCCCCEEEEEEEEE...
