# Modeling
### Author: Ehsan Gharib-Nezhad


In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.probability import FreqDist
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import random

# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Dataset 1: Human dataset
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [2]:
# Read the pre-processed human k-mer table (one k-mer and its class per row).
HUMAN_DATASET_CSV = '../datasets/human_processed_dataset.csv'
human = pd.read_csv(HUMAN_DATASET_CSV)

In [3]:
# Preview the first rows to check the column layout (k-mer text and class label).
human.head()

Unnamed: 0,kmers,class
0,at,0
1,tg,0
2,ga,0
3,ag,0
4,gg,0


In [4]:
# Summarise the frame: number of entries, column dtypes and memory usage.
human.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44118822 entries, 0 to 44118821
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   kmers   object
 1   class   int64 
dtypes: int64(1), object(1)
memory usage: 1009.8+ MB


# Modeling

---

We may want to try many different hyperparameter values for our `CountVectorizer`.

In [5]:
# Select a percentage of the whole data randomly ___________________________________________
# The draw is made from a dedicated, seeded generator so that the subsample --
# and therefore every split, vocabulary and score derived from it -- is the
# same on every "Restart & Run All".
SAMPLE_FRACTION = 0.01   # share of the rows to keep
SAMPLE_SEED = 42         # seed for the subsample draw

total_kmer_numbers = len(human['kmers'])
sample_size = int(total_kmer_numbers * SAMPLE_FRACTION)

# Row positions, drawn without replacement; consumed positionally (via .iloc).
random_numbers = random.Random(SAMPLE_SEED).sample(range(total_kmer_numbers), sample_size)

In [6]:
# Pull the sampled rows once (by position), then separate the k-mer text used
# as the feature from the class label used as the target.
sampled_rows = human.iloc[random_numbers]
X = sampled_rows['kmers']
y = sampled_rows['class']

In [7]:
# Hold out 33% of the sample as the test set. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split repeatable.
split_options = dict(test_size=0.33, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Baseline accuracy

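We need to calculate the baseline accuracy in order to tell whether our model is better than the null model. The null model always predicts the most frequent class, so its accuracy equals that class's share of the data.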

In [8]:
# Class proportions in the training and test splits (in that order). The
# largest proportion is the accuracy of always predicting the most frequent class.
tuple(pd.DataFrame(labels).value_counts(normalize=True)
      for labels in (y_train, y_test))

(class
 6        0.273848
 0        0.149813
 1        0.141329
 3        0.122776
 2        0.115371
 4        0.112008
 5        0.084856
 dtype: float64,
 class
 6        0.273852
 0        0.149815
 1        0.141325
 3        0.122774
 2        0.115370
 4        0.112004
 5        0.084860
 dtype: float64)

In [9]:
# Learn the token vocabulary from the training split only, and encode that
# split as a sparse count matrix (one row per training sample, one column per
# vocabulary entry). All CountVectorizer hyperparameters are left at defaults.
cv = CountVectorizer()
Xcv_train = cv.fit_transform(X_train)

In [10]:
# Show the stored entries of the sparse training matrix as
# "(row, column)  count" lines.
print(Xcv_train)

  (0, 53255)	1
  (1, 16786)	1
  (2, 22996)	1
  (3, 30953)	1
  (4, 23154)	1
  (5, 32933)	1
  (6, 62696)	1
  (7, 15855)	1
  (8, 19266)	1
  (9, 18918)	1
  (10, 23315)	1
  (11, 66406)	1
  (12, 31332)	1
  (13, 36844)	1
  (14, 39442)	1
  (15, 57100)	1
  (16, 32537)	1
  (17, 55538)	1
  (18, 2850)	1
  (19, 50205)	1
  (20, 7303)	1
  (21, 20222)	1
  (22, 69655)	1
  (23, 21619)	1
  (24, 21981)	1
  :	:
  (295570, 44067)	1
  (295571, 47464)	1
  (295572, 24351)	1
  (295573, 42098)	1
  (295574, 61404)	1
  (295575, 55926)	1
  (295576, 37486)	1
  (295577, 63609)	1
  (295578, 42327)	1
  (295579, 13642)	1
  (295580, 41715)	1
  (295581, 25106)	1
  (295582, 69359)	1
  (295583, 18976)	1
  (295584, 69358)	1
  (295585, 9322)	1
  (295586, 61261)	1
  (295587, 46219)	1
  (295588, 9322)	1
  (295589, 17006)	1
  (295590, 24350)	1
  (295591, 20222)	1
  (295592, 3011)	1
  (295593, 54224)	1
  (295594, 71517)	1


In [11]:
# Wrap the training count matrix in a DataFrame whose columns are labelled
# with the vocabulary terms.
# The frame is built directly from the sparse matrix: calling `.todense()`
# would allocate every rows x vocabulary cell, which for a matrix of this size
# is far larger than the data actually stored in it.
feature_names = (cv.get_feature_names_out()                 # scikit-learn >= 1.0
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())               # older releases
Xcv_train_df = pd.DataFrame.sparse.from_spmatrix(Xcv_train,
                                                 columns=feature_names)
Xcv_train_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaaa,aaaaaaaac,aaaaaaaag,aaaaaaac,...,ttttttag,ttttttcc,ttttttcct,ttttttctt,ttttttg,ttttttga,ttttttgac,ttttttgt,ttttttgtt,ttttttt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# (number of training samples, vocabulary size)
Xcv_train_df.shape

(295595, 73408)

In [13]:
# Transform the test corpus with the vectorizer already fitted on the training
# split (transform only -- the vocabulary is not re-learned from the test data).
Xcv_test = cv.transform(X_test)

In [14]:
# Wrap the test count matrix in a DataFrame whose columns are labelled with
# the vocabulary terms learned by `cv`.
# The frame is built directly from the sparse matrix: calling `.todense()`
# would allocate every rows x vocabulary cell, which for a matrix of this size
# is far larger than the data actually stored in it.
feature_names = (cv.get_feature_names_out()                 # scikit-learn >= 1.0
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())               # older releases
Xcv_test_df = pd.DataFrame.sparse.from_spmatrix(Xcv_test,
                                                columns=feature_names)
Xcv_test_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaaa,aaaaaaaac,aaaaaaaag,aaaaaaac,...,ttttttag,ttttttcc,ttttttcct,ttttttctt,ttttttg,ttttttga,ttttttgac,ttttttgt,ttttttgtt,ttttttt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# (number of test samples, vocabulary size)
Xcv_test_df.shape

(145593, 73408)

In [None]:
# Train a linear support-vector classifier on the training count matrix; the
# `fit` method receives the training features and labels.
#
# LinearSVC is used instead of SVC(kernel='linear'): SVC's training time grows
# at least quadratically with the number of samples, which is impractical for
# a training matrix with hundreds of thousands of rows. LinearSVC accepts the
# same sparse input and exposes the same fit / predict / score methods.
# NOTE: LinearSVC uses a squared-hinge loss and a one-vs-rest scheme by
# default, so its scores can differ slightly from SVC(kernel='linear').
from sklearn.svm import LinearSVC
svclassifier = LinearSVC(random_state=42)  # fixed seed for repeatable results
svclassifier.fit(Xcv_train, y_train)

In [None]:
# Accuracy of the SVM on the training split, then on the held-out test split.
svc_train_accuracy = svclassifier.score(Xcv_train, y_train)
svc_test_accuracy = svclassifier.score(Xcv_test, y_test)
svc_train_accuracy, svc_test_accuracy

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test-set classes with the SVM, then print the confusion matrix
# followed by the classification report.
y_pred = svclassifier.predict(Xcv_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier and train it by passing the training
# features and labels to its `fit` method.
N_NEIGHBORS = 5
KNNclassifier = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
KNNclassifier.fit(Xcv_train, y_train)

In [None]:
# Accuracy of the KNN model on the training split, then on the held-out test split.
knn_train_accuracy = KNNclassifier.score(Xcv_train, y_train)
knn_test_accuracy = KNNclassifier.score(Xcv_test, y_test)
knn_train_accuracy, knn_test_accuracy

In [None]:
# Predict the test-set classes with the KNN model, then print the confusion
# matrix followed by the classification report.
y_pred = KNNclassifier.predict(Xcv_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

In [None]:
# Train a random-forest classifier on the training count matrix; the `fit`
# method receives the training features and labels.
# random_state is fixed (same seed as the train/test split) because the forest
# is built with randomness; without it the fitted model and its scores change
# from run to run.
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=42)
RFC.fit(Xcv_train, y_train)

In [None]:
# Predict the test-set classes with the random forest, then print the
# confusion matrix followed by the classification report.
y_pred = RFC.predict(Xcv_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))