# Modeling
### Author: Ehsan Gharib-Nezhad


In [2]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.probability import FreqDist
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import random

# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Dataset 1: Human dataset
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [3]:
# Load the pre-processed human k-mer table from CSV.
# Later cells read its 'kmers' (text feature) and 'class' (target label) columns.
human = pd.read_csv('../datasets/human_processed_dataset.csv')

In [4]:
# Preview the first five rows to check the loaded columns.
human.head()

Unnamed: 0,kmers,class
0,at,0
1,tg,0
2,ga,0
3,ag,0
4,gg,0


In [5]:
# Summarise the index, column dtypes and memory footprint of the loaded frame.
human.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44118822 entries, 0 to 44118821
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   kmers   object
 1   class   int64 
dtypes: int64(1), object(1)
memory usage: 1009.8+ MB


# Modeling

---

CountVectorizer has several hyperparameters (for example `ngram_range`, `max_features` and `min_df`) that we may want to tune later; the cells below start with its default settings.

In [6]:
# Draw a reproducible random subsample of row positions from the full dataset.
# A dedicated, seeded generator is used so that "Restart & Run All" always selects
# the same rows (and therefore the same train/test split and model scores) without
# touching the global `random` state.
SAMPLE_FRACTION = 0.01  # share of all rows kept for modeling
SAMPLE_SEED = 42        # same seed value as the train/test split below

total_kmer_numbers = len(human['kmers'])
random_numbers = random.Random(SAMPLE_SEED).sample(
    range(total_kmer_numbers),
    int(total_kmer_numbers * SAMPLE_FRACTION),
)

In [7]:
# Pull the sampled row positions out of the feature column ('kmers') and the
# target column ('class'), in that order.
X, y = (human[column].iloc[random_numbers] for column in ('kmers', 'class'))

In [8]:
# Hold out 33% of the sample for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

## Baseline accuracy

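We need the baseline accuracy to tell whether a model is better than the null model, which always predicts the most frequent class. Here class 6 is the most frequent (about 27.5% of both the training and test labels), so a useful model must exceed an accuracy of roughly 0.275.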

In [9]:
# Normalised class frequencies for the training and test labels, shown together
# as a (train, test) tuple; the largest share is the null-model accuracy.
tuple(pd.DataFrame(labels).value_counts(normalize=True) for labels in (y_train, y_test))

(class
 6        0.274687
 0        0.149346
 1        0.141271
 3        0.123141
 2        0.114664
 4        0.112570
 5        0.084321
 dtype: float64,
 class
 6        0.274684
 0        0.149348
 1        0.141277
 3        0.123138
 2        0.114662
 4        0.112567
 5        0.084324
 dtype: float64)

In [10]:
# Learn the token vocabulary from the training text only and encode that text
# as a sparse count matrix. CountVectorizer is left at its default settings here.
cv = CountVectorizer()
Xcv_train = cv.fit_transform(X_train)

In [11]:
# Show the stored entries of the sparse training matrix as "(row, column)  count".
print(Xcv_train)

  (0, 62596)	1
  (1, 59488)	1
  (2, 50196)	1
  (3, 66867)	1
  (4, 58197)	1
  (5, 48394)	1
  (6, 66897)	1
  (7, 46583)	1
  (8, 37063)	1
  (9, 28663)	1
  (10, 37807)	1
  (11, 62896)	1
  (12, 27722)	1
  (13, 47443)	1
  (14, 38551)	1
  (15, 39829)	1
  (16, 6206)	1
  (17, 56932)	1
  (18, 38726)	1
  (19, 56612)	1
  (20, 67713)	1
  (21, 1640)	1
  (22, 27721)	1
  (23, 56529)	1
  (24, 29395)	1
  :	:
  (295570, 24181)	1
  (295571, 12939)	1
  (295572, 20258)	1
  (295573, 59306)	1
  (295574, 36609)	1
  (295575, 11774)	1
  (295576, 15149)	1
  (295577, 61847)	1
  (295578, 69104)	1
  (295579, 32357)	1
  (295580, 24915)	1
  (295581, 19615)	1
  (295582, 46865)	1
  (295583, 70909)	1
  (295584, 25960)	1
  (295585, 24182)	1
  (295586, 28399)	1
  (295587, 28687)	1
  (295588, 37825)	1
  (295589, 48446)	1
  (295590, 49902)	1
  (295591, 10789)	1
  (295592, 45384)	1
  (295593, 26263)	1
  (295594, 63209)	1


In [12]:
# Wrap the training document-term matrix in a labelled DataFrame for inspection.
# The matrix is kept sparse: calling .todense() would materialise every zero of a
# 295595 x 73084 matrix (about 2e10 cells), whereas the sparse accessor stores only
# the non-zero counts and still yields a DataFrame with the same shape and columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# newer get_feature_names_out() is used when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
Xcv_train_df = pd.DataFrame.sparse.from_spmatrix(Xcv_train, columns=feature_names)
Xcv_train_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaa,aaaaaaaag,aaaaaaac,aaaaaaag,...,ttttttcc,ttttttcct,ttttttcg,ttttttctc,ttttttctt,ttttttg,ttttttgc,ttttttgct,ttttttt,tttttttg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# (rows, columns) of the training frame: one row per training sample, one column per vocabulary token.
Xcv_train_df.shape

(295595, 73084)

In [14]:
# Encode the test corpus with the vocabulary already fitted on the training set
# (transform only, no re-fit), so train and test share the same feature columns.
Xcv_test = cv.transform(X_test)

In [15]:
# Wrap the test document-term matrix in a labelled DataFrame for inspection.
# The matrix is kept sparse: calling .todense() would materialise every zero of a
# 145593 x 73084 matrix (about 1e10 cells), whereas the sparse accessor stores only
# the non-zero counts and still yields a DataFrame with the same shape and columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# newer get_feature_names_out() is used when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
Xcv_test_df = pd.DataFrame.sparse.from_spmatrix(Xcv_test, columns=feature_names)
Xcv_test_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaa,aaaaaaaag,aaaaaaac,aaaaaaag,...,ttttttcc,ttttttcct,ttttttcg,ttttttctc,ttttttctt,ttttttg,ttttttgc,ttttttgct,ttttttt,tttttttg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# (rows, columns) of the test frame; the column count should match the training frame.
Xcv_test_df.shape

(145593, 73084)

In [None]:
# Fit a support vector classifier with a linear kernel on the sparse training counts.
# NOTE(review): this cell has no execution count in the saved notebook; kernel SVC
# training is presumably very slow on ~296k rows -- consider LinearSVC or a smaller
# sample if it does not finish. Confirm before relying on this cell.
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(Xcv_train, y_train)

In [None]:
# Mean accuracy of the SVC on the training set and on the held-out test set, as a (train, test) tuple.
svclassifier.score(Xcv_train, y_train) , svclassifier.score(Xcv_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test labels with the fitted SVC, then print the confusion matrix
# followed by the per-class precision/recall/F1 report.
y_pred = svclassifier.predict(Xcv_test)
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

In [17]:
# Fit a k-nearest-neighbours classifier (default settings) on the sparse training counts.
from sklearn.neighbors import KNeighborsClassifier
KNNclassifier = KNeighborsClassifier()
KNNclassifier.fit(Xcv_train, y_train)

KNeighborsClassifier()

In [None]:
# Mean accuracy of the KNN model on the training set and on the held-out test set, as a (train, test) tuple.
KNNclassifier.score(Xcv_train, y_train) , KNNclassifier.score(Xcv_test, y_test)

In [None]:
# Imported here as well so this cell does not depend on the SVC evaluation cell
# having been run first (otherwise these two names are undefined on a run that
# skips the SVC cells).
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test labels with the fitted KNN model, then print the confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred = KNNclassifier.predict(Xcv_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Fit a random forest classifier on the sparse training counts.
# random_state is fixed (42, the same seed as the train/test split) so that
# repeated runs build the same forest and report the same scores.
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=42)
RFC.fit(Xcv_train, y_train)

In [None]:
# Imported here as well so this cell does not depend on the SVC evaluation cell
# having been run first (otherwise these two names are undefined on a run that
# skips the SVC cells).
from sklearn.metrics import classification_report, confusion_matrix

# Predict the test labels with the fitted random forest, then print the confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred = RFC.predict(Xcv_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))