# Classification Modeling: KNeighborsClassifier
### Author: Ehsan Gharib-Nezhad


In [1]:
# Load Libraries
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.probability import FreqDist
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Dataset 1: Human dataset
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [2]:
# Read the human k-mer table from the datasets folder into a DataFrame
human_csv_path = '../datasets/human_kmer_dataset.csv'
human = pd.read_csv(human_csv_path)

In [3]:
# Preview the first rows of the loaded table (displayed as the cell output)
human.head()

Unnamed: 0,sequence,class,sequence_length,A_count,T_count,G_count,C_count,2mer,3mer,4mer,5mer,6mer,7mer,8mer,9mer,10mer
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4,207,80,45,13,69,"['at', 'tg', 'gc', 'cc', 'cc', 'cc', 'ca', 'aa...","['atg', 'tgc', 'gcc', 'ccc', 'ccc', 'cca', 'ca...","['atgc', 'tgcc', 'gccc', 'cccc', 'ccca', 'ccaa...","['atgcc', 'tgccc', 'gcccc', 'cccca', 'cccaa', ...","['atgccc', 'tgcccc', 'gcccca', 'ccccaa', 'ccca...","['atgcccc', 'tgcccca', 'gccccaa', 'ccccaac', '...","['atgcccca', 'tgccccaa', 'gccccaac', 'ccccaact...","['atgccccaa', 'tgccccaac', 'gccccaact', 'cccca...","['atgccccaac', 'tgccccaact', 'gccccaacta', 'cc..."
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4,681,206,174,71,230,"['at', 'tg', 'ga', 'aa', 'ac', 'cg', 'ga', 'aa...","['atg', 'tga', 'gaa', 'aac', 'acg', 'cga', 'ga...","['atga', 'tgaa', 'gaac', 'aacg', 'acga', 'cgaa...","['atgaa', 'tgaac', 'gaacg', 'aacga', 'acgaa', ...","['atgaac', 'tgaacg', 'gaacga', 'aacgaa', 'acga...","['atgaacg', 'tgaacga', 'gaacgaa', 'aacgaaa', '...","['atgaacga', 'tgaacgaa', 'gaacgaaa', 'aacgaaaa...","['atgaacgaa', 'tgaacgaaa', 'gaacgaaaa', 'aacga...","['atgaacgaaa', 'tgaacgaaaa', 'gaacgaaaat', 'aa..."
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3,1686,476,483,393,334,"['at', 'tg', 'gt', 'tg', 'gt', 'tg', 'gg', 'gc...","['atg', 'tgt', 'gtg', 'tgt', 'gtg', 'tgg', 'gg...","['atgt', 'tgtg', 'gtgt', 'tgtg', 'gtgg', 'tggc...","['atgtg', 'tgtgt', 'gtgtg', 'tgtgg', 'gtggc', ...","['atgtgt', 'tgtgtg', 'gtgtgg', 'tgtggc', 'gtgg...","['atgtgtg', 'tgtgtgg', 'gtgtggc', 'tgtggca', '...","['atgtgtgg', 'tgtgtggc', 'gtgtggca', 'tgtggcat...","['atgtgtggc', 'tgtgtggca', 'gtgtggcat', 'tgtgg...","['atgtgtggca', 'tgtgtggcat', 'gtgtggcatt', 'tg..."
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3,1206,339,363,294,210,"['at', 'tg', 'gt', 'tg', 'gt', 'tg', 'gg', 'gc...","['atg', 'tgt', 'gtg', 'tgt', 'gtg', 'tgg', 'gg...","['atgt', 'tgtg', 'gtgt', 'tgtg', 'gtgg', 'tggc...","['atgtg', 'tgtgt', 'gtgtg', 'tgtgg', 'gtggc', ...","['atgtgt', 'tgtgtg', 'gtgtgg', 'tgtggc', 'gtgg...","['atgtgtg', 'tgtgtgg', 'gtgtggc', 'tgtggca', '...","['atgtgtgg', 'tgtgtggc', 'gtgtggca', 'tgtggcat...","['atgtgtggc', 'tgtgtggca', 'gtgtggcat', 'tgtgg...","['atgtgtggca', 'tgtgtggcat', 'gtgtggcatt', 'tg..."
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3,1437,417,406,326,288,"['at', 'tg', 'gc', 'ca', 'aa', 'ac', 'ca', 'ag...","['atg', 'tgc', 'gca', 'caa', 'aac', 'aca', 'ca...","['atgc', 'tgca', 'gcaa', 'caac', 'aaca', 'acag...","['atgca', 'tgcaa', 'gcaac', 'caaca', 'aacag', ...","['atgcaa', 'tgcaac', 'gcaaca', 'caacag', 'aaca...","['atgcaac', 'tgcaaca', 'gcaacag', 'caacagc', '...","['atgcaaca', 'tgcaacag', 'gcaacagc', 'caacagca...","['atgcaacag', 'tgcaacagc', 'gcaacagca', 'caaca...","['atgcaacagc', 'tgcaacagca', 'gcaacagcat', 'ca..."


# Modeling

---

We may want to test lots of different values of hyperparameters in our CountVectorizer.

<details><summary>Why do we need a pipeline to GridSearch over our CountVectorizer hyperparameters?</summary>
    
- The CountVectorizer is a transformer.
- Transformers have .fit() and .transform() methods, but cannot do .predict().
- In order to GridSearch over hyperparameters, we need some way to score our model performance.
- A pipeline stacks together one or more transformers with an estimator at the end. The estimator allows us to .predict() and get a score!
</details>

In [4]:
# Build one "sentence" per sequence from the '10mer' column: strip the spaces
# and the first/last character of the stored string, split it on commas, and
# re-join the pieces with single spaces so a text vectorizer can tokenize them.
kmer_sentences = []
for raw_kmers in human['10mer']:
    kmer_items = raw_kmers.replace(" ", "")[1:-1].split(',')
    kmer_sentences.append(' '.join(kmer_items))

# Hold out 25% of the rows for testing, stratified on the class label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    kmer_sentences,
    human['class'],
    test_size=0.25,
    stratify=human['class'],
    random_state=42,
)

## Baseline accuracy

We need to calculate the baseline accuracy in order to tell whether our model is better than the null model, i.e. a model that always predicts the most frequent class.

In [5]:
# Relative frequency of each class in the training and testing labels;
# the pair is shown as the cell output.
train_class_share = pd.DataFrame(y_train).value_counts(normalize=True)
test_class_share = pd.DataFrame(y_test).value_counts(normalize=True)
train_class_share, test_class_share

(class
 6        0.306545
 4        0.162253
 3        0.153425
 1        0.122070
 0        0.121157
 2        0.079756
 5        0.054795
 dtype: float64,
 class
 6        0.306849
 4        0.162557
 3        0.153425
 0        0.121461
 1        0.121461
 2        0.079452
 5        0.054795
 dtype: float64)

In [6]:
# Count vectorizer restricted to 4-grams (minimum and maximum n are both 4)
ngram_bounds = (4, 4)
cvec = CountVectorizer(ngram_range=ngram_bounds)

In [7]:
# Fit & transform the vectorizer on our training corpus.
# The vocabulary is learned from X_train only; the result is the training count matrix.
Xcv_train = cvec.fit_transform(X_train)

In [8]:
# Wrap the training count matrix in a DataFrame WITHOUT densifying it.
# The fitted vocabulary has more than a million 4-gram columns, so `.todense()`
# would allocate a full rows x columns integer array; a sparse-backed frame
# stores only the non-zero counts and still supports .head() / .shape.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())
Xcv_train = pd.DataFrame.sparse.from_spmatrix(Xcv_train, columns=feature_names)
Xcv_train.head()

Unnamed: 0,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaaca,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaacc,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaacg,aaaaaaaaaa aaaaaaaaaa aaaaaaaaag aaaaaaaaga,aaaaaaaaaa aaaaaaaaaa aaaaaaaaag aaaaaaaagc,aaaaaaaaaa aaaaaaaaac aaaaaaaaca aaaaaaacag,aaaaaaaaaa aaaaaaaaac aaaaaaaacc aaaaaaaccc,aaaaaaaaaa aaaaaaaaac aaaaaaaacg aaaaaaacgc,aaaaaaaaaa aaaaaaaaag aaaaaaaaga aaaaaaagaa,aaaaaaaaaa aaaaaaaaag aaaaaaaaga aaaaaaagag,...,ttttttttcc tttttttcct ttttttccta tttttcctaa,ttttttttcc tttttttcct ttttttcctc tttttcctcc,ttttttttct tttttttctt ttttttcttg tttttcttga,ttttttttgc tttttttgct ttttttgctc tttttgctct,ttttttttgg tttttttggg ttttttggga tttttgggag,ttttttttgt tttttttgtt ttttttgtta tttttgttat,tttttttttc ttttttttca tttttttcac ttttttcacc,tttttttttc ttttttttcc tttttttcct ttttttcctc,tttttttttg ttttttttgg tttttttggg ttttttggga,tttttttttg ttttttttgt tttttttgtt ttttttgtta
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# (rows, columns) of the vectorized training set
Xcv_train.shape

(3285, 1298808)

In [10]:
# Transform the test corpus.
# Uses the already-fitted vectorizer (transform only, no refit on X_test).
Xcv_test = cvec.transform(X_test)

In [11]:
# Wrap the test count matrix in a DataFrame WITHOUT densifying it.
# The fitted vocabulary has more than a million 4-gram columns, so `.todense()`
# would allocate a full rows x columns integer array; a sparse-backed frame
# stores only the non-zero counts and still supports .head() / .shape.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())
Xcv_test = pd.DataFrame.sparse.from_spmatrix(Xcv_test, columns=feature_names)
Xcv_test.head()

Unnamed: 0,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaaca,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaacc,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac aaaaaaaacg,aaaaaaaaaa aaaaaaaaaa aaaaaaaaag aaaaaaaaga,aaaaaaaaaa aaaaaaaaaa aaaaaaaaag aaaaaaaagc,aaaaaaaaaa aaaaaaaaac aaaaaaaaca aaaaaaacag,aaaaaaaaaa aaaaaaaaac aaaaaaaacc aaaaaaaccc,aaaaaaaaaa aaaaaaaaac aaaaaaaacg aaaaaaacgc,aaaaaaaaaa aaaaaaaaag aaaaaaaaga aaaaaaagaa,aaaaaaaaaa aaaaaaaaag aaaaaaaaga aaaaaaagag,...,ttttttttcc tttttttcct ttttttccta tttttcctaa,ttttttttcc tttttttcct ttttttcctc tttttcctcc,ttttttttct tttttttctt ttttttcttg tttttcttga,ttttttttgc tttttttgct ttttttgctc tttttgctct,ttttttttgg tttttttggg ttttttggga tttttgggag,ttttttttgt tttttttgtt ttttttgtta tttttgttat,tttttttttc ttttttttca tttttttcac ttttttcacc,tttttttttc ttttttttcc tttttttcct ttttttcctc,tttttttttg ttttttttgg tttttttggg ttttttggga,tttttttttg ttttttttgt tttttttgtt ttttttgtta
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# (rows, columns) of the vectorized test set
Xcv_test.shape

(1095, 1298808)

In [None]:
# Train a random-forest classifier on the vectorized training data.
# NOTE(review): the notebook title mentions KNeighborsClassifier, but the
# estimator fitted here is RandomForestClassifier — confirm which model is intended.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same forest and the same metrics.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(Xcv_train, y_train)

In [None]:
# Predict class labels for the held-out test matrix with the fitted classifier
y_pred = rfc.predict(Xcv_test) 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix first, then the classification report, each
# computed from the true test labels and the model's predictions.
for metric_fn in (confusion_matrix, classification_report):
    print(metric_fn(y_test, y_pred))