## Import Libraries

In [1]:
import datetime
import sqlite3
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Wall-clock start of the notebook run; the last cell subtracts it from "now" to report total runtime.
global_start = datetime.datetime.now()

## Read data into Pandas dataframe

In [None]:
# Load the whole Reviews table into a DataFrame.
# The connection is closed in a `finally` block so it is released even when
# the query raises (e.g. missing table or unreadable database file).
conn = sqlite3.connect('final.sqlite')
try:
    data = pd.read_sql_query(""" SELECT * FROM Reviews """, conn)
finally:
    conn.close()

print('Shape of our data : {}'.format(data.shape))
data.head(3)

## Randomly sample 30k points from the 364k points

In [None]:
# Draw a reproducible (seed 42) random sample of 30,000 rows and renumber its index from 0.
sample_data = (
    data
    .sample(n=30000, random_state=42)
    .reset_index(drop=True)
)
del data  # the full table is not used again; drop the reference to free up RAM

# Relative frequency of each class label within the sample.
class_distribution = sample_data['Score'].value_counts(normalize=True)

print('shape of our sampled data : {}'.format(sample_data.shape))
print('\n Distribution of class label : \n{}'.format(class_distribution))
sample_data.head(3)

## Time Based Splitting
We sort the sampled data in ascending order of the **Time** column and split it without shuffling, such that:
* Train_data = First 70%
* Test_data = Last 30%

In [None]:
# Sort by Time so that the un-shuffled split below separates older reviews
# (train) from newer ones (test).
sample_data = sample_data.sort_values('Time').reset_index(drop=True)

# Select features and labels by column name instead of by position, so the
# cell does not silently pick the wrong column if the table layout changes.
# NOTE(review): the names follow the original inline comments
# ('CleanedText', 'Score') — confirm against the Reviews table schema.
X_text = sample_data['CleanedText']
y = sample_data['Score']
del sample_data  # Free up the RAM

# Split the data: first 70% -> train, last 30% -> test (shuffle=False keeps the time order)
X_text_train, X_text_test, y_train, y_test = train_test_split(X_text, y, test_size=0.30, shuffle=False)

# Sanity check
print('Shape of X_text_train : {}'.format(X_text_train.shape))
print('Shape of y_train : {}'.format(y_train.shape))
print()
print('Shape of X_text_test : {}'.format(X_text_test.shape))
print('Shape of y_test : {}'.format(y_test.shape))

# Modelling

We will build KNN models based on the following features:
* **Bag of words**
* **tf-idf**
* **avg_word2vec**
* **tf-idf weighted word2vec**

In [None]:
def knn_cv(X_train, y_train, neighbors=None, cv=10, scoring='accuracy'):
    """Choose the number of neighbours for a KNN classifier by cross-validation.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training features; passed unchanged to ``cross_val_score``.
    y_train : array-like
        Training labels.
    neighbors : iterable of int, optional
        Candidate values of K. Defaults to the odd numbers 1, 3, ..., 29.
    cv : int or cross-validation splitter, optional
        Forwarded to ``cross_val_score``. Defaults to 10 folds.
    scoring : str or callable, optional
        Forwarded to ``cross_val_score``. Defaults to ``'accuracy'``.

    Returns
    -------
    tuple
        ``(cv_acc, cv_error, optimal_k)``: ``cv_acc`` maps each K to its mean
        cross-validated score, ``cv_error`` maps each K to ``1 - score`` and
        ``optimal_k`` is the K with the smallest error (the first such K in
        the order given when several tie).

    Raises
    ------
    ValueError
        If ``neighbors`` is given but empty.
    """
    if neighbors is None:
        neighbors = np.arange(1, 30, 2)
    # Plain Python ints keep the dict keys and the returned K free of numpy scalar types.
    neighbors = [int(k) for k in neighbors]
    if not neighbors:
        raise ValueError('neighbors must contain at least one value of K')

    # Mean score and error for every candidate K.
    cv_acc = {}
    cv_error = {}

    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors=k)
        mean_score = cross_val_score(knn, X_train, y_train, cv=cv, scoring=scoring).mean()
        cv_acc[k] = mean_score
        cv_error[k] = 1 - mean_score

    # The optimal K is the one with the lowest cross-validated error.
    optimal_k = min(cv_error, key=cv_error.get)

    return (cv_acc, cv_error, optimal_k)

## Bag Of Words

In [None]:
# ************** Creating Bag Of Words ***************************
# Learn the vocabulary from the training reviews only and encode them as term counts.
count_vec = CountVectorizer()
X_train = count_vec.fit_transform(X_text_train)

bow_shape = X_train.shape  # (number of reviews, vocabulary size)
print('Type of X_train : {}'.format(type(X_train)))
print('Shape of X_train : {}'.format(bow_shape))
print('Number of unique words : {}'.format(bow_shape[1]))

In [None]:
# ********************** 10 - fold Cross Validation **********************
# Time only the cross-validation itself, not the plotting that follows.
start = datetime.datetime.now()
cv_acc, cv_error, optimal_k = knn_cv(X_train, y_train)
elapsed = datetime.datetime.now() - start

# Materialise the dict views as lists before plotting; this does not rely on
# matplotlib converting dict_keys / dict_values objects itself.
k_values = list(cv_acc.keys())
mean_accuracy = list(cv_acc.values())

plt.figure(figsize=(12,6))
plt.plot(k_values, mean_accuracy)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('KNN BOW ACCURACY')
plt.show()

print('The optimum value of K based on cross-validation is : {}'.format(optimal_k))
print('Time taken : {}'.format(elapsed))

In [None]:
# ********************* Accuracy on Test Data ***********************
# Encode the held-out reviews with the vocabulary fitted on the training set.
X_test = count_vec.transform(X_text_test)

# Fit on the full training set with the cross-validated K, then score the test set.
knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)

print('Accuracy on Testing data is : {}'.format(acc))

In [None]:
# Display the class labels known to the classifier fitted in the previous cell.
knn.classes_

## TF-IDF

In [None]:
# ***************** Creating TFIDF Vectors ***************************
# Fit the TF-IDF vocabulary and IDF weights on the training reviews only.
tfidf_vec = TfidfVectorizer()
X_train = tfidf_vec.fit_transform(X_text_train)

tfidf_shape = X_train.shape  # (number of reviews, vocabulary size)
print('Type of X_train : {}'.format(type(X_train)))
print('Shape of X_train : {}'.format(tfidf_shape))
print('Number of unique words : {}'.format(tfidf_shape[1]))

In [None]:
# ******************* 10-fold Cross Validation************************
# Time only the cross-validation itself, not the plotting that follows.
start = datetime.datetime.now()
cv_acc, cv_error, optimal_k = knn_cv(X_train, y_train)
elapsed = datetime.datetime.now() - start

# Materialise the dict views as lists before plotting; this does not rely on
# matplotlib converting dict_keys / dict_values objects itself.
k_values = list(cv_acc.keys())
mean_accuracy = list(cv_acc.values())

plt.figure(figsize=(12,6))
plt.plot(k_values, mean_accuracy)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('KNN TF-IDF ACCURACY')
plt.show()

print('The optimum value of K based on cross-validation is : {}'.format(optimal_k))
print('Time taken : {}'.format(elapsed))

In [None]:
# ****************** Accuracy on Test Data ********************
# Encode the held-out reviews with the TF-IDF vocabulary fitted on the training set.
X_test = tfidf_vec.transform(X_text_test)

# Fit on the full training set with the cross-validated K, then score the test set.
knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)

print('Accuracy on Testing data is : {}'.format(acc))

## Average Word2Vec

In [None]:
# ******************* Tokenize training reviews ********************
# Split every training review on whitespace into a list of tokens.
review_list = [text.split() for text in X_text_train]

# Show the first review before and after tokenization.
print(X_text_train.iloc[0])
print()
print(review_list[0])

In [None]:
# ****************** Word2Vec **********************

# min_count=5 keeps only words that occurred at least 5 times;
# size=50 is the dimensionality of every word vector.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# uses `vector_size=` and `wv.key_to_index`) — confirm the installed version.
w2v_model=Word2Vec(review_list,min_count=5,size=50, workers=4)

# Kept as a set: the later cells test `word in w2v_words` once per word of
# every review, which is a linear scan on a list but O(1) on a set.
w2v_words = set(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))

In [None]:
# *************** Converting training reviews to vectors ********************

# Set membership is O(1); testing against a vocabulary list costs a scan of
# the whole vocabulary for every word of every review.
known_words = set(w2v_words)
vec_dim = w2v_model.vector_size  # taken from the model instead of repeating the literal 50

X_train = []  # the avg-w2v for each sentence/review is stored in this list
for review in review_list:  # for each review/sentence
    review_vec = np.zeros(vec_dim)  # running sum of word vectors, starts as the zero vector
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in review:  # for each word in a review/sentence
        if word in known_words:
            review_vec += w2v_model.wv[word]
            cnt_words += 1
    if cnt_words != 0:  # a review with no known word stays the zero vector
        review_vec /= cnt_words
    X_train.append(review_vec)


X_train = np.array(X_train)

print('Type of X_train : {}'.format(type(X_train)))
print('Shape of X_train : {}'.format(X_train.shape))
print('Number of dimensions : {}'.format(X_train.shape[1]))

In [None]:
# ******************* 10-fold Cross Validation************************
# Time only the cross-validation itself, not the plotting that follows.
start = datetime.datetime.now()
cv_acc, cv_error, optimal_k = knn_cv(X_train, y_train)
elapsed = datetime.datetime.now() - start

# Materialise the dict views as lists before plotting; this does not rely on
# matplotlib converting dict_keys / dict_values objects itself.
k_values = list(cv_acc.keys())
mean_accuracy = list(cv_acc.values())

plt.figure(figsize=(12,6))
plt.plot(k_values, mean_accuracy)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('KNN AVG-W2V ACCURACY')
plt.show()

print('The optimum value of K based on cross-validation is : {}'.format(optimal_k))
print('Time taken : {}'.format(elapsed))

In [None]:
# ********** Tokenize test reviews *****************
# Split every test review on whitespace into a list of tokens.
# Note: this overwrites `review_list`, which previously held the training tokens.
review_list = [text.split() for text in X_text_test]

# Show the first test review before and after tokenization.
print(X_text_test.iloc[0])
print()
print(review_list[0])

In [None]:
# **************** Converting test reviews to vectors ****************

# Set membership is O(1); testing against a vocabulary list costs a scan of
# the whole vocabulary for every word of every review.
known_words = set(w2v_words)
vec_dim = w2v_model.vector_size  # taken from the model instead of repeating the literal 50

X_test = []  # the avg-w2v for each sentence/review is stored in this list
for review in review_list:  # for each review/sentence
    review_vec = np.zeros(vec_dim)  # running sum of word vectors, starts as the zero vector
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in review:  # for each word in a review/sentence
        if word in known_words:  # words unseen by the model are skipped
            review_vec += w2v_model.wv[word]
            cnt_words += 1
    if cnt_words != 0:  # a review with no known word stays the zero vector
        review_vec /= cnt_words
    X_test.append(review_vec)


X_test = np.array(X_test)

print('Type of X_test : {}'.format(type(X_test)))
print('Shape of X_test : {}'.format(X_test.shape))
print('Number of dimensions : {}'.format(X_test.shape[1]))

In [None]:
# ***************** Accuracy on Test Data **************************
# Fit on the full training set with the cross-validated K, then score the test set.
knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)

print('Accuracy on Testing data is : {}'.format(acc))

## TF-IDF Weighted Word2Vec

In [None]:
# ******************* Tokenize training reviews ********************
# Split every training review on whitespace into a list of tokens.
# Note: this overwrites `review_list` with the training tokens again.
review_list = [text.split() for text in X_text_train]

# Show the first review before and after tokenization.
print(X_text_train.iloc[0])
print()
print(review_list[0])

In [None]:
# ********************** Word2Vec**************************
# min_count=5 keeps only words that occurred at least 5 times;
# size=50 is the dimensionality of every word vector.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings (gensim 4
# uses `vector_size=` and `wv.key_to_index`) — confirm the installed version.
w2v_model=Word2Vec(review_list,min_count=5,size=50, workers=4)

# Kept as a set: the later cells test `word in w2v_words` once per word of
# every review, which is a linear scan on a list but O(1) on a set.
w2v_words = set(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))

In [None]:
# *********************** TF-IDF **************************
# Fit on the training reviews only and build a word -> IDF lookup.
# The lookup is built from `vocabulary_` (term -> column index) rather than
# `get_feature_names()`, which was removed in scikit-learn 1.2; `idf_` is
# indexed by that same column index.
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(X_text_train)
idf_dict = {term: tfidf_vec.idf_[col] for term, col in tfidf_vec.vocabulary_.items()}

In [None]:
# ***************** Converting Training Reviews to vectors *************

# Set membership is O(1); testing against a vocabulary list costs a scan of
# the whole vocabulary for every word of every review.
known_words = set(w2v_words)
vec_dim = w2v_model.vector_size  # taken from the model instead of repeating the literal 50

X_train = []  # the tfidf-weighted w2v for each sentence/review is stored in this list
for review in review_list:  # for each review/sentence
    review_vec = np.zeros(vec_dim)  # running weighted sum, starts as the zero vector
    weight_sum = 0  # sum of the tf-idf weights applied in this review
    term_counts = Counter(review)  # counted once instead of review.count(word) per word
    n_tokens = len(review)
    # NOTE(review): a word repeated c times is visited c times and each visit
    # already carries the c/len term frequency; kept unchanged so existing
    # results are preserved — confirm this weighting is intended.
    for word in review:  # for each word in a review/sentence
        # A word needs both a word vector and an IDF value. The two
        # vocabularies come from different tokenizers (whitespace split vs.
        # TfidfVectorizer, whose default pattern ignores one-character
        # tokens), so a Word2Vec word can be missing from idf_dict.
        if word in known_words and word in idf_dict:
            tf_idf = idf_dict[word]*(term_counts[word]/n_tokens)
            review_vec += (w2v_model.wv[word] * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:  # a review with no usable word stays the zero vector
        review_vec /= weight_sum
    X_train.append(review_vec)


X_train = np.array(X_train)

print('Type of X_train : {}'.format(type(X_train)))
print('Shape of X_train : {}'.format(X_train.shape))
print('Number of dimensions : {}'.format(X_train.shape[1]))

In [None]:
# ******************* 10-fold Cross Validation************************
# Time only the cross-validation itself, not the plotting that follows.
start = datetime.datetime.now()
cv_acc, cv_error, optimal_k = knn_cv(X_train, y_train)
elapsed = datetime.datetime.now() - start

# Materialise the dict views as lists before plotting; this does not rely on
# matplotlib converting dict_keys / dict_values objects itself.
k_values = list(cv_acc.keys())
mean_accuracy = list(cv_acc.values())

plt.figure(figsize=(12,6))
plt.plot(k_values, mean_accuracy)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('KNN TFIDF-W2V ACCURACY')
plt.show()

print('The optimum value of K based on cross-validation is : {}'.format(optimal_k))
print('Time taken : {}'.format(elapsed))

In [None]:
# ********** Tokenize test reviews *****************
# Split every test review on whitespace into a list of tokens.
# Note: this overwrites `review_list`, which previously held the training tokens.
review_list = [text.split() for text in X_text_test]

# Show the first test review before and after tokenization.
print(X_text_test.iloc[0])
print()
print(review_list[0])

In [None]:
# ****************** Converting Test reviews to vectors *************

# Set membership is O(1); testing against a vocabulary list costs a scan of
# the whole vocabulary for every word of every review.
known_words = set(w2v_words)
vec_dim = w2v_model.vector_size  # taken from the model instead of repeating the literal 50

X_test = []  # the tfidf-weighted w2v for each sentence/review is stored in this list
for review in review_list:  # for each review/sentence
    review_vec = np.zeros(vec_dim)  # running weighted sum, starts as the zero vector
    weight_sum = 0  # sum of the tf-idf weights applied in this review
    term_counts = Counter(review)  # counted once instead of review.count(word) per word
    n_tokens = len(review)
    # NOTE(review): a word repeated c times is visited c times and each visit
    # already carries the c/len term frequency; kept unchanged so existing
    # results are preserved — confirm this weighting is intended.
    for word in review:  # for each word in a review/sentence
        # A word needs both a word vector and an IDF value. The two
        # vocabularies come from different tokenizers (whitespace split vs.
        # TfidfVectorizer, whose default pattern ignores one-character
        # tokens), so a Word2Vec word can be missing from idf_dict.
        if word in known_words and word in idf_dict:
            tf_idf = idf_dict[word]*(term_counts[word]/n_tokens)
            review_vec += (w2v_model.wv[word] * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:  # a review with no usable word stays the zero vector
        review_vec /= weight_sum
    X_test.append(review_vec)


X_test = np.array(X_test)

print('Type of X_test : {}'.format(type(X_test)))
print('Shape of X_test : {}'.format(X_test.shape))
print('Number of dimensions : {}'.format(X_test.shape[1]))

In [None]:
# ***************** Accuracy on Test Data **************************
# Fit on the full training set with the cross-validated K, then score the test set.
knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)

print('Accuracy on Testing data is : {}'.format(acc))

# Conclusion
We got the best Test Accuracy with the following:
* **Model** : Average Word2Vec
* **Optimum K** : 15
* **Accuracy** : 0.847

In [None]:
# Total wall-clock time elapsed since `global_start` was recorded in the imports cell.
print('Time Taken to run the entire notebook : {}'.format(datetime.datetime.now()-global_start))