# Tests performed using Linear Model

In [None]:
## Download data and extract csv from Zip files

# !wget https://data.deepai.org/quora_question_pairs.zip
# !unzip quora_question_pairs.zip
# !unzip train.csv.zip

In [None]:
# All external libraries used throughout the notbook are listed here

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier as logreg_w_sgd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

import pickle
import numpy as np
import pandas as pd
import nltk
import pickle

nltk.download('punkt')
nltk.download('stopwords')
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Processing and splitting

In [None]:
ROOT = "./"

DATA_PATH = ROOT + "data/"

SAVED_MODEL_PATH = ROOT + "models/"

N_GRAM_FEAT_PATH = ROOT + "ngram_features/"
TREE_FEAT_PATH = ROOT + "tree_features/"

UNIGRAM_PATH = N_GRAM_FEAT_PATH + "unigrams/"
BIGRAM_PATH = N_GRAM_FEAT_PATH + "bigrams/"
TRIGRAM_PATH = N_GRAM_FEAT_PATH + "trigrams/"
UNIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/unigrams/"
BIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/bigrams/"
TRIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/trigrams/"


In [None]:
# train = pd.read_csv(DATA_PATH + 'train.csv', index_col=0)
# train.shape

In [None]:
# train.head()

In [None]:
# train_data, val_data, test_data = np.split(train.sample(frac=1, random_state=42), [int(.7*len(train)), int(.9*len(train))])

In [None]:
# train_data.shape, val_data.shape, test_data.shape

In [None]:
# train_data.to_csv('train_data.csv', index=False)
# val_data.to_csv('val_data.csv', index=False)
# test_data.to_csv('test_data.csv', index=False)

train_data = pd.read_csv(ROOT + "train_data.csv", index_col=0)
val_data = pd.read_csv(ROOT + "val_data.csv", index_col=0)
test_data = pd.read_csv(ROOT + "test_data.csv", index_col=0)

# Feature Engineering and feature vector creation

In [None]:
# !mkdir ngram_features
# !mkdir ngram_features/unigrams
# !mkdir ngram_features/bigrams
# !mkdir ngram_features/trigrams

In [None]:
#checking the count of unique words in data
word_set = set()
for i, row in tqdm(train.iterrows()):
    word_set.update(row['question1_preprocessed'].split())
    word_set.update(row['question2_preprocessed'].split())

In [None]:
len(word_set)

111549

## Creating Unigrams features

In [None]:
!ls

shell-init: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory
data			  smai.ipynb	 train.csv
ngram_features		  test.csv	 train_data.csv
quora_question_pairs.zip  test_data.csv  val_data.csv


In [None]:
vectorizer = CountVectorizer(ngram_range=(1,1), max_features=128)

vectorizer.fit(train['question1_preprocessed'].tolist()+train['question2_preprocessed'].tolist())

train_unigrams = vectorizer.transform(train_data['question1_preprocessed'].astype(str)+train_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/unigrams/train.npz", train_unigrams)

val_unigrams = vectorizer.transform(val_data['question1_preprocessed'].astype(str)+val_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/unigrams/val.npz", val_unigrams)

test_unigrams = vectorizer.transform(test_data['question1_preprocessed'].astype(str)+test_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/unigrams/test.npz", test_unigrams)

## Creating bi-gram features

In [None]:
vectorizer = CountVectorizer(ngram_range=(2,2), max_features=128)

vectorizer.fit(train['question1_preprocessed'].tolist()+train['question2_preprocessed'].tolist())

train_bigrams = vectorizer.transform(train_data['question1_preprocessed'].astype(str)+train_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/bigrams/train.npz", train_bigrams)

val_bigrams = vectorizer.transform(val_data['question1_preprocessed'].astype(str)+val_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/bigrams/val.npz", val_bigrams)

test_bigrams = vectorizer.transform(test_data['question1_preprocessed'].astype(str)+test_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/bigrams/test.npz", test_bigrams)

## Creating tri-grams features

In [None]:
vectorizer = CountVectorizer(ngram_range=(3,3), max_features=128)

vectorizer.fit(train['question1_preprocessed'].tolist()+train['question2_preprocessed'].tolist())

train_trigrams = vectorizer.transform(train_data['question1_preprocessed'].astype(str)+train_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/trigrams/train.npz", train_trigrams)

val_trigrams = vectorizer.transform(val_data['question1_preprocessed'].astype(str)+val_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/trigrams/val.npz", val_trigrams)

test_trigrams = vectorizer.transform(test_data['question1_preprocessed'].astype(str)+test_data['question2_preprocessed'].astype(str))
sparse.save_npz("ngram_features/trigrams/test.npz", test_trigrams)

# Logistic Regression on n-gram features

**Train on *N-grams* features using basic ML classifier = Logistic Rregression.**

In [None]:
# Obtain the labels (= ground truth values)
train_labels = train_data.is_duplicate.values
val_labels = val_data.is_duplicate.values
test_labels = test_data.is_duplicate.values
# Concat train and val labels.
train_val_labels = np.concatenate((train_labels, val_labels))

**Replicate the Results as given in paper...**

**They claimed to have following parameters as the best.**
1. $\alpha = 0.00001$ $\longrightarrow$ the regularixer for $L_2$.
2. $i = 20 = $ $\#$ iterations using **SGD** as optimizer.
3. Let's try it out for `unigrams`, `bigrams`, and `trigrams`.

## Uni-Grams

In [None]:
# Load uni-grams data.
uni_train = sparse.load_npz(UNIGRAM_PATH + "train.npz")
uni_val = sparse.load_npz(UNIGRAM_PATH + "val.npz")
uni_test = sparse.load_npz(UNIGRAM_PATH + "test.npz")

In [None]:
# Concat uni_train and uni_val.
uni_train_val = sparse.vstack((uni_train, uni_val))

In [None]:
# define an instance of Logreg model.
LR = logreg_w_sgd(loss = 'log', penalty = 'l2', alpha = 0.00001, max_iter = 1000, tol = 0.001, 
                  n_jobs = -1, random_state = None, learning_rate = 'optimal')

In [None]:
# train the model.
LR.fit(uni_train_val, train_val_labels)

SGDClassifier(alpha=1e-05, loss='log', n_jobs=-1)

In [None]:
# predict on validation data.
predictions = LR.predict(uni_test)
# Accuracy and F1 score.
f1 = f1_score(test_labels, predictions)
acc = accuracy_score(test_labels, predictions)
# Print the scores.
print("Accuracy-Score for Unigrams (Logreg with SGD) = %.3f"%acc)
print("F1-Score for Unigrams (Logreg with SGD) = %.3f"%f1)

Accuracy-Score for Unigrams (Logreg with SGD) = 0.681
F1-Score for Unigrams (Logreg with SGD) = 0.426


## Bi-Grams

In [None]:
# Load bi-grams data.
bi_train = sparse.load_npz(BIGRAM_PATH + "train.npz")
bi_val = sparse.load_npz(BIGRAM_PATH + "val.npz")
bi_test = sparse.load_npz(BIGRAM_PATH + "test.npz")

In [None]:
# Concat bi_train and bi_val.
bi_train_val = sparse.vstack((bi_train, bi_val))

In [None]:
# define an instance of Logreg model.
LR_bi = logreg_w_sgd(loss = 'log', penalty = 'l2', alpha = 0.00001, max_iter = 1000, tol = 0.001, 
                  n_jobs = -1, random_state = None, learning_rate = 'optimal')

In [None]:
# train the model.
LR_bi.fit(bi_train_val, train_val_labels)

SGDClassifier(alpha=1e-05, loss='log', n_jobs=-1)

In [None]:
# predict on validation data.
predictions_bi = LR_bi.predict(bi_test)
# Accuracy and F1 score.
f1_bi = f1_score(test_labels, predictions_bi)
acc_bi= accuracy_score(test_labels, predictions_bi)
# Print the scores.
print("Accuracy-Score for bigrams (Logreg with SGD) = %.3f"%acc_bi)
print("F1-Score for bigrams (Logreg with SGD) = %.3f"%f1_bi)

Accuracy-Score for bigrams (Logreg with SGD) = 0.669
F1-Score for bigrams (Logreg with SGD) = 0.423


## Tri-Grams

In [None]:
# Load tri-grams data.
tri_train = sparse.load_npz(TRIGRAM_PATH + "train.npz")
tri_val = sparse.load_npz(TRIGRAM_PATH + "val.npz")
tri_test = sparse.load_npz(TRIGRAM_PATH + "test.npz")

In [None]:
# Concat tri_train and tri_val.
tri_train_val = sparse.vstack((tri_train, tri_val))

In [None]:
# define an instance of Logreg model.
LR_tri = logreg_w_sgd(loss = 'log', penalty = 'l2', alpha = 0.00001, max_iter = 1000, tol = 0.001, 
                      n_jobs = -1, random_state = None, learning_rate = 'optimal')

In [None]:
# train the model.
LR_tri.fit(tri_train_val, train_val_labels)

SGDClassifier(alpha=1e-05, loss='log', n_jobs=-1)

In [None]:
# predict on validation data.
predictions_tri = LR_tri.predict(tri_test)
# Accuracy and F1 score.
f1_tri = f1_score(test_labels, predictions_tri)
acc_tri= accuracy_score(test_labels, predictions_tri)
# Print the scores.
print("Accuracy-Score for trigrams (Logreg with SGD) = %.3f"%acc_tri)
print("F1-Score for trigrams (Logreg with SGD) = %.3f"%f1_tri)

Accuracy-Score for trigrams (Logreg with SGD) = 0.649
F1-Score for trigrams (Logreg with SGD) = 0.295


## Apply GridSearchCV

**Uni-Grams**

In [None]:
# Apply gridSearch CV.
parameters = {'alpha':[0.01, 0.001, 0.0001], 'eta0':[10, 0.99, 0.1]}# 1.5, 0.9
LR_1 = logreg_w_sgd(max_iter = 1000, tol = 0.0001, n_jobs = -1, loss="log",
                    learning_rate='optimal', random_state = 23, penalty="l2")
# Cv -splitter
cv_splitter = StratifiedKFold(n_splits=5, shuffle=False)
clf_uni = GridSearchCV(LR_1, parameters, scoring = ["accuracy", "f1"], n_jobs = -1, cv = cv_splitter, refit="accuracy")
clf_uni.fit(uni_train_val, train_val_labels)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SGDClassifier(loss='log', n_jobs=-1, random_state=23,
                                     tol=0.0001),
             n_jobs=-1,
             param_grid={'alpha': [0.01, 0.001, 0.0001],
                         'eta0': [10, 0.99, 0.1]},
             refit='accuracy', scoring=['accuracy', 'f1'])

In [None]:
# Show the Results...
uni_cv = pd.DataFrame.from_dict(clf_uni.cv_results_)
uni_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta0,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,1.352726,0.113564,0.070044,0.002829,0.01,10.0,"{'alpha': 0.01, 'eta0': 10}",0.682616,0.68121,0.681279,0.680674,0.682598,0.681675,0.000789,7,0.391464,0.388067,0.391393,0.387022,0.39477,0.390543,0.002756,7
1,1.233932,0.055308,0.072264,0.005923,0.01,0.99,"{'alpha': 0.01, 'eta0': 0.99}",0.682616,0.68121,0.681279,0.680674,0.682598,0.681675,0.000789,7,0.391464,0.388067,0.391393,0.387022,0.39477,0.390543,0.002756,7
2,1.224687,0.062557,0.070501,0.005588,0.01,0.1,"{'alpha': 0.01, 'eta0': 0.1}",0.682616,0.68121,0.681279,0.680674,0.682598,0.681675,0.000789,7,0.391464,0.388067,0.391393,0.387022,0.39477,0.390543,0.002756,7
3,2.063402,0.225909,0.067272,0.001535,0.001,10.0,"{'alpha': 0.001, 'eta0': 10}",0.690064,0.689166,0.687119,0.68907,0.689386,0.688961,0.000984,1,0.434811,0.454493,0.446373,0.448539,0.43958,0.444759,0.006891,1
4,2.039464,0.227162,0.066398,0.000211,0.001,0.99,"{'alpha': 0.001, 'eta0': 0.99}",0.690064,0.689166,0.687119,0.68907,0.689386,0.688961,0.000984,1,0.434811,0.454493,0.446373,0.448539,0.43958,0.444759,0.006891,1
5,2.051403,0.203774,0.069763,0.004127,0.001,0.1,"{'alpha': 0.001, 'eta0': 0.1}",0.690064,0.689166,0.687119,0.68907,0.689386,0.688961,0.000984,1,0.434811,0.454493,0.446373,0.448539,0.43958,0.444759,0.006891,1
6,4.335784,0.267849,0.067125,0.001497,0.0001,10.0,"{'alpha': 0.0001, 'eta0': 10}",0.688401,0.689221,0.686404,0.688287,0.689578,0.688378,0.001101,4,0.434796,0.440004,0.425236,0.427634,0.461065,0.437747,0.012778,4
7,4.330396,0.260508,0.070522,0.006986,0.0001,0.99,"{'alpha': 0.0001, 'eta0': 0.99}",0.688401,0.689221,0.686404,0.688287,0.689578,0.688378,0.001101,4,0.434796,0.440004,0.425236,0.427634,0.461065,0.437747,0.012778,4
8,4.130399,0.463411,0.064,0.010251,0.0001,0.1,"{'alpha': 0.0001, 'eta0': 0.1}",0.688401,0.689221,0.686404,0.688287,0.689578,0.688378,0.001101,4,0.434796,0.440004,0.425236,0.427634,0.461065,0.437747,0.012778,4


In [None]:
# Show the Best params, and Best Score...
print(f"CV for LR Unigrams:\nBest Parameters: {clf_uni.best_params_}\nBest Scores: {clf_uni.best_score_}")

CV for LR Unigrams:
Best Parameters: {'alpha': 0.001, 'eta0': 10}
Best Scores: 0.6889608913768414


In [None]:
# Print the best F1 and Accuracy score.
print("F1 score: ", np.max(uni_cv["mean_test_f1"]), "Accuracy Score", np.max(uni_cv["mean_test_accuracy"]))

F1 score:  0.44475912445310595 Accuracy Score 0.6889608913768414


In [None]:
# Now train the model with optimal hyper-parameters and obtain the result on test data-set.
logreg_optimal = logreg_w_sgd(max_iter = 1000,
                            tol = 0.0001,
                            n_jobs = -1,
                            loss="log",
                            learning_rate='optimal',
                            random_state = 23,
                            penalty="l2",
                            alpha=clf_uni.best_params_["alpha"],
                            eta0=clf_uni.best_params_["eta0"])
# Train the Logistic Regression with CV.
logreg_optimal = logreg_optimal.fit(uni_train_val, train_val_labels)
# Predict on vals.
pred = logreg_optimal.predict(uni_test)
# Accuracy and F1 score.
f1_logreg_optimal = f1_score(test_labels, pred)
acc_logreg_optimal = accuracy_score(test_labels, pred)
# Print the scores.
print("Accuracy-Score for Logictic Regression Classifier (with SGD and for Unigrams) with Optimal Params = %.3f"%acc_logreg_optimal)
print("F1-Score for Logictic Regression Classifier (with SGD and for Unigrams) with Optimal Params = %.3f"%f1_logreg_optimal)

Accuracy-Score for Logictic Regression Classifier (with SGD and for Unigrams) with Optimal Params = 0.689
F1-Score for Logictic Regression Classifier (with SGD and for Unigrams) with Optimal Params = 0.437


In [None]:
# save the model to disk
pickle.dump(clf_uni, open(ROOT + 'models/' + 'uni_CV_model.sav', 'wb'))

**Bi-Grams**

In [None]:
# Apply gridSearch CV.
parameters = {'alpha':[0.001, 0.0001, 0.00001], 'eta0':[10, 0.99, 0.1]}# 1.5, 0.9
LR_2 = logreg_w_sgd(max_iter = 1000, tol = 0.0001, n_jobs = -1, loss="log",
                    learning_rate='optimal', random_state = 23, penalty="l2")
# Cv -splitter
cv_splitter = StratifiedKFold(n_splits=5, shuffle=False)
clf_bi = GridSearchCV(LR_2, parameters, scoring = ["accuracy", "f1"], n_jobs = -1, cv = cv_splitter, refit="accuracy")
clf_bi.fit(bi_train_val, train_val_labels)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SGDClassifier(loss='log', n_jobs=-1, random_state=23,
                                     tol=0.0001),
             n_jobs=-1,
             param_grid={'alpha': [0.001, 0.0001, 1e-05],
                         'eta0': [10, 0.99, 0.1]},
             refit='accuracy', scoring=['accuracy', 'f1'])

In [None]:
# Show the Results...
bi_cv = pd.DataFrame.from_dict(clf_bi.cv_results_)
bi_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta0,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,1.193826,0.037511,0.066589,0.004883,0.001,10.0,"{'alpha': 0.001, 'eta0': 10}",0.672159,0.668416,0.669021,0.670244,0.671233,0.670215,0.001377,7,0.31588,0.31705,0.30843,0.312111,0.317345,0.314163,0.00342,7
1,1.139502,0.053283,0.072844,0.006857,0.001,0.99,"{'alpha': 0.001, 'eta0': 0.99}",0.672159,0.668416,0.669021,0.670244,0.671233,0.670215,0.001377,7,0.31588,0.31705,0.30843,0.312111,0.317345,0.314163,0.00342,7
2,1.142603,0.050903,0.063924,0.0018,0.001,0.1,"{'alpha': 0.001, 'eta0': 0.1}",0.672159,0.668416,0.669021,0.670244,0.671233,0.670215,0.001377,7,0.31588,0.31705,0.30843,0.312111,0.317345,0.314163,0.00342,7
3,2.091236,0.223464,0.063919,0.001306,0.0001,10.0,"{'alpha': 0.0001, 'eta0': 10}",0.673354,0.668939,0.670491,0.671591,0.672443,0.671364,0.001538,1,0.347291,0.334549,0.350603,0.359499,0.351356,0.34866,0.008118,4
4,2.094176,0.19438,0.063831,0.001,0.0001,0.99,"{'alpha': 0.0001, 'eta0': 0.99}",0.673354,0.668939,0.670491,0.671591,0.672443,0.671364,0.001538,1,0.347291,0.334549,0.350603,0.359499,0.351356,0.34866,0.008118,4
5,2.104404,0.223043,0.063721,0.000796,0.0001,0.1,"{'alpha': 0.0001, 'eta0': 0.1}",0.673354,0.668939,0.670491,0.671591,0.672443,0.671364,0.001538,1,0.347291,0.334549,0.350603,0.359499,0.351356,0.34866,0.008118,4
6,5.676921,0.241973,0.064784,0.001521,1e-05,10.0,"{'alpha': 1e-05, 'eta0': 10}",0.672901,0.666836,0.669351,0.672484,0.669639,0.670242,0.002229,4,0.354065,0.291682,0.422586,0.422486,0.32338,0.36284,0.052583,1
7,5.496557,0.282137,0.063608,0.001117,1e-05,0.99,"{'alpha': 1e-05, 'eta0': 0.99}",0.672901,0.666836,0.669351,0.672484,0.669639,0.670242,0.002229,4,0.354065,0.291682,0.422586,0.422486,0.32338,0.36284,0.052583,1
8,5.306336,0.307994,0.060061,0.009615,1e-05,0.1,"{'alpha': 1e-05, 'eta0': 0.1}",0.672901,0.666836,0.669351,0.672484,0.669639,0.670242,0.002229,4,0.354065,0.291682,0.422586,0.422486,0.32338,0.36284,0.052583,1


In [None]:
# Show the Best params, and Best Score...
print(f"CV for LR bigrams:\nBest Parameters: {clf_bi.best_params_}\nBest Scores: {clf_bi.best_score_}")

CV for LR bigrams:
Best Parameters: {'alpha': 0.0001, 'eta0': 10}
Best Scores: 0.67136350971767


In [None]:
# Print the best F1 and Accuracy score.
print("F1 score: ", np.max(bi_cv["mean_test_f1"]), "Accuracy Score", np.max(bi_cv["mean_test_accuracy"]))

F1 score:  0.3628397920253069 Accuracy Score 0.67136350971767


In [None]:
# Now train the model with optimal hyper-parameters and obtain the result on test data-set.
logreg_optimal2 = logreg_w_sgd(max_iter = 1000,
                            tol = 0.0001,
                            n_jobs = -1,
                            loss="log",
                            learning_rate='optimal',
                            random_state = 23,
                            penalty="l2",
                            alpha=clf_bi.best_params_["alpha"],
                            eta0=clf_bi.best_params_["eta0"])
# Train the Logistic Regression with CV.
logreg_optimal2 = logreg_optimal2.fit(bi_train_val, train_val_labels)
# Predict on vals.
pred2 = logreg_optimal2.predict(bi_test)
# Accuracy and F1 score.
f1_logreg_optimal2 = f1_score(test_labels, pred2)
acc_logreg_optimal2 = accuracy_score(test_labels, pred2)
# Print the scores.
print("Accuracy-Score for Logictic Regression Classifier (with SGD and for bigrams) with Optimal Params = %.3f"%acc_logreg_optimal2)
print("F1-Score for Logictic Regression Classifier (with SGD and for bigrams) with Optimal Params = %.3f"%f1_logreg_optimal2)

Accuracy-Score for Logictic Regression Classifier (with SGD and for bigrams) with Optimal Params = 0.667
F1-Score for Logictic Regression Classifier (with SGD and for bigrams) with Optimal Params = 0.337


In [None]:
# save the model to disk
pickle.dump(clf_bi, open(ROOT + 'models/' + 'bi_CV_model.sav', 'wb'))

**Tri-Grams**

In [None]:
# Apply gridSearch CV.
parameters = {'alpha':[0.01, 0.001, 0.0001, 0.00001], 'eta0':[10, 0.99], "tol":[0.01, 0.001, 0.0001]}# 1.5, 0.9
LR_3 = logreg_w_sgd(max_iter = 1000, n_jobs = -1, loss="log",
                    learning_rate='optimal', random_state = 23, penalty="l2")
# Cv -splitter
cv_splitter = StratifiedKFold(n_splits=5, shuffle=False)
clf_tri = GridSearchCV(LR_3, parameters, scoring = ["accuracy", "f1"], n_jobs = -1, cv = cv_splitter, refit="accuracy")
clf_tri.fit(tri_train_val, train_val_labels)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SGDClassifier(loss='log', n_jobs=-1, random_state=23),
             n_jobs=-1,
             param_grid={'alpha': [0.01, 0.001, 0.0001, 1e-05],
                         'eta0': [10, 0.99], 'tol': [0.01, 0.001, 0.0001]},
             refit='accuracy', scoring=['accuracy', 'f1'])

In [None]:
# Show the Results...
tri_cv = pd.DataFrame.from_dict(clf_tri.cv_results_)
tri_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta0,param_tol,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0.741215,0.010177,0.072723,0.012163,0.01,10.0,0.01,"{'alpha': 0.01, 'eta0': 10, 'tol': 0.01}",0.640512,0.640191,0.64037,0.640617,0.640933,0.640525,0.000249,19,0.073061,0.073657,0.074674,0.073541,0.073995,0.073786,0.000536,19
1,0.831247,0.021899,0.064199,0.002605,0.01,10.0,0.001,"{'alpha': 0.01, 'eta0': 10, 'tol': 0.001}",0.640581,0.640109,0.64037,0.640246,0.640878,0.640437,0.00027,21,0.07189,0.073904,0.074739,0.071565,0.073525,0.073125,0.001211,21
2,1.040109,0.080018,0.062932,0.001264,0.01,10.0,0.0001,"{'alpha': 0.01, 'eta0': 10, 'tol': 0.0001}",0.640581,0.640081,0.640136,0.640233,0.640796,0.640365,0.000277,23,0.072219,0.073571,0.073254,0.071497,0.073181,0.072744,0.00077,23
3,0.706822,0.01765,0.062012,0.000278,0.01,0.99,0.01,"{'alpha': 0.01, 'eta0': 0.99, 'tol': 0.01}",0.640512,0.640191,0.64037,0.640617,0.640933,0.640525,0.000249,19,0.073061,0.073657,0.074674,0.073541,0.073995,0.073786,0.000536,19
4,0.8108,0.015727,0.061512,0.000532,0.01,0.99,0.001,"{'alpha': 0.01, 'eta0': 0.99, 'tol': 0.001}",0.640581,0.640109,0.64037,0.640246,0.640878,0.640437,0.00027,21,0.07189,0.073904,0.074739,0.071565,0.073525,0.073125,0.001211,21
5,1.045652,0.071643,0.062212,0.000863,0.01,0.99,0.0001,"{'alpha': 0.01, 'eta0': 0.99, 'tol': 0.0001}",0.640581,0.640081,0.640136,0.640233,0.640796,0.640365,0.000277,23,0.072219,0.073571,0.073254,0.071497,0.073181,0.072744,0.00077,23
6,0.724401,0.009618,0.062995,0.00128,0.001,10.0,0.01,"{'alpha': 0.001, 'eta0': 10, 'tol': 0.01}",0.654171,0.653095,0.65473,0.653259,0.65451,0.653953,0.00066,13,0.198324,0.198291,0.204319,0.20368,0.20255,0.201433,0.002614,17
7,0.820777,0.002597,0.062002,0.000334,0.001,10.0,0.001,"{'alpha': 0.001, 'eta0': 10, 'tol': 0.001}",0.65402,0.653136,0.654592,0.653562,0.654441,0.65395,0.000543,15,0.193994,0.205026,0.210751,0.202411,0.199421,0.202321,0.005588,15
8,0.927093,0.033233,0.06212,0.000177,0.001,10.0,0.0001,"{'alpha': 0.001, 'eta0': 10, 'tol': 0.0001}",0.654048,0.653246,0.654386,0.653617,0.65451,0.653961,0.000473,11,0.197091,0.203422,0.206318,0.203092,0.201943,0.202373,0.003009,13
9,0.707476,0.009448,0.063802,0.002108,0.001,0.99,0.01,"{'alpha': 0.001, 'eta0': 0.99, 'tol': 0.01}",0.654171,0.653095,0.65473,0.653259,0.65451,0.653953,0.00066,13,0.198324,0.198291,0.204319,0.20368,0.20255,0.201433,0.002614,17


In [None]:
# Show the Best params, and Best Score...
print(f"CV for LR trigrams:\nBest Parameters: {clf_tri.best_params_}\nBest Scores: {clf_tri.best_score_}")

CV for LR trigrams:
Best Parameters: {'alpha': 0.0001, 'eta0': 10, 'tol': 0.001}
Best Scores: 0.6568112533792007


In [None]:
# Print the best F1 and Accuracy score.
print("F1 score: ", np.max(tri_cv["mean_test_f1"]), "Accuracy Score", np.max(tri_cv["mean_test_accuracy"]))

F1 score:  0.3194196520253982 Accuracy Score 0.6568112533792007


In [None]:
# Now train the model with optimal hyper-parameters and obtain the result on test data-set.
logreg_optimal3 = logreg_w_sgd(max_iter = 1000,
                            tol = clf_tri.best_params_["tol"],
                            n_jobs = -1,
                            loss="log",
                            learning_rate='optimal',
                            random_state = 23,
                            penalty="l2",
                            alpha=clf_tri.best_params_["alpha"],
                            eta0=clf_tri.best_params_["eta0"])
# Train the Logistic Regression with CV.
logreg_optimal3 = logreg_optimal3.fit(tri_train_val, train_val_labels)
# Predict on vals.
pred3 = logreg_optimal3.predict(tri_test)
# Accuracy and F1 score.
f1_logreg_optimal3 = f1_score(test_labels, pred3)
acc_logreg_optimal3 = accuracy_score(test_labels, pred3)
# Print the scores.
print("Accuracy-Score for Logictic Regression Classifier (with SGD and for trigrams) with Optimal Params = %.3f"%acc_logreg_optimal3)
print("F1-Score for Logictic Regression Classifier (with SGD and for trigrams) with Optimal Params = %.3f"%f1_logreg_optimal3)

Accuracy-Score for Logictic Regression Classifier (with SGD and for trigrams) with Optimal Params = 0.651
F1-Score for Logictic Regression Classifier (with SGD and for trigrams) with Optimal Params = 0.242


In [None]:
# save the model to disk
pickle.dump(clf_tri, open(ROOT + 'models/' + 'tri_CV_model.sav', 'wb'))

# SVM on n-gram features

In [None]:
# Loading individual n-gram features

train_unigrams = sparse.load_npz(UNIGRAM_PATH + "train.npz")
val_unigrams = sparse.load_npz(UNIGRAM_PATH + "val.npz")
test_unigrams = sparse.load_npz(UNIGRAM_PATH + "test.npz")

train_bigrams = sparse.load_npz(BIGRAM_PATH + "train.npz")
val_bigrams = sparse.load_npz(BIGRAM_PATH + "val.npz")
test_bigrams = sparse.load_npz(BIGRAM_PATH + "test.npz")

train_trigrams = sparse.load_npz(TRIGRAM_PATH + "train.npz")
val_trigrams = sparse.load_npz(TRIGRAM_PATH + "val.npz")
test_trigrams = sparse.load_npz(TRIGRAM_PATH + "test.npz")


# Combining train and validation features for replicating the results

train_val_unigrams = sparse.vstack([train_unigrams, val_unigrams])
train_val_bigrams = sparse.vstack([train_bigrams, val_bigrams])
train_val_trigrams = sparse.vstack([train_trigrams, val_trigrams])

labels_train_val = np.array(train_data['is_duplicate'].tolist() + val_data['is_duplicate'].tolist())
labels_test = test_data['is_duplicate'].values


In [None]:
# Training SVM with parameters mentioned in the given research-paper

"""
Parameters:
  > C:        1.0
  > Kernel:   linear
"""

svm_classifier_unigrams = LinearSVC(C=1.0, random_state=0, max_iter=10000)
svm_classifier_bigrams  = LinearSVC(C=1.0, random_state=0, max_iter=10000)
svm_classifier_trigrams = LinearSVC(C=1.0, random_state=0, max_iter=10000)

svm_classifier_unigrams.fit(train_val_unigrams, labels_train_val)
svm_classifier_bigrams.fit(train_val_bigrams, labels_train_val)
svm_classifier_trigrams.fit(train_val_trigrams, labels_train_val)

pred_unigrams = svm_classifier_unigrams.predict(test_unigrams)
pred_bigrams = svm_classifier_bigrams.predict(test_bigrams)
pred_trigrams = svm_classifier_trigrams.predict(test_trigrams)

acc_unigrams = accuracy_score(labels_test, pred_unigrams)
acc_bigrams = accuracy_score(labels_test, pred_bigrams)
acc_trigrams = accuracy_score(labels_test, pred_trigrams)

f1_score_unigrams = f1_score(labels_test, pred_unigrams)
f1_score_bigrams = f1_score(labels_test, pred_bigrams)
f1_score_trigrams = f1_score(labels_test, pred_trigrams)


In [None]:
# Printing the accuracy and f1-scores of individual SVM-models trained on Unigrams, Bigrams and Trigrams

print("Accuracy attained by using SVM on Unigrams: {}".format(acc_unigrams*100))
print("Accuracy attained by using SVM on Bigrams: {}".format(acc_bigrams*100))
print("Accuracy attained by using SVM on Trigrams: {}".format(acc_trigrams*100))
print()
print("F1-Score by using Linear SVM on Unigrams: {}".format(f1_score_unigrams*100))
print("F1-Score by using Linear SVM on Bigrams: {}".format(f1_score_bigrams*100))
print("F1-Score by using Linear SVM on Trigrams: {}".format(f1_score_trigrams*100))


Accuracy attained by using SVM on Unigrams: 68.73531375992481
Accuracy attained by using SVM on Bigrams: 66.59081352494496
Accuracy attained by using SVM on Trigrams: 65.05973434910584

F1-Score by using Linear SVM on Unigrams: 43.13478495591146
F1-Score by using Linear SVM on Bigrams: 32.427835309420175
F1-Score by using Linear SVM on Trigrams: 24.81371087928465


In [None]:
# f = open(SAVED_MODEL_PATH + "linear_SVM_models.pkl", "wb")
# pickle.dump(svm_classifier_unigrams, f)
# pickle.dump(svm_classifier_bigrams, f)
# pickle.dump(svm_classifier_trigrams, f)
# f.close()

f = open(SAVED_MODEL_PATH + "linear_SVM_models.pkl", "rb")
test1 = pickle.load(f)
test2 = pickle.load(f)
test3 = pickle.load(f)
f.close()

In [None]:
# Apply grid-search on SVM hyperparamters

svm_parameters = {'C':[0.005, 0.001, 0.1, 0.5, 1.0, 10, 50], 'kernel':['linear', 'rbf', 'poly', 'sigmoid']}
SVM_classifier = SVC(max_iter = 3500, random_state = 0)

splinter = StratifiedKFold(n_splits=2)
best_SVM_params = GridSearchCV(SVM_classifier, svm_parameters, scoring=["accuracy", "f1"], n_jobs = -1, cv=splinter, refit="accuracy", verbose=1)
best_SVM_params.fit(train_val_unigrams, labels_train_val)


Fitting 2 folds for each of 28 candidates, totalling 56 fits




GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
             estimator=SVC(max_iter=3500, random_state=0), n_jobs=-1,
             param_grid={'C': [0.005, 0.001, 0.1, 0.5, 1.0, 10, 50],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             refit='accuracy', scoring=['accuracy', 'f1'], verbose=1)

In [None]:
# f = open(SAVED_MODEL_PATH + "grid_searched_SVM.pkl", "wb")
# pickle.dump(best_SVM_params, f)
# f.close()

f = open(SAVED_MODEL_PATH + "grid_searched_SVM.pkl", "rb")
best_SVM_params = pickle.load(f)
f.close()


In [None]:
grid_search_summary = pd.DataFrame.from_dict(best_SVM_params.cv_results_)
display(grid_search_summary)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,118.345005,2.804352,95.19486,0.402248,0.005,linear,"{'C': 0.005, 'kernel': 'linear'}",0.378572,0.384445,0.381508,0.002936,22,0.541788,0.542798,0.542293,0.000505,2
1,193.460175,0.653045,125.232381,1.635609,0.005,rbf,"{'C': 0.005, 'kernel': 'rbf'}",0.620356,0.614104,0.61723,0.003126,1,0.05381,0.304009,0.17891,0.125099,25
2,154.72335,0.55276,101.216167,0.637046,0.005,poly,"{'C': 0.005, 'kernel': 'poly'}",0.368975,0.368944,0.36896,1.5e-05,28,0.539012,0.538998,0.539005,7e-06,8
3,248.858496,31.40068,171.923608,11.294714,0.005,sigmoid,"{'C': 0.005, 'kernel': 'sigmoid'}",0.434044,0.545303,0.489673,0.05563,8,0.436478,0.445783,0.441131,0.004652,21
4,142.532297,2.252987,86.766445,1.419781,0.001,linear,"{'C': 0.001, 'kernel': 'linear'}",0.372493,0.374259,0.373376,0.000883,24,0.539634,0.540513,0.540074,0.00044,3
5,203.42645,1.485101,123.137109,0.698535,0.001,rbf,"{'C': 0.001, 'kernel': 'rbf'}",0.620356,0.614104,0.61723,0.003126,1,0.05381,0.304009,0.17891,0.125099,25
6,156.241127,0.555942,92.151558,2.864732,0.001,poly,"{'C': 0.001, 'kernel': 'poly'}",0.369063,0.368944,0.369004,5.9e-05,27,0.539021,0.538998,0.539009,1.2e-05,7
7,244.352408,29.765282,165.450538,14.256268,0.001,sigmoid,"{'C': 0.001, 'kernel': 'sigmoid'}",0.435088,0.531743,0.483416,0.048327,14,0.434466,0.451583,0.443025,0.008558,18
8,157.494532,0.115077,107.62874,0.829143,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.380732,0.388781,0.384757,0.004025,21,0.542184,0.542739,0.542461,0.000278,1
9,213.841783,0.717565,129.41065,1.996982,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.620356,0.614104,0.61723,0.003126,1,0.05381,0.304009,0.17891,0.125099,25


In [None]:
print("Grid searched best SVM params: {}".format(best_SVM_params.best_params_))

Grid searched best SVM params: {'C': 0.005, 'kernel': 'rbf'}


In [34]:
svm_parameters = {'C':[0.005, 0.001, 0.1, 0.5, 1.0, 5, 10, 50]}

LinearSVC_classifier_unigrams = LinearSVC(max_iter = 3500, random_state = 0)
splinter = StratifiedKFold(n_splits=2)
best_LinearSVC_params_unigrams = GridSearchCV(LinearSVC_classifier_unigrams, svm_parameters, scoring=["accuracy", "f1"], n_jobs = -1, cv=splinter, refit="accuracy", verbose=1)
best_LinearSVC_params_unigrams.fit(train_val_unigrams, labels_train_val)

LinearSVC_classifier_bigrams = LinearSVC(max_iter = 3500, random_state = 0)
splinter = StratifiedKFold(n_splits=2)
best_LinearSVC_params_bigrams = GridSearchCV(LinearSVC_classifier_bigrams, svm_parameters, scoring=["accuracy", "f1"], n_jobs = -1, cv=splinter, refit="accuracy", verbose=1)
best_LinearSVC_params_bigrams.fit(train_val_bigrams, labels_train_val)

LinearSVC_classifier_trigrams = LinearSVC(max_iter = 3500, random_state = 0)
splinter = StratifiedKFold(n_splits=2)
best_LinearSVC_params_trigrams = GridSearchCV(LinearSVC_classifier_trigrams, svm_parameters, scoring=["accuracy", "f1"], n_jobs = -1, cv=splinter, refit="accuracy", verbose=1)
best_LinearSVC_params_trigrams.fit(train_val_trigrams, labels_train_val)


Fitting 2 folds for each of 8 candidates, totalling 16 fits
Fitting 2 folds for each of 8 candidates, totalling 16 fits
Fitting 2 folds for each of 8 candidates, totalling 16 fits


GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
             estimator=LinearSVC(max_iter=3500, random_state=0), n_jobs=-1,
             param_grid={'C': [0.005, 0.001, 0.1, 0.5, 1.0, 5, 10, 50]},
             refit='accuracy', scoring=['accuracy', 'f1'], verbose=1)

In [35]:
f = open(SAVED_MODEL_PATH + "grid_searched_LinearSVC_params.pkl", "wb")
pickle.dump(best_LinearSVC_params_unigrams, f)
pickle.dump(best_LinearSVC_params_bigrams, f)
pickle.dump(best_LinearSVC_params_trigrams, f)
f.close()

# f = open(SAVED_MODEL_PATH + "grid_searched_LinearSVC.pkl", "rb")
# best_LinearSVC_params = pickle.load(f)
# f.close()


In [36]:
grid_search_unigrams_summary = pd.DataFrame.from_dict(best_LinearSVC_params_unigrams.cv_results_)
display(grid_search_unigrams_summary)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,3.602817,0.465158,0.1673,0.003321,0.005,{'C': 0.005},0.688233,0.688122,0.688178,5.6e-05,4,0.427545,0.426637,0.427091,0.000454,6
1,1.341896,0.103199,0.167981,0.000906,0.001,{'C': 0.001},0.68781,0.687891,0.687851,4e-05,6,0.422566,0.422266,0.422416,0.00015,8
2,51.301317,10.549561,0.175049,0.001725,0.1,{'C': 0.1},0.688239,0.688193,0.688216,2.3e-05,3,0.428576,0.42793,0.428253,0.000323,5
3,278.062006,7.386645,0.170134,0.000567,0.5,{'C': 0.5},0.68825,0.68821,0.68823,2e-05,1,0.42863,0.428036,0.428333,0.000297,3
4,345.080702,2.925272,0.178745,0.002735,1.0,{'C': 1.0},0.688244,0.68821,0.688227,1.7e-05,2,0.428626,0.428036,0.428331,0.000295,4
5,336.024166,7.205442,0.165138,0.001003,5.0,{'C': 5},0.688189,0.688149,0.688169,2e-05,5,0.428767,0.427919,0.428343,0.000424,2
6,308.307969,17.128601,0.17643,0.012338,10.0,{'C': 10},0.687728,0.687891,0.687809,8.2e-05,7,0.42189,0.429991,0.425941,0.00405,7
7,305.437063,0.688725,0.142601,0.024249,50.0,{'C': 50},0.666665,0.677079,0.671872,0.005207,8,0.522022,0.471135,0.496578,0.025443,1


In [37]:
grid_search_bigrams_summary = pd.DataFrame.from_dict(best_LinearSVC_params_bigrams.cv_results_)
display(grid_search_bigrams_summary)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,1.174966,0.03027,0.187645,0.00675,0.005,{'C': 0.005},0.670853,0.670544,0.670698,0.000155,6,0.324344,0.323132,0.323738,0.000606,7
1,1.038479,0.020852,0.178488,0.007804,0.001,{'C': 0.001},0.67037,0.67006,0.670215,0.000155,7,0.313184,0.313046,0.313115,6.9e-05,8
2,9.619829,0.233024,0.169079,0.00702,0.1,{'C': 0.1},0.67181,0.67072,0.671265,0.000545,1,0.332006,0.329295,0.330651,0.001355,6
3,43.706191,1.089869,0.17022,0.006379,0.5,{'C': 0.5},0.671832,0.670687,0.671259,0.000573,2,0.332155,0.329483,0.330819,0.001336,4
4,88.143099,2.847341,0.164018,0.000633,1.0,{'C': 1.0},0.671837,0.670681,0.671259,0.000578,3,0.332174,0.329495,0.330834,0.00134,2
5,284.438186,7.221329,0.16475,0.000804,5.0,{'C': 5},0.671826,0.67067,0.671248,0.000578,5,0.332166,0.329487,0.330827,0.001339,3
6,288.042127,7.321761,0.161891,0.000358,10.0,{'C': 10},0.671826,0.670676,0.671251,0.000575,4,0.332166,0.329431,0.330799,0.001368,5
7,278.747956,1.364035,0.138709,0.024587,50.0,{'C': 50},0.668847,0.670807,0.669827,0.00098,8,0.335821,0.329159,0.33249,0.003331,1


In [38]:
grid_search_trigrams_summary = pd.DataFrame.from_dict(best_LinearSVC_params_trigrams.cv_results_)
display(grid_search_trigrams_summary)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0.936675,0.028357,0.190954,0.001371,0.005,{'C': 0.005},0.657002,0.656544,0.656773,0.000229,7,0.251775,0.246597,0.249186,0.002589,7
1,0.781138,0.035424,0.176145,0.008158,0.001,{'C': 0.001},0.654457,0.655109,0.654783,0.000326,8,0.209514,0.213788,0.211651,0.002137,8
2,5.024284,0.88617,0.164593,0.002689,0.1,{'C': 0.1},0.656969,0.65722,0.657094,0.000125,5,0.257878,0.256852,0.257365,0.000513,6
3,30.258925,14.22,0.164515,0.000152,0.5,{'C': 0.5},0.656969,0.65728,0.657125,0.000156,4,0.257984,0.257434,0.257709,0.000275,5
4,57.85276,22.063451,0.164197,0.000744,1.0,{'C': 1.0},0.656969,0.657286,0.657127,0.000158,1,0.257984,0.257455,0.25772,0.000264,2
5,195.19195,24.678682,0.164239,0.000759,5.0,{'C': 5},0.656963,0.657286,0.657125,0.000161,2,0.257963,0.257455,0.257709,0.000254,3
6,206.218724,11.956155,0.162653,0.000535,10.0,{'C': 10},0.656963,0.657286,0.657125,0.000161,2,0.257963,0.257455,0.257709,0.000254,3
7,194.722654,1.019526,0.139215,0.026722,50.0,{'C': 50},0.657024,0.657137,0.657081,5.7e-05,6,0.258843,0.258591,0.258717,0.000126,1


In [41]:
print("Grid searched best LinearSVC_unigrams params: {}".format(best_LinearSVC_params_unigrams.best_params_))
print("Grid searched best LinearSVC_bigrams params: {}".format(best_LinearSVC_params_bigrams.best_params_))
print("Grid searched best LinearSVC_trigrams params: {}".format(best_LinearSVC_params_trigrams.best_params_))


Grid searched best LinearSVC_unigrams params: {'C': 0.5}
Grid searched best LinearSVC_bigrams params: {'C': 0.1}
Grid searched best LinearSVC_trigrams params: {'C': 1.0}


In [42]:
# Training SVM with parameters mentioned in the given research-paper

"""
Parameters:
  > C (for Unigram features): 0.5
  > C (for Biigram features): 0.1
  [LinearSVM for trigram is same as executed before]
"""

svm_classifier_unigrams = LinearSVC(C=0.5, random_state=0, max_iter=10000)
svm_classifier_bigrams  = LinearSVC(C=0.1, random_state=0, max_iter=10000)

svm_classifier_unigrams.fit(train_val_unigrams, labels_train_val)
svm_classifier_bigrams.fit(train_val_bigrams, labels_train_val)

pred_unigrams = svm_classifier_unigrams.predict(test_unigrams)
pred_bigrams = svm_classifier_bigrams.predict(test_bigrams)

acc_unigrams = accuracy_score(labels_test, pred_unigrams)
acc_bigrams = accuracy_score(labels_test, pred_bigrams)

f1_score_unigrams = f1_score(labels_test, pred_unigrams)
f1_score_bigrams = f1_score(labels_test, pred_bigrams)


In [43]:
print("Accuracy attained by using SVM on Unigrams: {}".format(acc_unigrams*100))
print("Accuracy attained by using SVM on Bigrams: {}".format(acc_bigrams*100))
print("Accuracy attained by using SVM on Trigrams: {}".format(acc_trigrams*100))
print()
print("F1-Score by using Linear SVM on Unigrams: {}".format(f1_score_unigrams*100))
print("F1-Score by using Linear SVM on Bigrams: {}".format(f1_score_bigrams*100))
print("F1-Score by using Linear SVM on Trigrams: {}".format(f1_score_trigrams*100))


Accuracy attained by using SVM on Unigrams: 68.73531375992481
Accuracy attained by using SVM on Bigrams: 66.58834005293231
Accuracy attained by using SVM on Trigrams: 65.05973434910584

F1-Score by using Linear SVM on Unigrams: 43.13478495591146
F1-Score by using Linear SVM on Bigrams: 32.4194516710026
F1-Score by using Linear SVM on Trigrams: 24.81371087928465


### Testing the effect of training the SVM model with N-gram features of individual questions and combining them to create a single feature vector.

In [None]:
# Testing effect on individual question features

train_q1_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "train_q1.npz")
train_q2_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "train_q2.npz")
val_q1_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "val_q1.npz")
val_q2_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "val_q2.npz")
test_q1_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "test_q1.npz")
test_q2_unigrams = sparse.load_npz(UNIGRAM_SCRAP + "test_q2.npz")

train_q1_bigrams = sparse.load_npz(BIGRAM_SCRAP + "train_q1.npz")
train_q2_bigrams = sparse.load_npz(BIGRAM_SCRAP + "train_q2.npz")
val_q1_bigrams = sparse.load_npz(BIGRAM_SCRAP + "val_q1.npz")
val_q2_bigrams = sparse.load_npz(BIGRAM_SCRAP + "val_q2.npz")
test_q1_bigrams = sparse.load_npz(BIGRAM_SCRAP + "test_q1.npz")
test_q2_bigrams = sparse.load_npz(BIGRAM_SCRAP + "test_q2.npz")

train_q1_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "train_q1.npz")
train_q2_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "train_q2.npz")
val_q1_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "val_q1.npz")
val_q2_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "val_q2.npz")
test_q1_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "test_q1.npz")
test_q2_trigrams = sparse.load_npz(TRIGRAM_SCRAP + "test_q2.npz")

# Combining individual train features to create train, val and test vectors

train_unigrams_combined = sparse.hstack((train_q1_unigrams, train_q2_unigrams))
val_unigrams_combined = sparse.hstack((val_q1_unigrams, val_q2_unigrams))
test_unigrams_combined = sparse.hstack((test_q1_unigrams, test_q2_unigrams))

train_bigrams_combined = sparse.hstack((train_q1_bigrams, train_q2_bigrams))
val_bigrams_combined = sparse.hstack((val_q1_bigrams, val_q2_bigrams))
test_bigrams_combined = sparse.hstack((test_q1_bigrams, test_q2_bigrams))

train_trigrams_combined = sparse.hstack((train_q1_trigrams, train_q2_trigrams))
val_trigrams_combined = sparse.hstack((val_q1_trigrams, val_q2_trigrams))
test_trigrams_combined = sparse.hstack((test_q1_trigrams, test_q2_trigrams))

# For training SVM, train and val can be combined
train_val_unigrams_scrap = sparse.vstack((train_unigrams_combined, val_unigrams_combined))
train_val_bigrams_scrap = sparse.vstack((train_bigrams_combined, val_bigrams_combined))
train_val_trigrams_scrap = sparse.vstack((train_trigrams_combined, val_trigrams_combined))

# Storing labels for train_val and test
train_val_labels = np.array(train_data['is_duplicate'].tolist() + val_data['is_duplicate'].tolist())
test_labels = test_data['is_duplicate'].values

print("Combined individual question features")


Combined individual question features


In [None]:
svm_classifier_unigrams_scrap = LinearSVC(C=1.0, random_state=0, max_iter=10000)
svm_classifier_bigrams_scrap  = LinearSVC(C=1.0, random_state=0, max_iter=10000)
svm_classifier_trigrams_scrap = LinearSVC(C=1.0, random_state=0, max_iter=10000)

svm_classifier_unigrams_scrap.fit(train_val_unigrams_scrap, train_val_labels)
svm_classifier_bigrams_scrap.fit(train_val_bigrams_scrap, train_val_labels)
svm_classifier_trigrams_scrap.fit(train_val_trigrams_scrap, train_val_labels)

pred_unigrams_scrap = svm_classifier_unigrams_scrap.predict(test_unigrams_combined)
pred_bigrams_scrap = svm_classifier_bigrams_scrap.predict(test_bigrams_combined)
pred_trigrams_scrap = svm_classifier_trigrams_scrap.predict(test_trigrams_combined)

acc_unigrams_scrap = accuracy_score(test_labels, pred_unigrams_scrap)
acc_bigrams_scrap = accuracy_score(test_labels, pred_bigrams_scrap)
acc_trigrams_scrap = accuracy_score(test_labels, pred_trigrams_scrap)

f1_score_unigrams_scrap = f1_score(test_labels, pred_unigrams_scrap)
f1_score_bigrams_scrap = f1_score(test_labels, pred_bigrams_scrap)
f1_score_trigrams_scrap = f1_score(test_labels, pred_trigrams_scrap)


In [None]:
# Printing the accuracy and f1-scores of individual SVM-models trained on Unigrams, Bigrams and Trigrams

print("Accuracy attained by using SVM on Unigrams: {}".format(acc_unigrams_scrap*100))
print("Accuracy attained by using SVM on Bigrams: {}".format(acc_bigrams_scrap*100))
print("Accuracy attained by using SVM on Trigrams: {}".format(acc_trigrams_scrap*100))
print()
print("F1-Score by using Linear SVM on Unigrams: {}".format(f1_score_unigrams_scrap*100))
print("F1-Score by using Linear SVM on Bigrams: {}".format(f1_score_bigrams_scrap*100))
print("F1-Score by using Linear SVM on Trigrams: {}".format(f1_score_trigrams_scrap*100))


Accuracy attained by using SVM on Unigrams: 68.75510153602612
Accuracy attained by using SVM on Bigrams: 66.74911573375547
Accuracy attained by using SVM on Trigrams: 65.06468129313117

F1-Score by using Linear SVM on Unigrams: 43.460746575955596
F1-Score by using Linear SVM on Bigrams: 33.3878400475695
F1-Score by using Linear SVM on Trigrams: 25.198601843025102


In [None]:
# f = open(SAVED_MODEL_PATH + "linear_SVM_models_scrap.pkl", "wb")
# pickle.dump(svm_classifier_unigrams_scrap, f)
# pickle.dump(svm_classifier_bigrams_scrap, f)
# pickle.dump(svm_classifier_trigrams_scrap, f)
# f.close()

f = open(SAVED_MODEL_PATH + "linear_SVM_models.pkl", "rb")
test1 = pickle.load(f)
test2 = pickle.load(f)
test3 = pickle.load(f)
f.close()

In [None]:
# Apply grid-search on SVM hyperparamters

svm_parameters = {'C':[0.005, 0.001, 0.1, 0.5, 1.0, 10, 50], 'kernel':['rbf', 'poly', 'sigmoid']}
SVM_classifier = SVC(max_iter = 3500, random_state = 0)

splinter = StratifiedKFold(n_splits=5)
best_SVM_params = GridSearchCV(SVM_classifier, svm_parameters, scoring=["accuracy", "f1"], n_jobs = -1, cv=splinter, refit="accuracy", verbose=1)
best_SVM_params.fit(train_val_unigrams_scrap, train_val_labels)


Fitting 5 folds for each of 21 candidates, totalling 105 fits


In [None]:
f = open(SAVED_MODEL_PATH + "grid_searched_SVM_unigrams_params.pkl", "wb")
pickle.dump(best_SVM_params, f)
f.close()

# --------------------------------------------- END ---------------------------------------------