In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
cd /content/gdrive/MyDrive/SMAI_Project_2021

/content/gdrive/.shortcut-targets-by-id/1AtQJMrriX3WOs5N0Pda_qEvUQ7x_w36i/SMAI_Project_2021


# Import all Essential Libraries.

In [3]:
# All external libraries used throughout the notbook are listed here

from tqdm import tqdm
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier as logreg_w_sgd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import pickle
import numpy as np
import pandas as pd
import nltk

In [4]:
ROOT = "/content/gdrive/MyDrive/SMAI_Project_2021/"

SAVED_MODEL_PATH = ROOT + "models/"

N_GRAM_FEAT_PATH = ROOT + "ngram_features/"
TREE_FEAT_PATH = ROOT + "tree_features/"

UNIGRAM_PATH = N_GRAM_FEAT_PATH + "unigrams/"
BIGRAM_PATH = N_GRAM_FEAT_PATH + "bigrams/"
TRIGRAM_PATH = N_GRAM_FEAT_PATH + "trigrams/"
UNIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/unigrams/"
BIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/bigrams/"
TRIGRAM_SCRAP = N_GRAM_FEAT_PATH + "scrap/trigrams/"

# Tree-Feature Engineering

In [None]:
!mkdir tree_features

In [None]:
total_sentences = train['question1'].tolist() + train['question2'].tolist()
max_len_sent = len(max(total_sentences, key=lambda x : len(word_tokenize(str(x)))).split())

max_len_sent

237

In [None]:
stop_words = stopwords.words('english')

class CreateTreeFeatures():
    def __init__(self):
        pass
    
    def get_length_feature(self, question1, question2, vector):
        l1 = len(question1.split())
        l2 = len(question2.split())
        vector.append(l1)
        vector.append(l2)
        vector.append(l1-l2)
        vector.append(l1/l2)

        return vector

    def get_count_lowercased(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1) if word.islower()]
        q2 = [word for word in word_tokenize(question2) if word.islower()]

        count = len(set(q1) & set(q2))
        
        vector.append(count)
        vector.append(count/237)

        return vector
    
    def get_count_lowercased_without_stopwords(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1) if word.islower() and word not in stop_words]
        q2 = [word for word in word_tokenize(question2) if word.islower() and word not in stop_words]

        count = len(set(q1) & set(q2))
        
        vector.append(count)
        vector.append(count/237)

        return vector
    
    def same_last_words(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1)]
        q2 = [word for word in word_tokenize(question2)]

        vector.append(q1[-1]==q2[-1])

        return vector
    
    def get_count_uppercased(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1) if word.isupper()]
        q2 = [word for word in word_tokenize(question2) if word.isupper()]

        count = len(set(q1) & set(q2))
        
        vector.append(count)
        vector.append(count/237)

        return vector
    
    def same_prefix(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1)]
        q2 = [word for word in word_tokenize(question2)]

        vector.append(sum([q1[:i]==q2[:i] for i in range(3)]))
        vector.append(sum([q1[:i]==q2[:i] for i in range(3)])/237)
        vector.append(sum([q1[:i]==q2[:i] for i in range(4)]))
        vector.append(sum([q1[:i]==q2[:i] for i in range(4)])/237)
        vector.append(sum([q1[:i]==q2[:i] for i in range(5)]))
        vector.append(sum([q1[:i]==q2[:i] for i in range(5)])/237)
        vector.append(sum([q1[:i]==q2[:i] for i in range(6)]))
        vector.append(sum([q1[:i]==q2[:i] for i in range(6)])/237)

        return vector
    
    def misc_features(self, question1, question2, vector):
        q1 = [word for word in word_tokenize(question1)]
        q2 = [word for word in word_tokenize(question2)]

        vector.append('not' in q1)
        vector.append('not' in q2)
        vector.append('not' in q1 and 'not' in q2)

        q11 = [word for word in q1 if word.isdigit()]
        q21 = [word for word in word_tokenize(question2) if word.isdigit()]
        vector.append(len(set(q11) and set(q21)))

        q12 = [ps.stem(word) for word in q1]
        q22 = [ps.stem(word) for word in q2]

        vector.append(len(set(q12) & set(q22)))
        vector.append(len(set(q12) & set(q22))/237)

        return vector
    
    def create_feature_vector(self, question1, question2):
        vector = []        
        vector = self.get_length_feature(question1, question2, vector)
        vector = self.get_count_lowercased(question1, question2, vector)
        vector = self.get_count_lowercased_without_stopwords(question1, question2, vector)
        vector = self.same_last_words(question1, question2, vector)
        vector = self.get_count_uppercased(question1, question2, vector)
        vector = self.same_prefix(question1, question2, vector)
        vector = self.misc_features(question1, question2, vector)

        return vector

In [None]:
create_tree_feature = CreateTreeFeatures()

In [None]:
!ls

data		quora_question_pairs.zip  test_data.csv   tree_features
models		smai.ipynb		  train.csv	  val_data.csv
ngram_features	test.csv		  train_data.csv


In [None]:
feature_vector = []
for i, row in tqdm(train_data.iterrows()):
    feature_vector.append(create_tree_feature.create_feature_vector(str(row['question1_preprocessed']), str(row['question2_preprocessed'])))

with open('tree_features/train.pickle', 'wb') as f:
    pickle.dump(feature_vector, f)

In [None]:
feature_vector = []
for i, row in tqdm(val_data.iterrows()):
    feature_vector.append(create_tree_feature.create_feature_vector(str(row['question1_preprocessed']), str(row['question2_preprocessed'])))

with open('tree_features/val.pickle', 'wb') as f:
    pickle.dump(feature_vector, f)

80858it [03:29, 386.61it/s]


In [None]:
feature_vector = []
for i, row in tqdm(test_data.iterrows()):
    feature_vector.append(create_tree_feature.create_feature_vector(str(row['question1_preprocessed']), str(row['question2_preprocessed'])))

with open('tree_features/test.pickle', 'wb') as f:
    pickle.dump(feature_vector, f)

40429it [01:44, 387.83it/s]


# Import the DataSet.

In [5]:
# train_data.to_csv('train_data.csv', index=False)
# val_data.to_csv('val_data.csv', index=False)
# test_data.to_csv('test_data.csv', index=False)
#with open(TREE_FEAT_PATH + "train.pickle", "rb") as f_train:

train_data = pd.DataFrame(pd.read_pickle(TREE_FEAT_PATH + "train.pickle"))
val_data = pd.DataFrame(pd.read_pickle(TREE_FEAT_PATH + "val.pickle"))
test_data = pd.DataFrame(pd.read_pickle(TREE_FEAT_PATH + "test.pickle"))

# Combine both Training and Validation data.
train_val_data = pd.concat([train_data, val_data], axis = 0, ignore_index=True)

In [6]:
# Import labels...
train_labels = pd.read_csv(ROOT + "train_data.csv", index_col=0).is_duplicate.values
val_labels = pd.read_csv(ROOT + "val_data.csv", index_col=0).is_duplicate.values
test_labels = pd.read_csv(ROOT + "test_data.csv", index_col=0).is_duplicate.values

# Combine train and val labels.
train_val_labels = np.concatenate((train_labels, val_labels))

# Replicate Paper Results

## Decision Tree training...

In [10]:
# Create an instance of decision Tree..
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5)
# Train the DT.
dt = dt.fit(train_val_data, train_val_labels)

In [11]:
# Predict on vals.
pred = dt.predict(test_data)
# Accuracy and F1 score.
f1_dt = f1_score(test_labels, pred)
acc_dt = accuracy_score(test_labels, pred)
# Print the scores.
print("Accuracy-Score for Decision Tree = %.3f"%acc_dt)
print("F1-Score for Decision Tree = %.3f"%f1_dt)

Accuracy-Score for Decision Tree = 0.720
F1-Score for Decision Tree = 0.634


## Random Forest Training.

In [None]:
# Create an instance of decision Tree..
rf = RandomForestClassifier(max_depth=None, n_estimators = 50, min_samples_leaf=5, n_jobs = -1)
# Train the DT.
rf = rf.fit(train_val_data, train_val_labels)

In [None]:
# Predict on vals.
pred_rf = rf.predict(test_data)
# Accuracy and F1 score.
f1_rf = f1_score(test_labels, pred_rf)
acc_rf = accuracy_score(test_labels, pred_rf)
# Print the scores.
print("Accuracy-Score for Random Forest Classifier = %.3f"%acc_rf)
print("F1-Score for Random Forest Classifier = %.3f"%f1_rf)

Accuracy-Score for Random Forest Classifier = 0.735
F1-Score for Random Forest Classifier = 0.646


## Gradient Boosting Trees.

In [None]:
# Create an instance of decision Tree..
gb = GradientBoostingClassifier(max_depth=4, n_estimators = 500)
# Train the DT.
gb = gb.fit(train_val_data, train_val_labels)

In [None]:
# Predict on vals.
pred_gb = gb.predict(test_data)
# Accuracy and F1 score.
f1_gb = f1_score(test_labels, pred_gb)
acc_gb = accuracy_score(test_labels, pred_gb)
# Print the scores.
print("Accuracy-Score for Gradient Boosting Tree = %.3f"%acc_gb)
print("F1-Score for Gradient Boosting Tree = %.3f"%f1_gb)

Accuracy-Score for Decision Tree = 0.733
F1-Score for Decision Tree = 0.646


# Applying Cross-Validation...

## Decision Tree.

In [17]:
# Apply gridSearch CV.
params_dt = {"criterion":["gini", "entropy"],
             "min_samples_leaf":[1, 4, 16],
             "max_depth":[10, 25, 50],
             "min_samples_split":[64, 128, 256]}# "max_features":["sqrt", "log2"], ,
dt_ = DecisionTreeClassifier(max_features = "log2")
# Cv -splitter
cv_splitter = StratifiedKFold(n_splits=5, shuffle=False)
dt_cv = GridSearchCV(dt_, params_dt, scoring = ["accuracy", "f1"], n_jobs = -1, cv = cv_splitter, refit="accuracy")
dt_cv.fit(train_val_data, train_val_labels)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(max_features='log2'), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 25, 50],
                         'min_samples_leaf': [1, 4, 16],
                         'min_samples_split': [64, 128, 256]},
             refit='accuracy', scoring=['accuracy', 'f1'])

In [None]:
# Show the Results...
dt_res = pd.DataFrame.from_dict(dt_cv.cv_results_)
#dt_res

In [19]:
# Show the Best params, and Best Score...
print(f"CV for Decision Tree:\nBest Parameters: {dt_cv.best_params_}\nBest Scores: {dt_cv.best_score_}")

CV for Decision Tree:
Best Parameters: {'criterion': 'gini', 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 128}
Best Scores: 0.7198628061714372


In [20]:
# Print the best F1 and Accuracy score.
print("F1 score: ", np.max(dt_res["mean_test_f1"]), "Accuracy Score", np.max(dt_res["mean_test_accuracy"]))

F1 score:  0.6220847584857725 Accuracy Score 0.7198628061714372


In [21]:
# Now train the model with optimal hyper-parameters and obtain the result on test data-set.
dt_optimal = DecisionTreeClassifier(max_features = "log2", 
                                    criterion = dt_cv.best_params_["criterion"],
                                    max_depth = dt_cv.best_params_["max_depth"],
                                    min_samples_split = dt_cv.best_params_["min_samples_split"],
                                    min_samples_leaf = dt_cv.best_params_["min_samples_leaf"])
# Train the DT.
dt_optimal = dt_optimal.fit(train_val_data, train_val_labels)
# Predict on vals.
pred = dt_optimal.predict(test_data)
# Accuracy and F1 score.
f1_dt_optimal = f1_score(test_labels, pred)
acc_dt_optimal = accuracy_score(test_labels, pred)
# Print the scores.
print("Accuracy-Score for Decision Tree with Optimal Params = %.3f"%acc_dt_optimal)
print("F1-Score for Decision Tree with Optimal Params = %.3f"%f1_dt_optimal)

Accuracy-Score for Decision Tree with Optimal Params = 0.725
F1-Score for Decision Tree with Optimal Params = 0.634


## Random Forest.

In [28]:
# Apply gridSearch CV.
params_rf = {"n_estimators":[200], 
             "criterion": ["entropy"], 
             "max_depth":[25],
             "min_samples_leaf":[1, 4],
             "min_samples_split":[32, 64, 128]}
             # "min_samples_split":[2, 6, 16], "min_samples_leaf":[1, 4, 16],
rf_ = RandomForestClassifier(n_jobs=-1, min_samples_leaf = 4, max_features = "log2")
# Cv -splitter
cv_splitter = StratifiedKFold(n_splits=5, shuffle=False)
rf_cv = GridSearchCV(rf_, params_rf, scoring = ["accuracy", "f1"], n_jobs = -1, cv = cv_splitter, refit="accuracy")
rf_cv.fit(train_val_data, train_val_labels)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(max_features='log2',
                                              min_samples_leaf=4, n_jobs=-1),
             n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [25],
                         'min_samples_leaf': [1, 4],
                         'min_samples_split': [32, 64, 128],
                         'n_estimators': [200]},
             refit='accuracy', scoring=['accuracy', 'f1'])

In [29]:
# Show the Results...
rf_res = pd.DataFrame.from_dict(rf_cv.cv_results_)
#rf_res

In [30]:
# Show the Best params, and Best Score...
print(f"CV for Random Forest:\nBest Parameters: {rf_cv.best_params_}\nBest Scores: {rf_cv.best_score_}")

CV for Random Forest:
Best Parameters: {'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 64, 'n_estimators': 200}
Best Scores: 0.731257272651476


In [31]:
# Print the best F1 and Accuracy score.
print("F1 score: ", np.max(rf_res["mean_test_f1"]), "Accuracy Score", np.max(rf_res["mean_test_accuracy"]))

F1 score:  0.6422394175643088 Accuracy Score 0.731257272651476


In [32]:
# Now train the model with optimal hyper-parameters and obtain the result on test data-set.
rf_optimal = RandomForestClassifier(n_jobs=-1, 
                                    min_samples_leaf = rf_cv.best_params_["min_samples_leaf"],
                                    min_samples_split = rf_cv.best_params_["min_samples_split"],
                                    max_features = "log2", 
                                    criterion = rf_cv.best_params_["criterion"],
                                    max_depth = rf_cv.best_params_["max_depth"],
                                    n_estimators = rf_cv.best_params_["n_estimators"])
# Train the DT.
rf_optimal = rf_optimal.fit(train_val_data, train_val_labels)
# Predict on vals.
pred = rf_optimal.predict(test_data)
# Accuracy and F1 score.
f1_rf_optimal = f1_score(test_labels, pred)
acc_rf_optimal = accuracy_score(test_labels, pred)
# Print the scores.
print("Accuracy-Score for Random Forest Classifier with Optimal Params = %.3f"%acc_rf_optimal)
print("F1-Score for Random Forest Classifier with Optimal Params = %.3f"%f1_rf_optimal)

Accuracy-Score for Random Forest Classifier with Optimal Params = 0.737
F1-Score for Random Forest Classifier with Optimal Params = 0.651
