In [2]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv("train_features.csv")

In [23]:
df.head(2)

Unnamed: 0,is_duplicate,q1_stem,q2_stem,total_num_unique_words,total_diff_words,total_words,q1_words,q2_words
0,0,"['step', 'step', 'guid', 'invest', 'share', 'm...","['step', 'step', 'guid', 'invest', 'share', 'm...",8,1,13,7,6
1,0,"['stori', 'kohinoor', 'koh-i-noor', 'diamond']","['would', 'happen', 'indian', 'govern', 'stole...",11,5,13,4,9


In [4]:
X = df[['q1_words','q2_words','total_num_unique_words','total_diff_words','total_words']]
y = df.is_duplicate

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [5]:
NB = MultinomialNB()
NB.fit(X_train,y_train)

NB_ypred = NB.predict(X_test)

In [6]:
print('accuracy score: ', accuracy_score(y_test, NB_ypred))
print('classification report: ', classification_report(y_test, NB_ypred))

accuracy score:  0.6057285611813302
classification report:                precision    recall  f1-score   support

           0       0.64      0.84      0.73     50803
           1       0.44      0.21      0.28     30055

    accuracy                           0.61     80858
   macro avg       0.54      0.52      0.51     80858
weighted avg       0.57      0.61      0.56     80858



In [7]:
LR = LogisticRegression()
LR.fit(X_train,y_train)

LR_ypred = LR.predict(X_test)

In [8]:
print('accuracy score: ', accuracy_score(y_test, LR_ypred))
print('classification report: ', classification_report(y_test, LR_ypred))

accuracy score:  0.6548393479927774
classification report:                precision    recall  f1-score   support

           0       0.70      0.80      0.74     50803
           1       0.55      0.41      0.47     30055

    accuracy                           0.65     80858
   macro avg       0.62      0.60      0.61     80858
weighted avg       0.64      0.65      0.64     80858



In [9]:
RFC = RandomForestClassifier()
RFC.fit(X_train,y_train)

RFC_ypred = RFC.predict(X_test)

In [10]:
print('accuracy score: ', accuracy_score(y_test, RFC_ypred))
print('classification report: ', classification_report(y_test, RFC_ypred))

accuracy score:  0.7004378045462416
classification report:                precision    recall  f1-score   support

           0       0.75      0.78      0.77     50803
           1       0.61      0.56      0.58     30055

    accuracy                           0.70     80858
   macro avg       0.68      0.67      0.67     80858
weighted avg       0.70      0.70      0.70     80858



In [11]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train,y_train)

DTC_ypred = DTC.predict(X_test)

In [12]:
print('accuracy score: ', accuracy_score(y_test, DTC_ypred))
print('classification report: ', classification_report(y_test, DTC_ypred))

accuracy score:  0.7004501719063049
classification report:                precision    recall  f1-score   support

           0       0.75      0.78      0.77     50803
           1       0.61      0.56      0.58     30055

    accuracy                           0.70     80858
   macro avg       0.68      0.67      0.67     80858
weighted avg       0.70      0.70      0.70     80858



In [13]:
KNC = KNeighborsClassifier()
KNC.fit(X_train,y_train)

KNC_ypred = KNC.predict(X_test)

In [14]:
print('accuracy score: ', accuracy_score(y_test, KNC_ypred))
print('classification report: ', classification_report(y_test, KNC_ypred))

accuracy score:  0.6622844987508967
classification report:                precision    recall  f1-score   support

           0       0.72      0.76      0.74     50803
           1       0.55      0.49      0.52     30055

    accuracy                           0.66     80858
   macro avg       0.63      0.63      0.63     80858
weighted avg       0.66      0.66      0.66     80858



In [15]:
GBC = GradientBoostingClassifier()
GBC.fit(X_train,y_train)

GBC_ypred = GBC.predict(X_test)

In [16]:
print('accuracy score: ', accuracy_score(y_test, GBC_ypred))
print('classification report: ', classification_report(y_test, GBC_ypred))

accuracy score:  0.6865863612753221
classification report:                precision    recall  f1-score   support

           0       0.74      0.77      0.76     50803
           1       0.58      0.54      0.56     30055

    accuracy                           0.69     80858
   macro avg       0.66      0.66      0.66     80858
weighted avg       0.68      0.69      0.68     80858



In [18]:
param1 = {}
param1['classifier__n_estimators'] = [10, 50, 100, 250]
param1['classifier__max_depth'] = [5, 10, 20]
param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param1['classifier'] = [RFC]


param2 = {}
param2['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param2['classifier__penalty'] = ['l1', 'l2']
param2['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param2['classifier'] = [LR]

param3 = {}
param3['classifier__max_depth'] = [5,10,25,None]
param3['classifier__min_samples_split'] = [2,5,10]
param3['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param3['classifier'] = [DTC]

param4 = {}
param4['classifier__n_neighbors'] = [2,5,10,25,50]
param4['classifier'] = [KNC]

param5 = {}
param5['classifier__alpha'] = [10**0, 10**1, 10**2]
param5['classifier'] = [NB]

param6 = {}
param6['classifier__n_estimators'] = [10, 50, 100, 250]
param6['classifier__max_depth'] = [5, 10, 20]
param6['classifier'] = [GBC]


pipeline = Pipeline([('classifier', RFC)])
params = [param1, param2, param3, param4, param5, param6]

In [19]:
%%time

gs = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)

 0.76981899 0.77014581 0.77046016 0.77079101 0.77088811 0.77092024
 0.71806594 0.72347641 0.7234869  0.72371658 0.76561315 0.76725728
 0.76753729 0.76702517 0.77022379 0.77057343 0.7705791  0.77062007
 0.71758197 0.72069303 0.72120916 0.72147541 0.76450796 0.76503373
 0.76437974 0.76455691 0.76990021 0.77010025 0.77025547 0.77023774
 0.71331786 0.71095053 0.71420555 0.71373361 0.75883252 0.75962143
 0.75921833 0.75953105 0.76947127 0.76949211 0.76955377 0.76963159
        nan 0.71840669        nan 0.72322851        nan 0.72445969
        nan 0.72524368        nan 0.71841175        nan 0.72323431
        nan 0.72446189        nan 0.72524663        nan 0.71841013
        nan 0.72323351        nan 0.72446146        nan 0.72524717
        nan 0.71841011        nan 0.72323351        nan 0.72446146
        nan 0.72524717        nan 0.71841011        nan 0.72323351
        nan 0.72446146        nan 0.72524717 0.72688237 0.72688237
 0.72688237 0.77060091 0.77061796 0.77063521 0.76976534 0.7700

Wall time: 11min 3s


In [20]:
gs.best_params_

{'classifier': GradientBoostingClassifier(max_depth=5, n_estimators=250),
 'classifier__max_depth': 5,
 'classifier__n_estimators': 250}

In [21]:
gs.best_score_

0.7723212549509894

In [22]:
gs.cv_results_

{'mean_fit_time': array([1.82916824e+00, 1.17248334e+01, 2.34571671e+01, 5.77184974e+01,
        3.85600034e+00, 1.64011662e+01, 3.26340003e+01, 7.91911670e+01,
        3.42050099e+00, 1.67966673e+01, 3.37265011e+01, 8.48463342e+01,
        2.39316614e+00, 1.14434985e+01, 2.29708350e+01, 5.73561664e+01,
        3.80916564e+00, 1.68343339e+01, 3.31998324e+01, 8.19255008e+01,
        3.81616863e+00, 1.74746683e+01, 3.63480008e+01, 8.89628331e+01,
        2.54399959e+00, 1.14854984e+01, 2.28663358e+01, 5.68763334e+01,
        3.53149923e+00, 1.69851682e+01, 3.35463324e+01, 8.48271674e+01,
        3.89416575e+00, 1.82795006e+01, 3.67583334e+01, 9.79660009e+01,
        2.64283395e+00, 1.12780000e+01, 2.23443333e+01, 5.55850005e+01,
        3.61683353e+00, 1.82113334e+01, 3.41623310e+01, 8.47721650e+01,
        4.21716499e+00, 1.91174999e+01, 4.38704989e+01, 7.75226662e+01,
        4.11672592e-02, 9.73500331e-01, 7.11692969e-02, 1.16099707e+00,
        4.78324890e-02, 1.13050302e+00, 5.98319

{'loss': ['deviance', 'exponential'],
 'classifier__n_estimators': [200, 250, 350],
 'classifier__max_depth': [2, 3, 4, 5, 6, 7],
 'classifier__min_samples_split': [1, 2, 3, 4, 5],
 'classifier__min_samples_leaf': [1, 2, 3],
 'classifier': [GradientBoostingClassifier(max_depth=5, n_estimators=250)]}

In [13]:
param = {}
param['loss'] = ['deviance', 'exponential']
param['learning_rate'] = [0.1,0.2]
param['n_estimators'] = [200,250]
param['max_depth'] = [2,3,4]
param['min_samples_split'] = [2,3]
param['min_samples_leaf'] = [1,2]

GBC = GradientBoostingClassifier()
GB_GS = GridSearchCV(GBC, param, cv=3, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)

Wall time: 30min 38s


In [14]:
GB_GS.best_score_

0.772247992803298

In [15]:
GB_GS.best_params_

{'learning_rate': 0.2,
 'loss': 'deviance',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 250}