In [None]:
!pip install wandb
!git clone https://github.com/cambridgeltl/mirror-bert.git
!mv mirror-bert mirror
!pip install pytorch-metric-learning
!pip install transformers
!pip install -U sentence-transformers


Collecting wandb
  Downloading wandb-0.12.7-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 2.8 MB/s 
[?25hCollecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 51.2 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.1 MB/s 
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.0-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 45.2 MB/s 
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting 

In [None]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# 1. Quora Data

##1.1 Cleaning (One time)

In [None]:
data = pd.read_csv("train.csv")
data.rename(columns={'is_duplicate':'y'}, inplace=True)
data.drop_duplicates(subset=['question1', 'question2'], inplace=True)
data['y'].value_counts()

0    255027
1    149263
Name: y, dtype: int64

In [None]:
data.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,y
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0


In [None]:
sampled_data = data[['question1', 'question2', 'y']].sample(n=40000)

In [None]:
sampled_data.to_csv("quora.csv", index=False)

## 1.2 Load Data & Embed (Start directly from here)

In [None]:
data = pd.read_csv("quora.csv")

In [None]:
data.head(2)

Unnamed: 0,question1,question2,y
0,"According to scientists, does God exist?","What is probability of existence of God, accor...",0
1,Which is the best online test series for gate ...,Which is best online test series for gate-meta...,0


## 1.3 Model and techniques

### 1.3.1 MIRROR BERT

In [None]:
from mirror.src.mirror_bert import MirrorBERT

In [None]:
def embed(data, model_name = "cambridgeltl/mirror-roberta-base-sentence-drophead"):
    mirror_bert = MirrorBERT()
    mirror_bert.load_model(path=model_name, use_cuda=True)
    embeddings = mirror_bert.get_embeddings(data, agg_mode="cls").cpu().detach().numpy()
    return embeddings

In [None]:
q1 = embed(data['question1'])
q2 = embed(data['question2'])
X = np.hstack((q1,q2))
y = data['y'].values

Downloading:   0%|          | 0.00/278 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

100%|██████████| 40/40 [03:27<00:00,  5.19s/it]
100%|██████████| 40/40 [03:27<00:00,  5.20s/it]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.1.1 MIRROR BERT (Logistic Regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=0.01, class_weight='balanced')

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7202020872969459
Test ROCAUC 0.6914141834141834


#### 1.3.1.2 MIRROR BERT (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=3)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8474109094243983
Test ROCAUC 0.6946694386694386


#### 1.3.1.3 MIRROR BERT (XGBoost)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

In [None]:
dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7387265997677038
Test ROCAUC 0.7204575652575653


### 1.3.2 Universal Sentence Encoder (USE)

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 700.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
q1 = np.zeros((data.shape[0], 512))
for i, text in enumerate(data['question1']):
    q1[i] = embed([text]).numpy()
    # if i%200==0:
    #     print(i)
q2 = np.zeros((data.shape[0], 512))
for i, text in enumerate(data['question2']):
    q2[i] = embed([text]).numpy()
    # if i%200==0:
    #     print(i)

X = np.hstack((q1,q2))
y = data['y'].values
y = data['y'].values

In [None]:
del model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.2.1 USE (Logistic regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=1, class_weight='balanced')

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7149337878426202
Test ROCAUC 0.6897777777777777


#### 1.3.2.2 USE (Random Forest regression)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8126636617707856
Test ROCAUC 0.7139477939477938


#### 1.3.2.3 USE (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7559868628773067
Test ROCAUC 0.7411350427350427


### 1.3.3 Sentence BERT

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed(sentences):
    return model.encode(sentences)

In [None]:
q1 = embed(data['question1'])
q2 = embed(data['question2'])
X = np.hstack((q1,q2))
y = data['y'].values
del model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.3.1 Sentence BERT (Logistic regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=4, estimator=LogisticRegression(class_weight='balanced'),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                               10000]},
             scoring='roc_auc')

In [None]:
print(model.best_estimator_)
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

LogisticRegression(C=0.1, class_weight='balanced')
Train ROCAUC:  0.6910346957618987
Test ROCAUC 0.6683418803418804


#### 1.3.3.2 Sentence BERT (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8369912100193132
Test ROCAUC 0.6992838992838992


#### 1.3.3.3 Sentence BERT (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7681107688670321
Test ROCAUC 0.752744214984215


<tr>
    <th> Embedding </th>
    <th> Algorithm </th>
    <th> ROCAUC </th>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> Logistic Regression </td>
    <td> 0.6914 </td>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> Random Forest </td>
    <td> 0.6946 </td>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> XGBOOST </td>
    <td> 0.7204 </td>
</tr>

<tr>
    <td> USE </td>
    <td> Logistic Regression </td>
    <td> 0.6187 </td>
</tr>
<tr>
    <td> USE </td>
    <td> Random Forest </td>
    <td> 0.7139 </td>
</tr>
<tr>
    <td> USE </td>
    <td> XGBOOST </td>
    <td> 0.7411 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> Logistic Regression </td>
    <td> 0.6683 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> Random Forest </td>
    <td> 0.6993 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> XGBOOST </td>
    <td> 0.7527 </td>
</tr>