In [None]:
!pip install wandb
!git clone https://github.com/cambridgeltl/mirror-bert.git
!mv mirror-bert mirror
!pip install pytorch-metric-learning
!pip install transformers
!pip install -U sentence-transformers


Collecting wandb
  Downloading wandb-0.12.7-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.1 MB/s 
Collecting configparser>=3.8.1
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.0-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 42.4 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 4.5 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 48.6 MB/s 
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting 

In [None]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# 1. Amazon Fine Food Reviews Data

##1.1 Cleaning (One time)

In [None]:
import re
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, ' ', raw_html)
  cleantext = cleantext.replace("  ", " ")
  
  return cleantext

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

data = pd.read_csv("Reviews.csv")
## removing duplicate text reviews
data.drop_duplicates(subset=['Text'], inplace=True)
## Converting score >= 4 to positive label, score <=2 to negative label, and removing rows with score == 3 
data['y'] = data['Score'].apply(lambda x: 0 if x<=2 else x)
data['y'] = data['y'].apply(lambda x: 1 if x>=4 else x)
data['y'] = data['y'].apply(lambda x: np.nan if x==3 else x)
data.dropna(subset=['y'], inplace=True)
data = data[['Text', 'y']]

sampled_data = data.sample(n=50000)

sampled_data['Text'] = sampled_data['Text'].apply(lambda x: decontracted(cleanhtml(x)))
sampled_data.to_csv("clean_reviews.csv", index=False)

## 1.2 Load Data & Embed (Start directly from here)

In [None]:
data = pd.read_csv("clean_reviews.csv")

In [None]:
data.head(2)

Unnamed: 0,Text,y
0,. . . in my household! At the beginning of the...,1.0
1,My husband and I get this iced tea when we eat...,1.0


In [None]:
data['y'].value_counts()

1.0    42127
0.0     7873
Name: y, dtype: int64

## 1.3 Model and techniques

### 1.3.1 MIRROR BERT

In [None]:
from mirror.src.mirror_bert import MirrorBERT



In [None]:
def embed(data, model_name = "cambridgeltl/mirror-roberta-base-sentence-drophead"):
    mirror_bert = MirrorBERT()
    mirror_bert.load_model(path=model_name, use_cuda=True)
    embeddings = mirror_bert.get_embeddings(data, agg_mode="cls").cpu().detach().numpy()
    return embeddings

In [None]:
X = embed(data['Text'])
y = data['y'].values

Downloading:   0%|          | 0.00/278 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

100%|██████████| 49/49 [04:54<00:00,  6.01s/it]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.1.1 MIRROR BERT (Logistic Regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced')

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.862235925765787
Test ROCAUC 0.8506773578028716


#### 1.3.1.2 MIRROR BERT (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8959353368119252
Test ROCAUC 0.8077762582883882


#### 1.3.1.3 MIRROR BERT (XGBoost)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

In [None]:
dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8684589793425377
Test ROCAUC 0.8568329510798618


### 1.3.2 Universal Sentence Encoder (USE)

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
X = np.zeros((data.shape[0], 512))
for i, text in enumerate(data['Text']):
    X[i] = embed([text]).numpy()
    # if i%200==0:
    #     print(i)
y = data['y'].values

In [None]:
del model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.2.1 USE (Logistic regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8752924829673535
Test ROCAUC 0.8669178974032348


#### 1.3.2.2 USE (Random Forest regression)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.9093894273756502
Test ROCAUC 0.8328647779590755


#### 1.3.2.3 USE (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.911033208937994
Test ROCAUC 0.9039553543671863


### 1.3.3 Sentence BERT

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed(sentences):
    return model.encode(sentences)

In [None]:
X = embed(data['Text'])
y = data['y'].values
del model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

#### 1.3.3.1 Sentence BERT (Logistic regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)

In [None]:
print(model.best_estimator_)
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

LogisticRegression(C=1, class_weight='balanced')
Train ROCAUC:  0.8610524769224956
Test ROCAUC 0.8587497836222745


#### 1.3.3.2 Sentence BERT (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8940306188843039
Test ROCAUC 0.796350185378813


#### 1.3.3.3 Sentence BERT (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.8663343864259357
Test ROCAUC 0.8521356578419995


<tr>
    <th> Embedding </th>
    <th> Algorithm </th>
    <th> ROCAUC </th>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> Logistic Regression </td>
    <td> 0.8507 </td>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> Random Forest </td>
    <td> 0.8078 </td>
</tr>
<tr>
    <td> MIRROR BERT </td>
    <td> XGBOOST </td>
    <td> 0.8568 </td>
</tr>

<tr>
    <td> USE </td>
    <td> Logistic Regression </td>
    <td> 0.867 </td>
</tr>
<tr>
    <td> USE </td>
    <td> Random Forest </td>
    <td> 0.833 </td>
</tr>
<tr>
    <td> USE </td>
    <td> XGBOOST </td>
    <td> 0.9039 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> Logistic Regression </td>
    <td> 0.8587 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> Random Forest </td>
    <td>  0.7964 </td>
</tr>
<tr>
    <td> Sentence BERT </td>
    <td> XGBOOST </td>
    <td> 0.8521 </td>
</tr>