In [None]:
!pip install wandb
!git clone https://github.com/cambridgeltl/mirror-bert.git
!mv mirror-bert mirror
!pip install pytorch-metric-learning
!pip install transformers
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Load Data

In [None]:
data = pd.read_csv("Product_review.csv", encoding= 'unicode_escape')
data


Unnamed: 0,Review,Y
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,emerges as something rare,1
...,...,...
10657,a terrible movie that some people will neverth...,0
10658,there are many definitions of 'time waster' bu...,0
10659,as it stands,0
10660,the thing looks like a made-for-home-video qui...,0


# Data Cleaning

In [None]:
data.rename(columns={'is_duplicate':'Y'}, inplace=True)
data.drop_duplicates(subset=['Review'], inplace=True)
data['Y'].value_counts()

1    5244
0    5218
Name: Y, dtype: int64

# Mirror Bert

In [None]:
from mirror.src.mirror_bert import MirrorBERT

In [None]:
MirrorBERT()

<mirror.src.mirror_bert.MirrorBERT at 0x7fb199f90610>

In [None]:
def embed(data, model_name = "cambridgeltl/mirror-roberta-base-sentence-drophead"):
    mirror_bert = MirrorBERT()
    mirror_bert.load_model(path=model_name, use_cuda=True)
    embeddings = mirror_bert.get_embeddings(data, agg_mode="cls").cpu().detach().numpy()
    return embeddings

In [None]:
x = embed(['Hello', "World", "Sentence 3"])

Downloading:   0%|          | 0.00/278 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

100%|██████████| 1/1 [00:00<00:00,  5.75it/s]


In [None]:
x.shape

(3, 768)

In [None]:
x

array([[-0.09157083,  0.4965632 ,  0.07892643, ...,  0.1145598 ,
         0.07401161,  0.24017267],
       [ 0.30552158,  0.07689277,  0.70600814, ..., -0.04096836,
         0.11860064,  0.08503576],
       [-0.1520756 ,  0.02238294, -0.16625679, ...,  0.70819867,
         0.30312812,  0.27532536]], dtype=float32)

In [None]:
def embed(data, model_name = "cambridgeltl/mirror-roberta-base-sentence-drophead"):
    mirror_bert = MirrorBERT()
    mirror_bert.load_model(path=model_name, use_cuda=True)
    embeddings = mirror_bert.get_embeddings(data, agg_mode="cls").cpu().detach().numpy()
    return embeddings

In [None]:
q1 = embed(data['Review'])
X = np.hstack((q1))
y = data['Y'].values

100%|██████████| 11/11 [01:00<00:00,  5.53s/it]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(q1, y, test_size=0.33, random_state=0, stratify=y)

# Mirror Bert (Logistics Regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

In [None]:

model.fit(X_train, y_train)
model.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=0.01, class_weight='balanced')

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7536290916185127
Test ROCAUC 0.7275245220884989


# Mirror Bert (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=3)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.9553965053632112
Test ROCAUC 0.6852542386528098


In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

# Mirror Bert (XGBoost)

In [None]:
dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7757304757549761
Test ROCAUC 0.7121854265088826


# Universal Sentence Encoder (USE)

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
q1 = np.zeros((data.shape[0], 512))
for i, text in enumerate(data['Review']):
    q1[i] = embed([text]).numpy()
    # if i%200==0:
    #     print(i)

y = data['Y'].values

In [None]:
del model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(q1, y, test_size=0.33, random_state=42, stratify=y)

# USE (Logistics Regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=1, class_weight='balanced')

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7131115972644267
Test ROCAUC 0.6973784060692798


# USE (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=200)

In [None]:
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.9345424904294674
Test ROCAUC 0.6866260263246355


# USE (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7600482858373052
Test ROCAUC 0.7041375384043517


# Setence BERT

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def embed(sentences):
    return model.encode(sentences)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(q1, y, test_size=0.33, random_state=42, stratify=y)


# Sentence BERT (Logistics Regression)

In [None]:
model = GridSearchCV(estimator=LogisticRegression(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'C':[10**x for x in range(-4, 5)]}, cv=4)

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=4, estimator=LogisticRegression(class_weight='balanced'),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000,
                               10000]},
             scoring='roc_auc')

In [None]:
print(model.best_estimator_)
pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

LogisticRegression(C=1, class_weight='balanced')
Train ROCAUC:  0.7131115972644267
Test ROCAUC 0.6973784060692798


# Sentence BERT (Random Forest)

In [None]:
model = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'), scoring='roc_auc',
                     param_grid = {'n_estimators':[10, 50, 100, 200], 'max_depth':[2,5,8]}, cv=4)

model.fit(X_train, y_train)
model.best_estimator_

pred = model.predict(X_train)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(X_test)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.9342626781467462
Test ROCAUC 0.6799718999913446


# Sentence BERT (XGBOOST)

In [None]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['eval_metric'] = 'auc'
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(param, dtrain)

dtest = xgb.DMatrix(X_test)
pred = model.predict(dtest)

pred = model.predict(dtrain)
print("Train ROCAUC: ", metrics.roc_auc_score(y_train, pred))
pred = model.predict(dtest)
print("Test ROCAUC", metrics.roc_auc_score(y_test, pred))

Train ROCAUC:  0.7600482858373052
Test ROCAUC 0.7041375384043517
