# Initialization

In [1]:
import numpy as np
import pandas as pd

import torch
import transformers

from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Global Variables

In [2]:
# Number of leading rows of df_reviews that the cells below slice out
# (via .iloc[:max_sample_size]) for tokenization and for the target column.
max_sample_size = 200

# Loading Data

In [3]:
# Load the tab-separated review dataset.
# The delimiter is spelled as the escape sequence '\t' instead of a literal tab
# character: a raw tab inside the quotes is invisible and is silently turned into
# spaces by many editors/formatters, which would change the delimiter.
df_reviews = pd.read_csv('datasets/imdb_reviews_small_lemm_train.tsv', sep='\t')

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df_reviews.head()

Unnamed: 0,tconst,original_title,review,review_lemm,pos
0,tt0087803,Nineteen Eighty-Four,I saw this movie last year in Media class and ...,i see this movie last year in medium class and...,0
1,tt0087803,Nineteen Eighty-Four,"I must admit, there are few books with corresp...",i must admit there be few book with correspond...,0
2,tt0087803,Nineteen Eighty-Four,I think that the shots and lighting were very ...,i think that the shot and light be very poor w...,0
3,tt0087803,Nineteen Eighty-Four,"A few weeks ago, I read the classic George Orw...",a few week ago i read the classic george orwel...,0
4,tt0087803,Nineteen Eighty-Four,I saw this movie literally directly after fini...,i see this movie literally directly after fini...,0


# Preprocessing for BERT

In [6]:
# Display the first max_sample_size rows, i.e. the same row range that the
# tokenization cell slices from the 'review' column.
df_reviews.head(max_sample_size)

Unnamed: 0,tconst,original_title,review,review_lemm,pos
0,tt0087803,Nineteen Eighty-Four,I saw this movie last year in Media class and ...,i see this movie last year in medium class and...,0
1,tt0087803,Nineteen Eighty-Four,"I must admit, there are few books with corresp...",i must admit there be few book with correspond...,0
2,tt0087803,Nineteen Eighty-Four,I think that the shots and lighting were very ...,i think that the shot and light be very poor w...,0
3,tt0087803,Nineteen Eighty-Four,"A few weeks ago, I read the classic George Orw...",a few week ago i read the classic george orwel...,0
4,tt0087803,Nineteen Eighty-Four,I saw this movie literally directly after fini...,i see this movie literally directly after fini...,0
...,...,...,...,...,...
195,tt0364986,Ben & Arthur,I don't know what it is about this movie- dire...,i do not know what -PRON- be about this movie ...,0
196,tt0364986,Ben & Arthur,"Oh man, what was Sam Mraovich thinking? What w...",oh man what be sam mraovich think what be anyo...,0
197,tt0364986,Ben & Arthur,Just watched this after hearing about how bad ...,just watch this after hear about how bad -PRON...,0
198,tt0364986,Ben & Arthur,"This movie is just truly awful, the eye-candy ...",this movie be just truly awful the eye candy t...,0


In [9]:
# NOTE(review): this looks like a leftover scratch/debug cell. It only prints the
# integers 0..4, and nothing visible further down reads its output (the loop
# variable `i` is reassigned before any later use). Consider removing the cell.
for i in range(5):
    print(i)

0
1
2
3
4


In [4]:
# Tokenize the first max_sample_size reviews into fixed-length id sequences plus
# matching attention masks, stored as one numpy array per review.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

max_length = 512

# Take the padding id from the tokenizer instead of hard-coding 0, so the cell
# keeps working if the checkpoint is swapped for one with a different pad id.
pad_token_id = tokenizer.pad_token_id

ids_list = []
attention_mask_list = []

for input_text in df_reviews.iloc[:max_sample_size]['review']:
    # truncation + max_length cap the encoded sequence at max_length ids,
    # special tokens included.
    ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
    n_real = len(ids)
    n_pad = max_length - n_real

    # Right-pad up to max_length.
    padded = np.array(ids + [pad_token_id] * n_pad)

    # Build the mask from the sequence length rather than from the token values:
    # 1 for every real token, 0 for every padding slot. This does not rely on the
    # padding id never occurring among the real tokens.
    attention_mask = np.array([1] * n_real + [0] * n_pad)

    ids_list.append(padded)
    attention_mask_list.append(attention_mask)

# Getting Embeddings

In [5]:
# Load the configuration and the pretrained weights from the same checkpoint;
# the checkpoint name is kept in one place so the two cannot drift apart.
pretrained_name = 'bert-base-uncased'
config = transformers.BertConfig.from_pretrained(pretrained_name)
model = transformers.BertModel.from_pretrained(pretrained_name)

In [6]:
# Run the tokenized reviews through the model in batches and collect one
# embedding vector per review (as numpy arrays, one array per batch).
batch_size = 25    # typically 100, but smaller batches lower the memory requirements

embeddings = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using the {device} device.')
model.to(device)
model.eval()    # switch to evaluation mode once, not on every batch

# Iterate over batch start offsets. Using range(0, n, batch_size) instead of
# range(n // batch_size) keeps the final, possibly smaller, batch, so every
# tokenized review gets an embedding even when n is not a multiple of batch_size.
for start in tqdm(range(0, len(ids_list), batch_size)):
    end = start + batch_size

    # Stack the per-review arrays into a single 2-D array before creating the
    # tensor; building a tensor from a Python list of ndarrays is slow. The
    # explicit dtype makes the result int64 regardless of the platform's default
    # numpy integer width.
    ids_batch = torch.tensor(np.array(ids_list[start:end]), dtype=torch.long).to(device)
    attention_mask_batch = torch.tensor(np.array(attention_mask_list[start:end]), dtype=torch.long).to(device)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        batch_embeddings = model(ids_batch, attention_mask=attention_mask_batch)

    # From the first model output keep the vector at sequence position 0 of every
    # review and move it back to host memory as a numpy array.
    embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

Using the cpu device.


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




# Model

In [7]:
# Join the per-batch embedding arrays along the first axis into one feature
# matrix, and take the 'pos' labels for the same leading rows of df_reviews.
features = np.concatenate(embeddings, axis=0)
target = df_reviews['pos'].iloc[:max_sample_size]

# Sanity check: both shapes should report the same number of rows.
print(features.shape, target.shape, sep='\n')

(200, 768)
(200,)


In [8]:
# Hold out half of the samples for testing; the fixed random_state keeps the
# split the same from run to run.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.5,
    random_state=0,
)

In [17]:
# Fit a logistic-regression classifier on the training half of the embeddings.
# NOTE(review): this rebinds `model`, which until here referred to the BERT
# network; re-running the embedding cell after this one would pick up the
# classifier instead. Consider a distinct name if no later cell depends on it.
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(features_train, target_train)

# Report 3-fold cross-validation scores on the training half, followed by the
# score on the held-out half.
scores = cross_val_score(model, features_train, target_train, cv=3)
print(scores)
print(model.score(features_test, target_test))

[0.82352941 0.78787879 0.81818182]
