In [None]:
import torch

import random
import numpy as np

# One seed value shared by every random number generator seeded in this cell.
SEED = 1234

# Seed Python's `random`, NumPy and PyTorch in turn.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Switch on cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True

In [None]:
!pip install transformers==3

from transformers import BertTokenizer

Collecting transformers==3
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |▍                               | 10kB 23.1MB/s eta 0:00:01[K     |▉                               | 20kB 29.2MB/s eta 0:00:01[K     |█▎                              | 30kB 34.7MB/s eta 0:00:01[K     |█▊                              | 40kB 35.4MB/s eta 0:00:01[K     |██▏                             | 51kB 33.3MB/s eta 0:00:01[K     |██▋                             | 61kB 35.0MB/s eta 0:00:01[K     |███                             | 71kB 27.1MB/s eta 0:00:01[K     |███▌                            | 81kB 25.6MB/s eta 0:00:01[K     |████                            | 92kB 26.8MB/s eta 0:00:01[K     |████▍                           | 102kB 25.6MB/s eta 0:00:01[K     |████▊                           | 112kB 25.6MB/s eta 0:00:01[K     |█████▏                         

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem.
drive.mount('/content/drive/')

import os
# Make the Drive root the working directory; the mount above has to happen first.
os.chdir('/content/drive/MyDrive')
# Shell call that lists the directory contents.
!ls


In [None]:
cd CZ4034/

/content/drive/MyDrive/CZ4034


In [None]:
import pandas as pd

In [None]:
# Name of the pretrained checkpoint; SentimentClassifier reads this same constant
# when it builds its BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Display the number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




28996

In [None]:
from torch.utils.data import Dataset, DataLoader
class ReviewDataset(Dataset):
  """Map-style dataset that tokenizes one review per lookup.

  Args:
    reviews: Positionally indexable collection of review texts.
    targets: Positionally indexable collection of integer labels, aligned
      with `reviews`.
    tokenizer: Object exposing `encode_plus` (a BERT tokenizer in this notebook).
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Encode the review at position `item`.

    Returns:
      dict with the review text (`review_text`), the flattened `input_ids`
      and `attention_mask` tensors, and the label as a `torch.long` tensor
      (`targets`).
    """
    review = str(self.reviews[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
      # and belongs to the same tokenizer API as `truncation=True` below.
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True
    )

    return {
      'review_text': review,
      # The tokenizer returns tensors with a leading batch axis; flatten drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
import numpy as np
# Second seeding pass: NumPy and PyTorch are re-seeded with 42 here, replacing the
# SEED = 1234 state set in the first cell. Python's `random` module is not touched.
# NOTE(review): two different seeds in one notebook looks unintentional — confirm.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Last expression of the cell, so the returned torch Generator is displayed.
torch.manual_seed(RANDOM_SEED)


<torch._C.Generator at 0x7f49d4a538f0>

In [None]:
# Read the shared train and test splits from the current working directory.
df_train, df_test = map(
    pd.read_csv,
    ("shared_train_set.csv", "shared_test_set.csv"),
)



In [None]:
# df_train_neutral = df_train[df_train['label'] == 0]
# df_train_pos = df_train[df_train['label'] == 1]
# df_train_neg = df_train[df_train['label'] == 2]
# df_train_neutral.shape, df_train_pos.shape, df_train_neg.shape

In [None]:
# Slice the test set by label value (0, 1, 2) and display the shape of each slice.
df_test_neutral, df_test_pos, df_test_neg = (
    df_test[df_test['label'] == label_value] for label_value in (0, 1, 2)
)
tuple(part.shape for part in (df_test_neutral, df_test_pos, df_test_neg))

((110, 2), (94, 2), (94, 2))

In [None]:
# df_train_polarity = pd.concat([df_train_pos,df_train_neg]).sample(390)

In [None]:
# df_train = pd.concat([df_train_polarity, df_train_neutral]).sample(frac = 1)
# df_test = pd.concat([df_test_polarity, df_test_neutral]).sample(frac = 1)

In [None]:
def convert_numerical(x):
    """Collapse a label to binary: 0 stays 0, any other value becomes 1."""
    return 0 if x == 0 else 1

# df_train['label'] = df_train['label'].apply(convert_numerical)
# Copy of the test set whose label column is binarised by convert_numerical;
# df_test itself keeps the original 3-way labels.
df_test_sub = df_test.assign(label=df_test['label'].apply(convert_numerical))
# df_train.label.value_counts()

In [None]:
# Class balance of the binarised test labels.
df_test_sub['label'].value_counts()

1    188
0    110
Name: label, dtype: int64

In [None]:
# from sklearn.model_selection import train_test_split
# df_train, df_val = train_test_split(
#   df_train,
#   test_size=0.05,
#   random_state=RANDOM_SEED
# )

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=2):
  """Wrap a dataframe's `text` / `label` columns in a DataLoader.

  Args:
    df: DataFrame with a `text` column and a `label` column.
    tokenizer: Tokenizer handed on to ReviewDataset.
    max_len: Sequence length handed on to ReviewDataset.
    batch_size: Number of samples per batch.
    num_workers: Number of loader worker processes. Defaults to 2, the value
      that was previously hard-coded; pass 0 to load in the main process.

  Returns:
    A DataLoader over a ReviewDataset. No `shuffle` argument is passed.
  """
  ds = ReviewDataset(
    # to_numpy() gives positional arrays, so a filtered dataframe whose index
    # has gaps is still addressed 0..len-1 by the dataset.
    reviews=df['text'].to_numpy(),
    targets=df['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=num_workers
  )

from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch import nn, optim

class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer.

  Args:
    n_classes: Number of logits produced per input sequence.
  """

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.2)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits for a batch.

    Args:
      input_ids: Token id tensor passed to the BERT encoder.
      attention_mask: Attention mask tensor passed to the BERT encoder.

    Returns:
      Output of the final linear layer applied to the dropped-out pooled output.
    """
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Take the pooled output by position instead of tuple-unpacking: indexing
    # works both for the plain tuple returned by older transformers releases
    # and for the ModelOutput object returned by newer ones (where unpacking
    # would yield field names rather than tensors).
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one training pass over `data_loader`.

  Args:
    model: Module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: Iterable of batches with `input_ids`, `attention_mask` and
      `targets` entries.
    loss_fn: Callable taking `(outputs, targets)` and returning the loss.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple `(accuracy, mean_loss)`: accuracy is a 0-d float64 tensor, mean_loss
    is the mean of the per-batch losses (NaN when the loader yields no batch).
  """
  model = model.train()
  losses = []
  # Tensor accumulator, so `.double()` below also works when the loader is
  # empty (a plain int 0 has no `.double()` method).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    # Clip the gradient norm to 1.0 before the optimizer update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without tracking gradients.

  Args:
    model: Module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: Iterable of batches with `input_ids`, `attention_mask` and
      `targets` entries.
    loss_fn: Callable taking `(outputs, targets)` and returning the loss.
    device: Device the batch tensors are moved to.
    n_examples: Denominator used for the accuracy.

  Returns:
    Tuple `(accuracy, mean_loss)`: accuracy is a 0-d float64 tensor, mean_loss
    is the mean of the per-batch losses (NaN when the loader yields no batch).
  """
  model = model.eval()
  losses = []
  # Tensor accumulator, so `.double()` below also works when the loader is
  # empty (a plain int 0 has no `.double()` method).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      # Predicted class = index of the largest logit in each row.
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss


In [None]:
# Pick the device first so the checkpoint can be mapped straight onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_subjectivity = SentimentClassifier(2)
# map_location keeps loading working on a CPU-only runtime even when the
# weights were saved from a GPU.
model_subjectivity.load_state_dict(
  torch.load(
    'best_model_state_subjectivity_train_shared_0.68.bin',
    map_location=device
  )
)
model_subjectivity.to(device)
# Inference mode; as the last expression this also displays the module layout.
model_subjectivity.eval()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:

# Tokenised sequence length and batch size used for every loader below.
MAX_LEN = 250
BATCH_SIZE = 4
loss_fn = nn.CrossEntropyLoss().to(device)
test_data_loader = create_data_loader(df_test_sub, tokenizer, MAX_LEN, BATCH_SIZE)
# Normalise by the frame the loader was built from (df_test_sub), not by the
# separate df_test, so the accuracy stays right if the two ever differ in size.
test_acc, _ = eval_model(
  model_subjectivity,
  test_data_loader,
  loss_fn,
  device,
  len(df_test_sub)
)

In [None]:
# Show the subjectivity accuracy as a plain Python number.
test_acc.item()

0.6879194630872483

In [None]:
def get_predictions(model, data_loader):
  """Run `model` over `data_loader` and collect its predictions.

  Args:
    model: Module called as `model(input_ids=..., attention_mask=...)` that
      returns one row of logits per sample.
    data_loader: Iterable of batches with `review_text`, `input_ids`,
      `attention_mask` and `targets` entries.

  Returns:
    Tuple `(review_texts, predictions, prediction_probs, real_values)`:
    the texts as a list, the arg-max class per sample, the softmax
    probabilities per sample, and the targets — all tensors on the CPU.
    An empty loader yields an empty list and empty tensors.
  """
  model = model.eval()
  # Follow the model's own device instead of relying on a notebook-level
  # `device` variable being defined.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")
  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(model_device)
      attention_mask = d["attention_mask"].to(model_device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      review_texts.extend(d["review_text"])
      # Keep whole batches (moved to the CPU) and concatenate once at the end.
      predictions.append(preds.cpu())
      # Softmax turns the raw logits into the probabilities the name promises;
      # the arg-max above is unaffected by it.
      prediction_probs.append(torch.softmax(outputs, dim=1).cpu())
      real_values.append(d["targets"].cpu())
  if not predictions:
    # Nothing was yielded: the number of classes is unknown, hence shape (0, 0).
    return (
      review_texts,
      torch.empty(0, dtype=torch.long),
      torch.empty(0, 0),
      torch.empty(0, dtype=torch.long)
    )
  predictions = torch.cat(predictions)
  prediction_probs = torch.cat(prediction_probs)
  real_values = torch.cat(real_values)
  return review_texts, predictions, prediction_probs, real_values

In [None]:
import datetime
# Start of the timed span.
a = datetime.datetime.now()

# Stage 1: subjectivity predictions for the whole test loader.
y_review_texts, y_pred_sub, y_pred_probs, y_test_sub = get_predictions(
  model_subjectivity,
  test_data_loader
)

from sklearn.metrics import confusion_matrix, classification_report
# Display names for labels 0 and 1 of the binarised task.
class_names = ['neutral', 'non-neutral']
print(classification_report(y_test_sub, y_pred_sub, target_names=class_names,digits=3))

# End of the timed span; note it also covers the sklearn import and the report above.
b = datetime.datetime.now()
# Elapsed wall-clock time, displayed as the cell output.
b-a

              precision    recall  f1-score   support

     neutral      0.570     0.627     0.597       110
 non-neutral      0.768     0.723     0.745       188

    accuracy                          0.688       298
   macro avg      0.669     0.675     0.671       298
weighted avg      0.695     0.688     0.691       298



datetime.timedelta(seconds=4, microseconds=743758)

In [None]:
# Boolean mask (one entry per test row) marking samples predicted as class 1 by stage 1.
polarised_index = (np.asarray(y_pred_sub) == 1).tolist()

In [None]:
# Keep only the rows flagged by the mask; labels stay in the original 0/1/2 scheme.
polarised_df_test = df_test.loc[polarised_index]

In [None]:
# True-label distribution among the rows passed on to the polarity model.
polarised_df_test['label'].value_counts()

1    70
2    66
0    41
Name: label, dtype: int64

In [None]:
# Loader over the stage-1 selected rows, with the same length and batch settings as before.
test_data_loader_pol = create_data_loader(
  df=polarised_df_test,
  tokenizer=tokenizer,
  max_len=MAX_LEN,
  batch_size=BATCH_SIZE
)

In [None]:
# Display the rows selected for the polarity stage.
polarised_df_test

Unnamed: 0,text,label
1,"i enjoyed working here, teamwork environment,lots of help, nice balance of work /family life.good pto, medical,401k benefits. steady raises, no glass ceiling",1
3,Work-life balanceGood group of people to work with. Respectful and friendly.Pay & benefitsGood health insurance and 401kJob security and advancementIf you are hardworking and you are honest you will have a job!OverallRetired after 35+ years with this great company! Would not have been able to retire without the pay and benefits this company offers. It's hard work but rewarding!,1
5,"Work sites was interesting, me great people that gave me a broader look on different aspects of careers and life. Good for part-time income but that about it.",1
7,"The team at the location I work, is amazing, supportive, and works well TOGETHER! * This is a VERY important key to a great work environment *We learn from each other; We make work enjoyable, even on the bad days of cranky customers-- which our not-so crankies outweigh our crankies! Work-Life Balance is great! We all understand ""life happens"" and we are willing to work with what comes our way-- Support!",1
8,I started working here at 16 and after passing drug and background checks they scheduled me to train which they ended up paying me $1200 to do so. Summer and Christmas bonuses are definitely a plus as well as having a very laid back environment. Anyone can make friends here and it makes work pass by quicker.,0
...,...,...
290,I love the company and what they stand for. It's a hard job but we should all work hard to better ourselves. Work life balance is always a challenge but would rather be busy than not.,0
291,Working for this company is the best move ever. They care about their staff and look to motivate you. Competitive salaries and staff stay happy. Corporate is engaged with management so the communication is awesome!,1
292,The work itself is easy just not a lot of available hours people you work with are nice. Nice management. The pay is decent but like I said not a lot of available hours,0
293,"I enjoyed getting up and going into work. Management all got along well and worked toward the one goal and that was for the betterment of the company. Crossover, Inc. treated all their employees with respect. Their benefits are adequate. The owners were incredible people to work for. They were very personable and cared about all their employees.",1


In [None]:
model_polarity = SentimentClassifier(2)
# map_location keeps loading working on a CPU-only runtime even when the
# weights were saved from a GPU.
model_polarity.load_state_dict(
  torch.load('best_model_state_polarity0.95.bin', map_location=device)
)
model_polarity.to(device)
# Inference mode; as the last expression this also displays the module layout.
model_polarity.eval()

SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
import datetime
# Start of the timed span.
a = datetime.datetime.now()

# Stage 2: polarity predictions for the stage-1 selected rows only.
# y_review_texts and y_pred_probs from the subjectivity run are overwritten here.
y_review_texts, y_pred_pol, y_pred_probs, y_test_pol = get_predictions(
  model_polarity,
  test_data_loader_pol
)

# End of the timed span.
b = datetime.datetime.now()
# Elapsed wall-clock time, displayed as the cell output.
b-a

datetime.timedelta(seconds=2, microseconds=984883)

In [None]:
# Translate the polarity model's class ids into the 3-way scheme: 1 stays 1, anything else becomes 2.
polarity_pred = np.where(np.asarray(y_pred_pol) == 1, 1, 2).tolist()

In [None]:
# Merge both stages into one 3-way prediction per test row:
# stage-1 class 1 -> take the next polarity prediction, otherwise -> 0.
j = 0  # position of the next unused entry in polarity_pred
res = []
for i in y_pred_sub:
  if i == 1:
    res.append(polarity_pred[j])
    j += 1
  else:
    res.append(0)

# Guard against stale or misaligned state: every polarity prediction must have
# been consumed by exactly one stage-1 class-1 sample.
assert j == len(polarity_pred), (
  f"used {j} polarity predictions but {len(polarity_pred)} are available"
)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Display names for labels 0, 1 and 2 of the combined pipeline.
class_names = ['neutral', 'positive','negative']
print(
  classification_report(
    y_true=df_test.label,
    y_pred=res,
    target_names=class_names,
    digits=3
  )
)

              precision    recall  f1-score   support

     neutral      0.570     0.627     0.597       110
    positive      0.717     0.702     0.710        94
    negative      0.729     0.660     0.693        94

    accuracy                          0.661       298
   macro avg      0.672     0.663     0.667       298
weighted avg      0.667     0.661     0.663       298



In [None]:
# Boolean mask of the rows whose final prediction is not 0.
polarised_index_final = (np.asarray(res) != 0).tolist()

In [None]:
# Manual accuracy: fraction of rows whose final prediction equals the true label.
count = sum(
  1 for row in range(len(res)) if res[row] == df_test.label.iloc[row]
)
count/len(res)

In [None]:
# Rebind `res` from a Python list to a pandas Series.
res = pd.Series(res)

In [None]:
# Write the final predictions to a CSV in the working directory, without the index column.
res.to_csv("multitask-pipeline-results.csv",index=False)