In [None]:
!pip install transformers

# Importing Modules and Datasets

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

In [None]:
# The CSV ships without a header row: read with default settings, pandas
# consumes the first tweet as column names, which then has to be re-added by
# hand (previously via DataFrame.append, removed in pandas 2.0).
# Naming the columns up front keeps every row and avoids the manual patch-up.
RAW_COLUMNS = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
  'traindata.csv',
  encoding='latin',
  header=None,
  names=RAW_COLUMNS,
  usecols=['target', 'text'],  # the other four columns were dropped anyway
)
df

In [None]:
df.shape

In [None]:
df.iloc[0].text

In [None]:
df.info()

In [None]:
# Pass the series by keyword: recent seaborn versions no longer accept the
# data vector as the first positional argument.
sns.countplot(x=df.target)
plt.xlabel('review score');

In [None]:
def to_sentiment(rating):
  """Collapse a raw target value into a binary label.

  The value is converted with int(); 0 stays 0 and every other value maps to 1.
  """
  return 0 if int(rating) == 0 else 1

In [None]:
df['sentiment'] = df['target'].apply(to_sentiment)
df.head()

In [None]:
# Label order must follow the codes produced by to_sentiment:
# index 0 <- target 0, index 1 <- every other target value.
# NOTE(review): assumes target 0 marks negative tweets (the first row of the
# data, "that's a bummer", carries target 0) - confirm against the data source.
class_names = ['negative', 'positive']
# Both spellings are referenced in this notebook; keep them pointing at one list.
class_name = class_names
ax = sns.countplot(x=df.sentiment)
ax.set_xticklabels(class_names)

In [None]:
df = df.drop(columns=['target'])
df.columns = ['content', 'sentiment']
df.head()

# Data Preprocessing

In [None]:
# tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
sample_text = 'When was i last outside? I am stuck at home for 2 weeks.'

## Word Tokenizer

In [None]:
tokens = tokenizer.tokenize(sample_text)
print(tokens)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"{sample_text} \n{tokens} \n{token_ids}")

## Special Tokens

 A special token separating two different sentences in the same input



In [None]:
tokenizer.sep_token , tokenizer.sep_token_id

A special token representing the class of the input 

In [None]:
tokenizer.cls_token , tokenizer.cls_token_id

A special token used to make arrays of tokens the same size for batching purpose




In [None]:
tokenizer.pad_token , tokenizer.pad_token_id

A special token representing an out-of-vocabulary token


In [None]:
tokenizer.unk_token , tokenizer.unk_token_id

In [None]:
encoding = tokenizer.encode_plus(
    sample_text,
    max_length=32,
    add_special_tokens = True,
    padding='max_length',
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt'
)
encoding.keys()

In [None]:
encoding['input_ids'],len(encoding['input_ids'][0])

In [None]:
encoding['attention_mask'],len(encoding['attention_mask'][0])

## Choosing sequence length

In [None]:
# Number of token ids per text (truncated at 512), collected to help choose
# the sequence length.
token_lens = [
  len(tokenizer.encode(text, max_length=512, truncation=True))
  for text in df.content
]

In [None]:
sns.displot(token_lens)

## Create PyTorch Dataset

In [None]:
class GPReviewData(Dataset):
  """Dataset that tokenizes one text per lookup.

  Indexing returns a dict with the raw text (`review_text`), the flattened
  token ids (`input_ids`), the flattened attention mask (`attention_mask`)
  and the label as a long tensor (`targets`).
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews, self.targets = reviews, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    text = str(self.reviews[item])
    label = self.targets[item]

    # Pad or truncate every example to exactly max_len tokens so that items
    # can be batched together.
    encoded = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # The tokenizer is asked for tensors ('pt'); flatten() drops the leading
    # dimension so each field is one-dimensional per example.
    sample = {'review_text': text}
    sample['input_ids'] = encoded['input_ids'].flatten()
    sample['attention_mask'] = encoded['attention_mask'].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
MAX_LEN = 280    # tokens per example after padding/truncation
BATCH_SIZE = 8   # examples per DataLoader batch
# Kept in sync with the training-setup cell, which assigns EPOCHS = 10 before
# the value is first used; the former 50 here was never in effect.
EPOCHS = 10

In [None]:
# 90% of the rows go to training; the remaining 10% is split in half into
# validation and test. Both splits use the same seed.
df_train, df_holdout = train_test_split(
  df, test_size=0.1, random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
  df_holdout, test_size=0.5, random_state=RANDOM_SEED
)

In [None]:
df_train.shape,df_val.shape,df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a dataframe's `content` / `sentiment` columns in a batched DataLoader.

  Args:
    df: frame providing a `content` column (texts) and a `sentiment` column
      (labels); both are converted to numpy arrays.
    tokenizer: tokenizer handed to GPReviewData.
    max_len: sequence length handed to GPReviewData.
    batch_size: number of examples per batch.
    shuffle: forwarded to DataLoader. Defaults to False, which keeps the
      previous behaviour (batches follow dataframe order). Pass True for a
      training split that should be reshuffled every epoch.

  Returns:
    A DataLoader over a GPReviewData dataset, using 2 worker processes.
  """
  ds = GPReviewData(
    reviews=df.content.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=2
  )

In [None]:
# One loader per split, all built with the same tokenizer, sequence length
# and batch size.
train_data_loader, val_data_loader, test_data_loader = (
  create_data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
  for split_df in (df_train, df_val, df_test)
)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
data['input_ids'].shape,data['attention_mask'].shape,data['targets'].shape

# Text Classification

In [None]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
x = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)

In [None]:
x['last_hidden_state'].shape , x['pooler_output'].shape

**Building the Sentiment Classifier**

In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout, a linear layer and a softmax.

  forward() returns one row per input with `n_classes` values that have been
  normalised by softmax over dim 1, i.e. probabilities rather than raw logits.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Classify from the pooled sequence representation.
    scores = self.out(self.drop(encoder_out['pooler_output']))
    return self.softmax(scores)

In [None]:
model = SentimentClassifier(len(class_name))
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)

In [None]:
model(input_ids,attention_mask)

### Training the model

In [None]:
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The linear schedule decays over every optimizer step of the whole run.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)


def loss_fn(probs, targets):
  """Cross-entropy for a model that already outputs class probabilities.

  SentimentClassifier.forward ends in a softmax, so the model emits
  probabilities. nn.CrossEntropyLoss applies log-softmax internally and
  expects raw logits; feeding it probabilities applies softmax twice, which
  compresses the loss range and weakens the gradients. Taking the log of the
  probabilities and using the negative log-likelihood gives the intended
  cross-entropy.

  Args:
    probs: tensor of per-class probabilities, one row per example.
    targets: tensor of integer class indices, one per example.

  Returns:
    The mean negative log-likelihood over the batch (0-d tensor).
  """
  # Clamp so a probability that underflowed to exactly 0 cannot yield log(0).
  log_probs = torch.log(probs.clamp_min(1e-12))
  return nn.functional.nll_loss(log_probs, targets)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping
  at 1.0, optimizer step, scheduler step, then gradient reset.

  Args:
    model: module called with `input_ids` and `attention_mask` keywords.
    data_loader: iterable of dicts with "input_ids", "attention_mask" and
      "targets" tensors.
    loss_fn: callable taking (model outputs, targets).
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): number of correct predictions divided by
    `n_examples` as a float64 tensor, and the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["targets"].to(device)

    scores = model(input_ids=ids, attention_mask=mask)
    predicted = torch.max(scores, dim=1).indices
    batch_loss = loss_fn(scores, labels)

    n_correct += (predicted == labels).sum()
    batch_losses.append(batch_loss.item())

    # Order matters: gradients are clipped after backward() and before the
    # optimizer consumes them; they are reset only once the step is done.
    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score `model` on `data_loader` without updating it.

  The model is switched to eval mode and every batch is processed with
  gradient tracking disabled.

  Args:
    model: module called with `input_ids` and `attention_mask` keywords.
    data_loader: iterable of dicts with "input_ids", "attention_mask" and
      "targets" tensors.
    loss_fn: callable taking (model outputs, targets).
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): number of correct predictions divided by
    `n_examples` as a float64 tensor, and the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses, n_correct = [], 0

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["targets"].to(device)

      scores = model(input_ids=ids, attention_mask=mask)
      predicted = torch.max(scores, dim=1).indices

      batch_loss = loss_fn(scores, labels)
      n_correct += (predicted == labels).sum()
      batch_losses.append(batch_loss.item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
# Per-epoch metrics, stored as plain Python numbers so they can be plotted
# and compared regardless of the device the model ran on.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  # train_epoch / eval_model return the accuracy as a 0-d tensor living on
  # `device`; matplotlib cannot convert CUDA tensors, so unwrap to float here.
  train_acc = float(train_acc)
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  val_acc = float(val_acc)
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  # Checkpoint only when validation accuracy improves on the best seen so far.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# The history entries may be 0-d torch tensors (on the GPU when training ran
# on CUDA), which matplotlib cannot convert; float() accepts both tensors and
# plain numbers.
train_curve = [float(acc) for acc in history['train_acc']]
val_curve = [float(acc) for acc in history['val_acc']]
plt.plot(train_curve, label='train accuracy')
plt.plot(val_curve, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);


In [None]:
# Accuracy on the held-out test split; the loss returned alongside it is
# not needed here.
test_metrics = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
test_acc, _ = test_metrics
test_acc.item()

In [None]:
def get_predictions(model, data_loader):
  """Collect texts, predicted classes, model outputs and labels for a loader.

  The model is put in eval mode and run with gradient tracking disabled.
  Batch tensors are moved to the notebook-level `device`.

  Returns:
    (review_texts, predictions, prediction_probs, real_values): the list of
    raw texts, then three CPU tensors holding, per example, the index of the
    largest model output, the full model output row, and the target.
  """
  model = model.eval()
  review_texts = []
  pred_batches, output_batches, target_batches = [], [], []

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["targets"].to(device)

      batch_outputs = model(input_ids=ids, attention_mask=mask)

      review_texts.extend(batch["review_text"])
      pred_batches.append(torch.max(batch_outputs, dim=1).indices)
      output_batches.append(batch_outputs)
      target_batches.append(labels)

  # Concatenating along the batch dimension yields one row per example.
  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(output_batches).cpu()
  real_values = torch.cat(target_batches).cpu()
  return review_texts, predictions, prediction_probs, real_values

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
# The cells below refer to `class_names`; bind it to the label list created
# earlier as `class_name` (the assignment was previously the wrong way round
# and raised NameError on a fresh kernel).
class_names = class_name

In [None]:
print(classification_report(y_test, y_pred, target_names=class_names))


In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw `confusion_matrix` as an annotated integer heatmap with axis labels.

  Row tick labels are kept horizontal and column tick labels are rotated by
  30 degrees; both are right-aligned.
  """
  heat_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  for axis, angle in ((heat_ax.yaxis, 0), (heat_ax.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment');


# Rows are true labels and columns are predictions, both named via class_names.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
idx = 2
review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx]
})
print("\n".join(wrap(review_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

In [None]:
sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

In [None]:
review_text = "I love completing my todos! Best app ever!!!"


In [None]:
# Encode exactly like GPReviewData does for the training data: pad to MAX_LEN
# and truncate longer inputs. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`.
encoded_review = tokenizer.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  truncation=True,
  padding='max_length',
  return_token_type_ids=False,
  return_attention_mask=True,
  return_tensors='pt',
)

In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)
# Make inference independent of earlier cells: eval mode switches dropout
# off, and no_grad avoids building an autograd graph for a forward-only pass.
model.eval()
with torch.no_grad():
  output = model(input_ids, attention_mask)
# A single text was encoded, so the batch holds exactly one prediction;
# .item() turns it into a plain int usable as a list index.
prediction = torch.max(output, dim=1).indices.item()
print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')