In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install emoji
!pip install datasets

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, BertConfig, AdamW, get_linear_schedule_with_warmup, pipeline, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
import emoji
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
import tensorflow_datasets as tfds
import statistics
import re
import os
import random
from pprint import pprint

# %reload_ext watermark
# %watermark -v -p numpy,pandas,torch,transformers

%matplotlib inline
%config InlineBackend.figure_format='retina'



In [None]:
from tensorflow.python.client import device_lib
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() raises when no CUDA device is available, so it
# is only queried on a GPU runtime; the value below is what the cell displays.
torch.cuda.get_device_name() if device.type == "cuda" else "cpu (no CUDA device available)"



**Data Preprocessing**

In [None]:
# Preprocessing

def text_preprocessing(s):
  """Normalise a raw tweet string before tokenisation.

  Steps, in order: strip http/ftp/https URLs, rewrite "'t" as " not", decode
  "&amp;", drop the standalone retweet marker "RT", replace emoji with their
  text names (no delimiters), remove apostrophes plus a fixed set of
  punctuation / mis-encoded characters, and collapse runs of whitespace.

  Args:
    s: the tweet text (str).

  Returns:
    The cleaned string, with no leading or trailing whitespace.
  """
  # Remove url's
  s = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])', ' ',s)

  # Change 't to 'not'
  s = re.sub(r"\'t", " not", s)
  # Replace '&amp;' with '&'
  s = re.sub(r'&amp;', '&', s)
  # Remove retweet indicator RT.
  # The word boundaries restrict the match to the standalone token "RT"; the
  # previous pattern also deleted any "R"/"RT" that merely ended a word
  # (e.g. "STAR wars" became "STA wars").
  s = re.sub(r'\bRT\s', ' ', s)
  # Change emoji to ascii
  s = emoji.demojize(s, delimiters=("", ""))
  # Remove a newline that is followed by whitespace (remaining newlines are
  # collapsed by the final whitespace pass)
  s = re.sub(r'\n[\s]',' ',s)
  # Remove apostrophes
  s = re.sub(r"\'", '', s)
  # Remove punctuation and mis-encoded (mojibake) characters
  s = re.sub(r'([\'\"\(\)\\\/\#\â\€\”\@])', r'', s)
  s = re.sub(r'([\¬\‡\ï\¸\\¬\‡\ï])', r'', s)

  # Collapse whitespace runs and trim both ends
  s = re.sub(r'\s+', ' ', s).strip()
  return s

**Sentiment140 dataset for training**

In [None]:
# from datasets import load_dataset
from pprint import pprint
# Load only the 'train' split of Sentiment140 and show its first record.
dataset = load_dataset('sentiment140', split='train')
first_record = dataset[0]
pprint(first_record)


In [None]:
# Reduce the dataset to a reproducible 100,000-row sample with a fresh 0..n-1 index
df = pd.DataFrame(dataset)
sentiment140 = df.sample(n=100000, random_state=25).reset_index(drop=True)
# Relabel sentiment 4 as 1; every other value is left untouched
sentiment140['sentiment'] = sentiment140['sentiment'].where(sentiment140['sentiment'] != 4, 1)
# sentiment140.sentiment.unique()

In [None]:
# # Sentiment141 - add labelled COVID-19 tweets
# ## Combine reduced Sentiment140 with labelled covid tweets
# # print('sentiment140 ',sentiment140.info())
# labelled_covid = pd.read_csv("/content/labelled_tweets.csv")
# # print('labelled_covid ',labelled_covid.info())
# sentiment141 = pd.merge(sentiment140, labelled_covid, how='outer') #left_on=['date'], right_on=['date'],how='outer')
# sentiment141.info()

In [None]:
# how balanced is the reduced dataset
# Vectorised comparisons yield the same boolean masks as a row-wise apply
# without one Python call per row.
p = sentiment140['sentiment'] == 1
n = sentiment140['sentiment'] == 0
pRows = int(p.sum())
nRows = int(n.sum())
print('Positive: ', pRows)
print('Negative: ', nRows)

In [None]:
# sentiment140 for clean runs

# Build the cleaned frame in a single pass: labels are copied as-is and every
# tweet is run through text_preprocessing. Series.map replaces the previous
# per-row `.at` loop, which wrote one cell at a time and assumed a 0..n-1 index.
clean_samples = pd.DataFrame({
  'label': sentiment140['sentiment'],  #sentiment141.sentiment
  'text': sentiment140['text'].map(text_preprocessing),  #sentiment141.text
})
# clean_samples.head()

In [None]:
# # Sentiment140/1 unclean runs
# sentiment140 = sentiment140.rename(columns = {'sentiment': 'label'}) #sentiment141 = sentiment141.rename(columns = {'sentiment': 'label'})
# clean_samples = sentiment140 #sentiment140

In [None]:
# Measure the encoded length of every cleaned sample and use the longest one
# as the fixed sequence length for the data loaders.

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')##### changed model
# Encodings are truncated at 512 tokens, so no length can exceed that.
token_lens = [
  len(tokenizer.encode(sample_text, max_length=512, truncation=True))
  for sample_text in clean_samples.text
]

MAX_LEN = max(token_lens)
print(MAX_LEN)


In [None]:


# Hold out 30% of the cleaned samples, then split that hold-out again so that
# 30% of it becomes the test set and the rest the validation set. Both splits
# use the same seed.
SPLIT_SEED = 25
df_train, df_holdout = train_test_split(clean_samples, test_size=0.3, random_state=SPLIT_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.3, random_state=SPLIT_SEED)

In [None]:
# Report (rows, columns) for each of the three splits
for split_label, split_frame in (('train: ', df_train), ('validation: ', df_val), ('test: ', df_test)):
  print(split_label, split_frame.shape)

In [None]:
class TwitterDataset(Dataset):
  """Dataset that tokenises one tweet per lookup.

  The constructor stores its four arguments unchanged:
    text: indexable collection of tweet texts.
    label: indexable collection of labels, indexed the same way as ``text``.
    tokenizer: object exposing ``encode_plus``.
    max_len: value passed to the tokenizer as ``max_length``.
  """

  def __init__(self, text, label, tokenizer, max_len):
    self.text, self.label = text, label
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of samples, taken from the text collection."""
    return len(self.text)

  def __getitem__(self, item):
    """Return the sample at ``item`` as a dict.

    Keys: 'text' (the tweet as str), 'input_ids' and 'attention_mask'
    (the tokenizer outputs flattened to 1-D) and 'label' (a long tensor).
    """
    sample_text = str(self.text[item])
    sample_label = self.label[item]

    # The tokenizer is asked to pad/truncate to max_len and return PyTorch tensors
    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

    sample = {'text': sample_text}
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['label'] = torch.tensor(sample_label, dtype=torch.long)
    return sample

In [None]:
# Create a data loader


def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame of tweets in a TwitterDataset and a DataLoader.

  Args:
    df: DataFrame with a ``text`` and a ``label`` column.
    tokenizer: tokenizer handed to TwitterDataset.
    max_len: fixed sequence length handed to TwitterDataset.
    batch_size: number of samples per batch.
    shuffle: reshuffle the samples every epoch (default False, the previous
      behaviour).
    num_workers: number of loader worker processes (default 4, the previous
      hard-coded value).

  Returns:
    A torch DataLoader over the dataset.
  """
  ds = TwitterDataset(
    text=df.text.to_numpy(),
    label=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
    # padding=True
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

BATCH_SIZE = 16
# Only the training loader is reshuffled each epoch; validation and test keep
# a fixed order so their predictions stay aligned with the DataFrame rows.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Pull one batch from the training loader and print the shape of each tensor field
data = next(iter(train_data_loader))

for field in ('input_ids', 'attention_mask', 'label'):
  print(data[field].shape)


In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification layer.

  Args:
    n_classes: number of output classes (size of the logits' last dimension).
    pretrained_model_name: checkpoint name passed to
      ``BertModel.from_pretrained`` (default 'bert-base-cased', the value
      that was previously hard-coded).
    dropout: dropout probability applied to the pooled output (default 0.3,
      the value that was previously hard-coded).
  """

  def __init__(self, n_classes, pretrained_model_name='bert-base-cased', dropout=0.3):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(pretrained_model_name)
    self.drop = nn.Dropout(p=dropout)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalised class scores (logits) for a batch.

    Args:
      input_ids: token-id tensor forwarded to the encoder.
      attention_mask: attention-mask tensor forwarded to the encoder.
    """
    encoder_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Take the pooled output by position instead of tuple-unpacking the result.
    # Recent transformers releases return a dict-like ModelOutput, and
    # unpacking that yields its *keys* (strings), which then crash in dropout.
    # Integer indexing works for both the ModelOutput and the legacy tuple.
    pooled_output = encoder_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [None]:

class_names = ['0', '1'] #['Negative', 'Positive'] #['0', '4'] #

# One output unit per class name; the classifier is moved to the selected device
model = SentimentClassifier(len(class_names)).to(device)

# Move the tensors of the previously fetched sample batch to the same device
input_ids, attention_mask = (data[key].to(device) for key in ('input_ids', 'attention_mask'))
for batch_tensor in (input_ids, attention_mask):
  print(batch_tensor.shape) # batch size x seq length

In [None]:

# Smoke test: run the sample batch through the classifier and turn the logits
# into per-class probabilities. Gradients are not needed for this check, and
# the logits get their own name so the builtin `input` is not shadowed.
softmax = nn.Softmax(dim=1)
with torch.no_grad():
  logits = model(input_ids, attention_mask)
output = softmax(logits)
output

**Training**

In [None]:
EPOCHS = 2
LEARNING_RATE = 2e-5
# optimizer = AdamW(model.parameters(), lr=3e-5, correct_bias=False)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)

# The schedule spans every training batch of every epoch, with no warm-up steps
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training pass over ``data_loader`` and update the model.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches (dicts) with the keys "input_ids",
      "attention_mask" and "label".
    loss_fn: callable taking ``(outputs, labels)`` and returning the loss.
    optimizer: stepped once per batch, then its gradients are zeroed.
    device: device every batch tensor is moved to.
    scheduler: stepped once per batch, after the optimizer.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: the correct-prediction count as a double
    tensor divided by ``n_examples``, and the NumPy mean of the batch losses.
  """
  model = model.train()
  batch_losses, n_correct = [], 0
  for batch in data_loader:
    targets = batch["label"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device),
    )
    predicted = torch.max(logits, dim=1).indices
    batch_loss = loss_fn(logits, targets)
    n_correct += (predicted == targets).sum()
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    # Clip the gradient norm to 1.0 before the parameter update
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score the model on ``data_loader`` without updating it.

  The model is switched to eval mode and every forward pass runs under
  ``torch.no_grad()``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches (dicts) with the keys "input_ids",
      "attention_mask" and "label".
    loss_fn: callable taking ``(outputs, labels)`` and returning the loss.
    device: device every batch tensor is moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: the correct-prediction count as a double
    tensor divided by ``n_examples``, and the NumPy mean of the batch losses.
  """
  model = model.eval()
  batch_losses, n_correct = [], 0
  with torch.no_grad():
    for batch in data_loader:
      targets = batch["label"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
      )
      predicted = torch.max(logits, dim=1).indices
      n_correct += (predicted == targets).sum()
      batch_losses.append(loss_fn(logits, targets).item())
  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# History entries may be tensors (possibly on the GPU), which matplotlib
# cannot plot directly; float() turns each one into a plain number and is a
# no-op for values that are already floats.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]
plt.plot(train_acc_curve, label='train accuracy')
plt.plot(val_acc_curve, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0.6, 1]);


**Evaluation**

In [None]:
# Rebuild the classifier and restore the best checkpoint saved during training.
# map_location makes the load work even when the checkpoint was written on a
# GPU and the current runtime only has a CPU.
model = SentimentClassifier(len(class_names))
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [None]:
# Accuracy of the restored model on the held-out test split (the loss is discarded)
test_acc, _ = eval_model(
  model=model,
  data_loader=test_data_loader,
  loss_fn=loss_fn,
  device=device,
  n_examples=len(df_test),
)
test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run the model over ``data_loader`` and collect its predictions.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of batches (dicts) with the keys "text",
      "input_ids", "attention_mask" and "label".
    device: device the batch tensors are moved to. When omitted, the device
      of the model's first parameter is used (CPU if it has no parameters).

  Returns:
    ``(texts, predictions, prediction_probs, real_values)``:
      texts: list with the text of every sample, in loader order.
      predictions: CPU tensor with the predicted class index per sample.
      prediction_probs: CPU tensor with the softmax probabilities per sample.
      real_values: CPU tensor with the true label per sample.
    The three tensors are empty when the loader yields no batches.
  """
  model = model.eval()
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  texts = []
  pred_batches = []
  prob_batches = []
  label_batches = []
  with torch.no_grad():
    for d in data_loader:
      # Use a separate name for the batch texts: rebinding `texts` here used
      # to discard everything collected from the earlier batches.
      batch_texts = d["text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      label = d["label"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      texts.extend(batch_texts)
      pred_batches.append(preds.cpu())
      # Store probabilities, not raw logits, as the return value's name promises
      prob_batches.append(torch.softmax(outputs, dim=1).cpu())
      label_batches.append(label.cpu())

  if not pred_batches:
    empty_ids = torch.empty(0, dtype=torch.long)
    return texts, empty_ids, torch.empty(0, 0), empty_ids.clone()

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(label_batches)
  return texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, per-class outputs and true labels for the test split
test_predictions = get_predictions(model, test_data_loader)
y_review_texts, y_pred, y_pred_probs, y_test = test_predictions

In [None]:
# Per-class precision / recall / F1 on the test split, labelled with class_names
test_report = classification_report(y_test, y_pred, target_names=class_names)
print(test_report)

In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw ``confusion_matrix`` as an annotated blue heat map.

  Note that the parameter name hides the sklearn ``confusion_matrix``
  function inside this body.
  """
  heat_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  # y tick labels stay horizontal, x tick labels are tilted by 30 degrees
  for axis, angle in ((heat_ax.yaxis, 0), (heat_ax.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')

# Confusion matrix of the test predictions, labelled with the class names
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

**COVID Tweets**

In [None]:
# Merge all tweet csv files to one
# import os
# import glob
# import pandas as pd
# os.chdir("/content/drive/My Drive/Dissertation/tweets/")
# extension = 'csv'
# all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# #combine all files in the list
# combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
# #export to csv
# combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
# combined_csv.info()

In [None]:
# read in all covid tweets
tweets = pd.read_csv("/content/combined_csv.csv")
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Preview the raw tweet text. Every later cell reads the text from the
# 'full_text' column, so the preview uses that column too (it previously
# referenced 'tweet_text').
# NOTE(review): confirm the column name against the CSV header.
print(tweets['full_text'].tail())
tweets.info()


In [None]:
# clean up covid tweets for 'clean' runs

# Clean every tweet in one pass. Series.map replaces the per-row `.at` loop,
# which wrote one cell at a time and assumed a 0..n-1 index.
tweets['clean_text'] = tweets['full_text'].map(text_preprocessing)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
tweets.head()

In [None]:
# # for unclean run rename full_text to clean_text
# tweets = tweets.rename(columns = {'full_text': 'clean_text'})

In [None]:
# Measure the encoded length of every cleaned COVID tweet and use the longest
# one as the fixed sequence length for prediction.

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Encodings are truncated at 512 tokens, so no length can exceed that.
token_lens = [
  len(tokenizer.encode(tweet_text, max_length=512, truncation=True))
  for tweet_text in tweets.clean_text
]

MAX_LEN = max(token_lens)
print(MAX_LEN)



In [None]:
# Predicting COVID tweets dataset

# pred_tweets is the same object as tweets: the prediction columns are added in place.
pred_tweets = tweets
PRED_BATCH_SIZE = 32

# Inference only: switch off dropout and gradient tracking, and score the
# tweets in batches instead of one forward pass (and two cell writes) per tweet.
model.eval()
clean_texts = pred_tweets['clean_text'].tolist()
pred_label_names = []
pred_confidences = []
with torch.no_grad():
  for start in range(0, len(clean_texts), PRED_BATCH_SIZE):
    encoded_batch = tokenizer(
      clean_texts[start:start + PRED_BATCH_SIZE],
      max_length=MAX_LEN,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt'
    )
    input_ids = encoded_batch['input_ids'].to(device)
    attention_mask = encoded_batch['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    probabilities = torch.softmax(output, dim=1)
    # Highest class probability and its index for every tweet in the batch
    batch_probs, batch_labels = probabilities.max(dim=1)
    pred_label_names.extend(class_names[idx] for idx in batch_labels.tolist())
    pred_confidences.extend(batch_probs.tolist())

pred_tweets['pred_labels'] = pred_label_names
pred_tweets['probability'] = pred_confidences

print(pred_tweets.head())


In [None]:
# Plotting Positive and Negative tweet probabilities
labels_names = ['0','1']
# One list of prediction probabilities per predicted label, in labels_names order
neg, pos = (
  list(pred_tweets.loc[pred_tweets['pred_labels'] == label_name, 'probability'])
  for label_name in labels_names
)
plt.hist([neg, pos], label=labels_names)
# Plot formatting
plt.legend()
plt.xlabel('Prediction Probability')
plt.ylabel('Count of Tweets')
plt.title('Histogram of probabilities of positive and negative tweets')


In [None]:
# Timestamp for the optional time-stamped export (commented out below)
to_csv_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Write the predictions without the index column
pred_tweets.to_csv('pred_tweets.csv', index=False)
# pred_tweets.to_csv(path_or_buf = r"/content/combined_preds_" + to_csv_timestamp +'.csv', index=False) # make sure drive is mounted!!

In [None]:
# Numbers of positive and negative predictions
# Vectorised comparisons yield the same boolean masks as a row-wise apply
# without one Python call per row.
p = pred_tweets['pred_labels'] == '1'
n = pred_tweets['pred_labels'] == '0'
pRows = int(p.sum())
nRows = int(n.sum())
print('Number tweets predicted as positive: ', pRows)
print('Number tweets predicted as negative: ', nRows)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
pred_tweets.info()

In [None]:
# pred_tweets = pd.read_csv("/content/combined_preds.csv")
# Load the daily case figures, drop the unused columns and rename the daily count
pred_cases = (
  pd.read_csv("/content/data_2020-Nov-04.csv")
  .drop(columns=['areaType','areaName','areaCode','cumCasesBySpecimenDate'])
  .rename(columns={'newCasesBySpecimenDate': 'CasesPerDay'})
)
# Keep dates after 2020-10-15, excluding 2020-11-03; the dates are compared as
# strings (the literals are in YYYY-MM-DD form)
in_window = (pred_cases['date'] > '2020-10-15') & (pred_cases['date'] != '2020-11-03')
pred_cases = pred_cases[in_window].sort_values(by='date').reset_index(drop=True)
print(pred_cases)

In [None]:
# # print(pred_cases.info())
# # pred_cases.drop(columns=['areaType','areaName','areaCode','cumCasesBySpecimenDate'],inplace=True)
# # pred_cases.date  # pd.to_datetime
# # pred_cases['date'] = pd.to_datetime(pred_cases['date']) 
# # date_obj = datetime.strptime('2020-10-15', '%Y-%m-%d')
# # print(type(date_obj), pred_cases.dtypes)
# # mydate = pd.to_datetime('2020-10-15', format='%Y-%m-%d')
# # print(mydate)
# pred_cases = pred_cases[pred_cases.date > '2020-10-15']
# # print(pred_cases)
# pred_cases = pred_cases.sort_values(by='date').reset_index(drop=True)
# print(pred_cases)

In [None]:
# summarise total number of positive and negative predictions per day and combine with cases

pred_tweets['created_at'] = pd.to_datetime(pred_tweets['created_at'])
pred_tweets['date'] = pred_tweets['created_at'].dt.strftime('%Y-%m-%d')

# Number of tweets per (date, predicted label); the count lands in 'created_at'
pred_summary = pred_tweets[['date','pred_labels','created_at']].groupby(['date','pred_labels'],as_index=False).count()

# Predicted labels are the strings '0' / '1' (the entries of class_names). The
# previous loop compared them with the integers 0 / 1, so neither tweet column
# was ever created. Casting to str first also covers integer labels, e.g. when
# predictions are reloaded from a CSV.
label_to_column = {'0': 'negative_tweets', '1': 'positive_tweets'}
tweet_columns = list(label_to_column.values())
daily_counts = (
  pred_summary
  .assign(tweet_column=pred_summary['pred_labels'].astype(str).map(label_to_column))
  .dropna(subset=['tweet_column'])
  .groupby(['date', 'tweet_column'])['created_at'].sum()
  .unstack('tweet_column')
  .reindex(columns=tweet_columns)
  .rename_axis(columns=None)
)

# Left-join on the date so that case dates without tweets get NaN, and tweet
# dates missing from the cases table are ignored instead of raising on
# `.index[0]`. Dropping the columns first keeps the cell safe to re-run.
pred_cases = (
  pred_cases
  .drop(columns=tweet_columns, errors='ignore')
  .merge(daily_counts, how='left', left_on='date', right_index=True)
)

print(pred_cases)

# print(pred_figures.info())

In [None]:
# Plot summary tweets/cases

# Daily cases and both tweet counts are drawn against the date
summary_columns = ['CasesPerDay', 'negative_tweets','positive_tweets']
pred_cases.plot(x='date', y=summary_columns)
