In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import math
import torch.nn.functional as nnf
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from itertools import cycle
from sklearn.preprocessing import label_binarize
import torch
torch.manual_seed(0)
import random
random.seed(0)
np.random.seed(0)

def load_glove_model(File):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by the
    components of its vector.

    Args:
        File: Path to the GloVe ``.txt`` file.

    Returns:
        A dict mapping each token (str) to a 1-D ``np.float64`` array.
    """
    print("Loading Glove Model")
    glove_model = {}
    # Read as UTF-8 explicitly: the platform default codec may not be able
    # to decode non-ASCII tokens.
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            word = split_line[0]
            glove_model[word] = np.array(split_line[1:], dtype=np.float64)
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Tokenizer data required by nltk.word_tokenize.
nltk.download('punkt')

# Training split: 'tweet' holds the text, 'label' the target class.
data = pd.read_csv("./vaccine_train_set.csv")
X1 = data.drop(columns='label')
df_x, Y = X1['tweet'], data['label']

# Validation split (change the file name here to evaluate on another file).
df_val = pd.read_csv("./vaccine_validation_set.csv")
X1_val = df_val.drop(columns='label')
df_x_val, Y_val = X1_val['tweet'], df_val['label']


# Fetch and unpack the GloVe archive. The two lines starting with '!' are
# IPython shell escapes, so they only run inside a notebook/IPython session.
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip   #Download the corpus
!unzip glove.42B.300d.zip

# Number of components per word vector; used below as the length of the
# zero vector for unknown words and of each averaged tweet vector.
num_embeddings = 300
model1 = load_glove_model("./glove.42B.300d.txt")   #Create the GloVe lookup returned by load_glove_model

def _mean_tweet_embedding(tweet, embeddings, dim):
    """Return the mean word vector of ``tweet`` as a list of ``dim`` floats.

    The tweet is tokenized with ``nltk.word_tokenize``. Tokens missing from
    ``embeddings`` contribute a zero vector but still count towards the mean.
    A tweet that produces no tokens maps to the all-zero vector.

    Args:
        tweet: Raw tweet text.
        embeddings: Dict mapping a token to its vector of length ``dim``.
        dim: Number of components per vector.

    Returns:
        A list of ``dim`` floats, so every tweet has the same feature size.
    """
    tokens = nltk.word_tokenize(tweet)
    if not tokens:
        return [0.0] * dim
    zero = np.zeros(dim)
    vectors = [embeddings.get(token, zero) for token in tokens]
    # Average over the tokens of the tweet (dividing by the embedding size
    # would only rescale the sum and make the features depend on tweet length).
    return np.mean(vectors, axis=0).tolist()


# Fixed-size feature vector for every training tweet.
X = [_mean_tweet_embedding(tweet, model1, num_embeddings) for tweet in df_x]

# Do the same for the validation file.
X_val = [_mean_tweet_embedding(tweet, model1, num_embeddings) for tweet in df_x_val]
print(X_val)

class Net(nn.Module):
    """Feed-forward network made of four linear layers.

    ``forward`` applies linear1 -> linear2 -> ReLU -> linear3 -> linear4 and
    returns the raw output of the last layer (no softmax is applied).

    Args:
        D_in: Size of each input sample.
        H1: Output size of the first linear layer.
        H2: Output size of the second linear layer.
        H3: Output size of the third linear layer.
        D_out: Size of each output sample.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()

        # Linear layers, in the order forward() chains them.
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)

        # Activation modules. Only `relu` is used by forward(); the others
        # are registered on the module but never called there.
        self.relu = nn.ReLU()
        self.logsimoid = nn.LogSigmoid()
        self.rrelu = nn.RReLU()
        self.logsoftmax = nn.LogSoftmax()

    def forward(self, x):
        """Map input ``x`` to the output of the last linear layer."""
        # No activation between linear1 and linear2, nor after linear3.
        hidden = self.relu(self.linear2(self.linear1(x)))
        return self.linear4(self.linear3(hidden))

# Features become torch.float tensors; labels are read as float and then
# cast to int64, which is the target dtype used with CrossEntropyLoss below.
x = torch.tensor(X, dtype=torch.float)
x_val = torch.tensor(X_val, dtype=torch.float)
y = torch.tensor(Y, dtype=torch.float).long()
y_val = torch.tensor(Y_val, dtype=torch.float).long()

# Re-seed right before the model is built so its initial weights are
# reproducible.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

# Network dimensions: input width comes from the feature tensor.
D_in = x.size(1)
H1, H2, H3 = 128, 64, 32
D_out = 3

# Hyperparameters
learning_rate = 0.01

model = Net(D_in, H1, H2, H3, D_out)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss()

# Training batches of 64 samples, served in dataset order.
dataset = torch.utils.data.TensorDataset(x, y)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)

# Validation batches of 512 samples.
dataset_val = torch.utils.data.TensorDataset(x_val, y_val)
dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=512)

# Per-epoch history, consumed by the plots that follow.
losses = []     # mean training loss over the batches of each epoch
losses2 = []    # validation loss of each epoch
epoches = []
for epoch in range(100):
    epoches.append(epoch)

    train_loss_total = 0.0
    for x_batch, y_batch in dataloader:
        y_pred = model(x_batch)
        loss = loss_func(y_pred, y_batch)
        train_loss_total += loss.item()
        # Delete previously stored gradients
        optimizer.zero_grad()
        # Perform backpropagation starting from the loss of this batch
        loss.backward()
        # Update model's weights based on the gradients calculated during backprop
        optimizer.step()
    losses.append(train_loss_total / len(dataloader))

    # Validation: one pass over the whole validation tensor. The loss is
    # already a mean over all validation samples, so it must not be divided
    # by the number of validation batches again. No gradients are needed.
    with torch.no_grad():
        val_loss = loss_func(model(x_val), y_val).item()
    losses2.append(val_loss)
# Plot loss vs epoch: one figure for training, one for validation.
# Each curve is drawn once (drawing the same data twice in two colours only
# left the last colour visible).
for title, history in (("TRAINING LOSS", losses), ("VALIDATION LOSS", losses2)):
    plt.plot(epoches, history, 'g')
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()

# Find the F1 score on the validation set.
# Inference only, so gradient tracking is switched off. The validation
# tensor built earlier is reused instead of overwriting the training `x`.
with torch.no_grad():
    Y_predict = nnf.softmax(model(x_val), dim=1)

# Predicted class = index of the highest probability in each row
# (the first index wins on ties).
y_pred2 = Y_predict.argmax(dim=1).tolist()

precision = precision_score(Y_val, y_pred2, average='weighted')
recall = recall_score(Y_val, y_pred2, average='weighted')
# Harmonic mean of the weighted precision and weighted recall.
# NOTE(review): this is not necessarily the same number as a per-class
# weighted F1; confirm which definition is wanted.
denominator = precision + recall
f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
print(f1)
# Plot one ROC curve per class (each class against the rest).
n_classes = 3
lw = 2

# Binary indicator column per class, and class scores as a NumPy array.
y = label_binarize(Y_val, classes=[0, 1, 2])
Y_predict = Y_predict.detach().numpy()

fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    true_column, score_column = y[:, i], Y_predict[:, i]
    fpr[i], tpr[i], _ = roc_curve(true_column, score_column)
    roc_auc[i] = auc(fpr[i], tpr[i])

colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()