<a href="https://colab.research.google.com/github/AnzorGozalishvili/active_learning_playground/blob/main/notebooks/regular_sentiment_analysis_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Sentiment Analysis Pipeline

Here we train simple 2 layer neural network for sentiment analysis. 

- Model: 2 Fully Connected layer NN (PyTorch)
- Dataset: Sentiment Analysis
- Embedding: spacy en_core_web_lg (mean aggregated embeddings of the text)

Install Requirements from [repository](https://github.com/AnzorGozalishvili/active_learning_playground)

In [25]:
!wget https://raw.githubusercontent.com/AnzorGozalishvili/active_learning_playground/main/requirements.txt
!pip install -r requirements.txt
!rm requirements.txt
!pip install spacy-sentence-bert==0.1.2

# Imports

In [26]:
# system
import os
import sys

# data and models
import numpy as np
import pandas as pd
import scipy
import random

# text embeddings
import spacy
import spacy_sentence_bert

# scikit-learn stuff
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

# PyTorch stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# visualization
import matplotlib.pyplot as plt
from tqdm import tqdm

# dataset retrieval
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Set Random Seeds
For reproducibility we set several random seeds which are recommended by PyTorch. ([See here](https://pytorch.org/docs/stable/notes/randomness.html))

In [6]:
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("PyTorch") % 2**32 - 1)

RANDOM_SEED = 42

# Dataset
Let's download dataset from given [url](https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip), then take a look at samples.

## Retrieve dataset

In [7]:
def get_dataset():
    resp = urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
    zipfile = ZipFile(BytesIO(resp.read()))
    
    lines = list()
    for line in zipfile.open('SMSSpamCollection').readlines():
        lines.append(line.decode('utf-8'))
    
    data = pd.DataFrame(data=lines)
    new = data[0].str.split("\t", n = 1, expand = True) 
    data["text"]= new[1] 
    data["label"]= new[0] 
    data.drop(columns=[0], inplace = True)
    
    return data

In [8]:
dataset = get_dataset()

## Explore Samples

In [9]:
dataset.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...\n,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [10]:
dataset.shape

(5574, 2)

## Generate Train/Test splits and move forward

We see the imbalance in target variable

In [11]:
dataset.label.value_counts()

ham     4827
spam     747
Name: label, dtype: int64

We have duplicated records

In [12]:
dataset.duplicated().sum()

403

remove these duplicates

In [13]:
dataset.drop_duplicates(inplace=True)

In [14]:
dataset.reset_index(drop=True, inplace=True)

split into train/test splits with 20/80 ratio

In [15]:
train, test = train_test_split(dataset, test_size=0.2, random_state=RANDOM_SEED)

Store these sets into dataset directory

In [17]:
DATASET_NAME = "SMSSpamCollection"
if not os.path.exists('data'):
    os.mkdir('data')
    if not os.path.exists(f'data/{DATASET_NAME}'):
        os.mkdir(f'data/{DATASET_NAME}')
        
        train.to_csv(f'data/{DATASET_NAME}/train.csv')
        test.to_csv(f'data/{DATASET_NAME}/test.csv')

Load again and continue

In [18]:
train = pd.read_csv(f'data/{DATASET_NAME}/train.csv', index_col=0)
test = pd.read_csv(f'data/{DATASET_NAME}/test.csv', index_col=0)

In [19]:
train.shape, test.shape

((4136, 2), (1035, 2))

In [21]:
train.head(2)

Unnamed: 0,text,label
5132,What about this one then.\n,ham
2067,I will once i get home\n,ham


# Generate Embeddings

We use spacy embeddings to vectorize our samples

In [38]:
class Vectorizer:
    """Generates text embedding using deep learning model"""

    def __init__(self, *args, **kwargs):
        self.model = spacy_sentence_bert.load_model(kwargs.get('model', 'en_paraphrase_distilroberta_base_v1'))
    
    def __call__(self, text):
        if not text:
            text = ""
        
        return self.model(text).vector

In [39]:
vectorizer = Vectorizer()

In [53]:
EMBEDDING_DIM = vectorizer('sample text for embedding').shape[0]; 
EMBEDDING_DIM

768

In [42]:
train['vector'] = train.text.apply(vectorizer)
test['vector'] = test.text.apply(vectorizer)

# PyTorch ML Pipeline

## Model

Example of model is taken from [here](https://github.com/rmunro/pytorch_active_learning/blob/master/active_learning_basics.py)

In [122]:
class MLP(nn.Module):
    """Simple 2 Layer Fully Connected NN (MLP)"""
    
    def __init__(self, num_labels, emb_dim):
        super(MLP, self).__init__()

        # Define model with one hidden layer with 128 neurons
        self.linear1 = nn.Linear(emb_dim, 128)
        self.linear2 = nn.Linear(128, num_labels)

    def forward(self, vector):
        hidden1 = self.linear1(vector).clamp(min=0) # ReLU
        output = self.linear2(hidden1)
        return F.log_softmax(output, dim=1)

In [123]:
MLP(num_labels=2, emb_dim=EMBEDDING_DIM)

MLP(
  (linear1): Linear(in_features=768, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2, bias=True)
)

In [124]:
class Trainer:
    """Trains PyTorch model on training data and also evaluated"""

    def __init__(self, *args, **kwargs):
        self.model = kwargs.get('model', MLP(num_labels=2, emb_dim=EMBEDDING_DIM))
        self.loss_function = kwargs.get('loss_function', nn.NLLLoss())
        self.optimizer = kwargs.get('optimizer', optim.SGD(self.model.parameters(), lr=0.01))
        self.label_to_idx = kwargs.get('label_to_idx', {'ham': 0, 'spam': 1})

    def train(self, training_data, test_data, epochs):
        
        for epoch in range(epochs):
            print(f'Epoch: {str(epoch)}')

            for idx, row in training_data.sample(frac=1.0).iterrows():
                vec = torch.Tensor(row.vector).view(1, -1)
                target = torch.LongTensor([self.label_to_idx[row.label]]).view(1, -1)

                self.model.zero_grad()
                log_probs = self.model(vec)

                loss = self.loss_function(log_probs, target)
                loss.backward()
                self.optimizer.step()

        # fscore, auc = evaluate_model(model, evaluation_data)
        fscore, auc = 0.82323, 0.7544343
        fscore = round(fscore,3)
        auc = round(auc,3)


        # save model to path that is alphanumeric and includes number of items and accuracies in filename
        timestamp = re.sub('\.[0-9]*','_',str(datetime.datetime.now())).replace(" ", "_").replace("-", "").replace(":","")
        accuracies = str(fscore)+"_"+str(auc)              
        model_path = "models/"+timestamp+accuracies+".params"

        torch.save(self.model.state_dict(), model_path)

        return model_path


    def evaluate(self, ):
        logits = self.model()
        preds = model.predict(dataset.text)
        probas = model.predict_proba(dataset.text)[:, 1]
        
        results = {
            "f1": f1_score(dataset.label, preds, pos_label=model.classes_[-1]),
            "precision": precision_score(dataset.label, preds, pos_label=model.classes_[-1]),
            "recall": recall_score(dataset.label, preds, pos_label=model.classes_[-1]),
            "roc_auc": roc_auc_score(dataset.label, probas, labels=model.classes_)
        }
        
        return results

In [125]:


# def evaluate_model(model, evaluation_data):
#     """Evaluate the model on the held-out evaluation data

#     Return the f-value for disaster-related and the AUC
#     """

#     related_confs = [] # related items and their confidence of being related
#     not_related_confs = [] # not related items and their confidence of being _related_

#     true_pos = 0.0 # true positives, etc 
#     false_pos = 0.0
#     false_neg = 0.0

#     with torch.no_grad():
#         for item in evaluation_data:
#             _, text, label, _, _, = item

#             feature_vector = make_feature_vector(text.split(), feature_index)
#             log_probs = model(feature_vector)

#             # get confidence that item is disaster-related
#             prob_related = math.exp(log_probs.data.tolist()[0][1]) 

#             if(label == "1"):
#                 # true label is disaster related
#                 related_confs.append(prob_related)
#                 if prob_related > 0.5:
#                     true_pos += 1.0
#                 else:
#                     false_neg += 1.0
#             else:
#                 # not disaster-related
#                 not_related_confs.append(prob_related)
#                 if prob_related > 0.5:
#                     false_pos += 1.0

#     # Get FScore
#     if true_pos == 0.0:
#         fscore = 0.0
#     else:
#         precision = true_pos / (true_pos + false_pos)
#         recall = true_pos / (true_pos + false_neg)
#         fscore = (2 * precision * recall) / (precision + recall)

#     # GET AUC
#     not_related_confs.sort()
#     total_greater = 0 # count of how many total have higher confidence
#     for conf in related_confs:
#         for conf2 in not_related_confs:
#             if conf < conf2:
#                 break
#             else:                  
#                 total_greater += 1


#     denom = len(not_related_confs) * len(related_confs) 
#     auc = total_greater / denom

#     return[fscore, auc]



In [126]:
LABEL_TO_IDX = {item:idx for idx, item in enumerate(sorted(train.label.unique().tolist()))};
LABEL_TO_IDX

{'ham': 0, 'spam': 1}

In [127]:
mlp = MLP(num_labels=2, emb_dim=EMBEDDING_DIM)

In [131]:
trainer.model(torch.Tensor(vectorizer('sample text')).view(1, -1))

tensor([[-0.6326, -0.7576]], grad_fn=<LogSoftmaxBackward0>)

In [129]:
trainer = Trainer(
    **{
        "model": mlp,
        "loss_function": nn.NLLLoss(),
        "optimizer": optim.SGD(mlp.parameters(), lr=0.01),
        "label_to_idx": LABEL_TO_IDX
    }
)

In [130]:
trainer.train(training_data=train, test_data=test, epochs=10)

Epoch: 0


RuntimeError: ignored