# Preliminaries

In [72]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy # install command: pip install spacy
import string
import nltk # install command: pip install nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
import onnxruntime # install command: pip install onnxruntime
import torch.onnx as onnx

In [73]:
# !python3 -m spacy download en_core_web_sm

In [74]:
# nltk.download('wordnet')

In [75]:
# nltk.download('omw-1.4')

In [76]:
config = {
    'max_features': 1000,
    'num_epochs':200,
    'learning_rate':1e-1,
    'batch_size':32,
    'train_percentage':90
}

# Dataset

Note that the dataset is taken from https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset

## Exploration

In [78]:
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [79]:
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [80]:
df = df.loc[df['sentiment']!='neutral']

In [81]:
df = df[['text','sentiment']]
df.head()

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive


In [82]:
df['text'].count()

16363

## Preprocessing

### Lemmatization

In [83]:
lemmatizer = WordNetLemmatizer()

In [84]:
lemmatizer.lemmatize("settings")

'setting'

### Stopword Removal

In [85]:
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

In [86]:
type(stopwords)

set

In [87]:
len(stopwords)

326

In [88]:
list(stopwords)[:10]

['elsewhere',
 'sixty',
 'by',
 'toward',
 'cannot',
 'perhaps',
 'over',
 'down',
 'any',
 'thence']

### Preprocessing Pipeline

In [89]:
def preprocess(txt):
    txt = txt.lower()
    tokens = txt.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    txt = ' '.join(tokens)
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    tokens = txt.split()
    tokens = [token for token in tokens if token not in stopwords]
    txt = ' '.join(tokens)
    txt = re.sub(r'[0-9]+', '', txt)
    return txt

In [91]:
df = df.reset_index() # restore the indices of the dataframe so that it starts from 0 and skips nothing

In [92]:
original_txt = df['text'][50]
processed_txt = preprocess(df['text'][50])
print(f'The original text was:\n{original_txt}\n The preprocessed text is: \n{processed_txt}')

The original text was:
 Well what im working on isn`t QUITE ready to post about publicly (still beta testing) but its a cool new script I coded
 The preprocessed text is: 
im working isnt ready post publicly beta testing cool new script coded


In [93]:
df['preprocessed_text'] = df['text'].apply(lambda x: preprocess(str(x)))
df.head()

Unnamed: 0,index,text,sentiment,preprocessed_text
0,1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad miss san diego
1,2,my boss is bullying me...,negative,bos bullying
2,3,what interview! leave me alone,negative,interview leave
3,4,"Sons of ****, why couldn`t they put them on t...",negative,son couldnt release bought
4,6,2am feedings for the baby are fun when he is a...,positive,am feeding baby fun smile coo


In [94]:
texts = df['preprocessed_text'].to_list()

In [95]:
# vectorizer = CountVectorizer(max_features=config['max_features'])
# features = vectorizer.fit_transform(texts)
vectorizer = TfidfVectorizer(max_features=config['max_features'])
features = vectorizer.fit_transform(texts)

In [96]:
vectorizer.get_feature_names_out()

array(['able', 'absolutely', 'account', 'ache', 'actually', 'add',
       'afford', 'afraid', 'afternoon', 'age', 'ago', 'agree', 'ah',
       'ahh', 'ahhh', 'aint', 'air', 'album', 'alot', 'alright', 'am',
       'amazing', 'america', 'american', 'annoying', 'answer', 'anymore',
       'anyways', 'apart', 'app', 'apparently', 'apple', 'appreciate',
       'arent', 'arm', 'art', 'ask', 'asking', 'asleep', 'ate', 'aw',
       'awake', 'away', 'awesome', 'awful', 'aww', 'awww', 'awwww',
       'babe', 'baby', 'bad', 'bag', 'ball', 'band', 'bank', 'bar',
       'barely', 'bbq', 'bc', 'bday', 'beach', 'beat', 'beautiful', 'bed',
       'beer', 'believe', 'ben', 'best', 'bet', 'better', 'bgt', 'big',
       'bike', 'bird', 'birthday', 'bit', 'black', 'blackberry', 'blah',
       'blast', 'bless', 'blessed', 'block', 'blog', 'blood', 'bloody',
       'blue', 'body', 'boo', 'book', 'bored', 'boring', 'bought', 'bout',
       'boy', 'boyfriend', 'break', 'breakfast', 'breaking', 'bring',
     

In [97]:
np_features = features.toarray()

In [98]:
np_features.shape

(16363, 1000)

We should also work with the labels...

In [99]:
df['num_sentiment'] = df['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
df.head()

Unnamed: 0,index,text,sentiment,preprocessed_text,num_sentiment
0,1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad miss san diego,0
1,2,my boss is bullying me...,negative,bos bullying,0
2,3,what interview! leave me alone,negative,interview leave,0
3,4,"Sons of ****, why couldn`t they put them on t...",negative,son couldnt release bought,0
4,6,2am feedings for the baby are fun when he is a...,positive,am feeding baby fun smile coo,1


In [100]:
labels = df['num_sentiment'].to_list()

In [101]:
labels[:5]

[0, 0, 0, 0, 1]

## Train / Test / Dev Split

In [102]:
f_train, f_rem, l_train, l_rem = train_test_split(np_features, labels, test_size=1-config['train_percentage']/100, random_state=50)
f_test, f_dev, l_test, l_dev = train_test_split(f_rem, l_rem, test_size=0.5, random_state=50)

In [103]:
print(f'train features: {f_train.shape}, dev features: {f_dev.shape}, test features: {f_test.shape}')

train features: (14726, 1000), dev features: (819, 1000), test features: (818, 1000)


In [104]:
print(f'train labels: {len(l_train)}, dev labels: {len(l_dev)}, test labels: {len(l_test)}')

train labels: 14726, dev labels: 819, test labels: 818


## Converting Everything to Tensors

The numpy array we defined above should be converted to a tensor. This tensor will be used in a "Dataset" object.

In [105]:
class MyVectorDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = np.array(labels).reshape(-1, 1)
    def __len__(self):
        return self.features.shape[0]
    def __getitem__(self, idx):
        return torch.Tensor(self.features[idx]), torch.Tensor(self.labels[idx])

In [106]:
train_dataset = MyVectorDataset(f_train, l_train)
test_dataset = MyVectorDataset(f_test, l_test)
dev_dataset = MyVectorDataset(f_dev, l_dev)

In [107]:
train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=config['batch_size'], shuffle=True)

# Neural Net Architecture

In [108]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [109]:
class my_neural_net(torch.nn.Module):
    def __init__(self):
        super(my_neural_net, self).__init__() 
        self.first_layer = torch.nn.Sequential( 
            nn.Linear(config['max_features'], 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        output = self.first_layer(x)
        return output

In [110]:
simple_nn = my_neural_net()

In [111]:
simple_nn = simple_nn.to(device)

In [112]:
simple_nn(train_dataset[:2][0])

tensor([[0.4972],
        [0.4993]], grad_fn=<SigmoidBackward0>)

In [113]:
simple_nn(train_dataset[:2][0]).shape

torch.Size([2, 1])

# Training

## Binary Cross-Entropy

For each point, the loss is calculated like this: $l_n = -w_n[y_n.\log(\mathrm{pred}_n)+(1-y_n).\log(1-\mathrm{pred}_n)]$ where $w_n$ is a rescaling factor</br>

Assume that $w_n=1$

If $y_n=0$ and $\mathrm{pred}_n=1$, then we'll have $l_n=-w_n(0.log(1)+1.log(0))=-w_n(0.log(1)+1.-\infty)=+\infty$

If $y_n=0$ and $\mathrm{pred}_n=0.1$, then we'll have $l_n=-w_n(0.log(0.1)+1.log(0.9))=-(-0.04)=0.04$

If $y_n=0$ and $\mathrm{pred}_n=0.9$, then we'll have $l_n=-w_n(0.log(0.9)+1.log(0.1))=-(-1)=1$

In [114]:
loss_fn = nn.BCELoss()

## Optimizer

Note that stochastic gradient descent performs a parameter update for each training example $x_i$ and label $y_i$

$\theta = \theta - \eta.\nabla_\theta J(\theta;x_i;y_i)$

In [115]:
optimizer = torch.optim.SGD(simple_nn.parameters(), lr=config['learning_rate'])

In [116]:
def output_to_label(out):
    dist_to_0 = abs(out)
    dist_to_1 = abs(out-1)
    if dist_to_0 <= dist_to_1:
        return 0
    else:
        return 1

In [117]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch_num):
    num_points = len(dataloader.dataset)
    for batch, (features, labels) in enumerate(dataloader):        
        # Compute prediction and loss
        pred = model(features)
        loss = loss_fn(pred, labels)
        
        # Backpropagation
        optimizer.zero_grad() # sets gradients of all model parameters to zero
        loss.backward() # calculate the gradients again
        optimizer.step() # w = w - learning_rate * grad(loss)_with_respect_to_w

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(features)
            print(f"\r Epoch {epoch_num} - loss: {loss:>7f}  [{current:>5d}/{num_points:>5d}]", end=" ")


def test_loop(dataloader, model, loss_fn, epoch_num, name):
    num_points = len(dataloader.dataset)
    sum_test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, (features, labels) in enumerate(dataloader):
            pred = model(features)
            sum_test_loss += loss_fn(pred, labels).item() # add the current loss to the sum of the losses
            # convert the outputs of the model on the current batch to a numpy array
            pred_lst = list(pred.numpy().squeeze())
            pred_lst = [output_to_label(item) for item in pred_lst]
            # convert the original labels corresponding to the current batch to a numpy array
            output_lst = list(labels.numpy().squeeze()) 
            # determine the points for which the model is correctly predicting the label (add a 1 for each)
            match_lst = [1 if p==o else 0 for (p, o) in zip(pred_lst, output_lst)] 
            # count how many points are labeled correctly in this batch and add the number to the overall count of the correct labeled points
            correct += sum(match_lst) 
            
    sum_test_loss /= num_points
    correct /= num_points
    print(f"\r Epoch {epoch_num} - {name} Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {sum_test_loss:>8f}", end=" ")

In [118]:
for epoch_num in range(1, config['num_epochs']+1):
    train_loop(train_dataloader, simple_nn, loss_fn, optimizer, epoch_num)
    test_loop(dev_dataloader, simple_nn, loss_fn, epoch_num, 'Development/Validation')

 Epoch 200 - Development/Validation Error: Accuracy: 86.1%, Avg loss: 0.010535 

In [119]:
test_loop(test_dataloader, simple_nn, loss_fn, epoch_num, 'Test')

 Epoch 200 - Test Error: Accuracy: 83.7%, Avg loss: 0.011854 

# Saving the Model

In [120]:
torch.save(simple_nn.state_dict(), "neural_net.pth")

# Load the Model

In [121]:
model = my_neural_net()
model.load_state_dict(torch.load("neural_net.pth"))
model.eval() # use this line if you have Dropout and BatchNormalization layers in your model

my_neural_net(
  (first_layer): Sequential(
    (0): Linear(in_features=1000, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [122]:
model(test_dataset[:2][0])

tensor([[0.1337],
        [0.9956]], grad_fn=<SigmoidBackward0>)

In [123]:
l_test[:2]

[0, 1]

# The ONNX Format

This format is useful when you want to use your model while coding in Java, Javascript, and C#!

## Save the Model

In [124]:
dummy_input = torch.zeros((1,config['max_features']))

In [125]:
onnx.export(model, dummy_input, 'neural_net.onnx')

## Inference

In [126]:
session = onnxruntime.InferenceSession('neural_net.onnx', None) # None: we want all of the outputs

In [127]:
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

In [128]:
input_name

'onnx::Gemm_0'

In [129]:
output_name

'4'

In [130]:
result = session.run([output_name], {input_name: test_dataset[0][0].numpy().reshape(1,-1)})

In [131]:
result

[array([[0.1336531]], dtype=float32)]