# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a Jupyter notebook for the coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

#### Import necessary packages
You may import more packages here.

In [1]:
# Import necessary packages
import re
from os.path import join
import numpy as np

In [2]:
# Import packages
import numpy as np
import re

import torch
print("My Pytorch version: " + torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Set the device to perform the computation
DEVICE = torch.device('mps')

# Set a fixed seed for reproducibility
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import torchtext
print("Torch Text Version : {}".format(torchtext.__version__))

import json
import math
import nltk
import os, sys
from nltk.util import ngrams

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torch.optim import Adam
from tqdm import tqdm

My Pytorch version: 2.0.0
Torch Text Version : 0.15.1


In [3]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [4]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

In [5]:
# Load training set, dev set and testing set
data = {}
tweetids = {}
tweetgts = {}
tweetgts_num = {}
tweets = {}

# not given
# may need to substitute using tweets
tweets_preprocessed = {}

for dataset in ['twitter-training-data.txt','twitter-dev-data.txt'] + testsets:
    data[dataset] = []
    tweets[dataset] = []
    tweetids[dataset] = []
    tweetgts[dataset] = []
    tweetgts_num[dataset] = []
    tweets_preprocessed[dataset] = []

    # write code to read in the datasets here
    with open(dataset) as f:
        for line in f:
            twitter_object = line # load the line, containing a twitter object
            twitter_object = twitter_object.lower() # convert to lowercase
            words_list = twitter_object.split()
            words_req = words_list[2:]
            tweet = ' '.join(words_req)
            ids_reqd=words_list[0]
            sentiment = words_list[1]
            if sentiment == 'positive' or sentiment =='neutral':
                sent_int = 0
            else:
                sent_int = 1
            # list of twitter objects
            data[dataset].append(twitter_object)
            # list of tweets
            tweets[dataset].append(tweet)
            # list of ids
            tweetids[dataset].append(ids_reqd)
            # list of sentiments
            tweetgts[dataset].append(sentiment)
            # list of sentiments num
            tweetgts_num[dataset].append(sent_int)
            
    # Tweet preprocessing
    
    # Apply our RegExp to the example document
    # Remove twitter handles
    new_tweets = []
    for tweet in tweets[dataset]:
        tweet_without_handles = re.sub('@[\w]*', '', tweet)
        new_tweets.append(tweet_without_handles)

    # Remove URLs
    new_tweets1 = []
    for tweet in new_tweets:
        tweet_without_url = re.sub(r"((https?|ftp)://)?[a-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+\.[a-z]{2,}[a-z0-9\-._~:/?#[\]@!$&'()*+,;=%]*","",tweet)
        new_tweets1.append(tweet_without_url)
    
    # Remove non alpha characters except hashtags
    new_tweets_2=[]
    for tweet in new_tweets1:
        tweet_without_nonalpha = re.sub(r"[^A-Za-z# ]","",tweet)
        new_tweets_2.append(tweet_without_nonalpha)
    
    # Remove numbers
    new_tweets_3=[]
    for tweet in new_tweets_2:
        tweets_without_num = re.sub(r"\b[0-9]+\b","",tweet)
        new_tweets_3.append(tweets_without_num)
    
    # Remove words with three or less characters
    for tweet in new_tweets_3:
        tweet_with_atleast_four_char = ' '.join(word for word in tweet.split() if len(word)>3)
        tweets_preprocessed[dataset].append(tweet_with_atleast_four_char)  


Check how to handle hashtags.

### Preliminary Analysis

Visualise the preprocessed training set in a dataframe as follows:

In [37]:
import pandas as pd
train_col = [tweetids['twitter-training-data.txt'], tweets_preprocessed['twitter-training-data.txt'], tweetgts['twitter-training-data.txt']]
train_df = pd.DataFrame (train_col).transpose()
train_df.columns = ['ids', 'tweets','sentiments']
print(train_df)

                      ids                                             tweets  \
0      335104872099066692  felt privileged play fighters songs guitar tod...   
1      796528524030124618  pakistan islamic country true muslims india lo...   
2      760964834217238632  happy birthday coolest golfer bali become cool...   
3      147713180324524046                       tmills going tucson thursday   
4      732302280474120023  hmmmmm where #blacklivesmatter when matters li...   
...                   ...                                                ...   
45096  660374218263817235  sunday cinema paul mccartney david gilmour pau...   
45097  739323365061217061  independence sacrifices muslims victory pakist...   
45098  681369726697754114  september arrived which means apples iphone ju...   
45099  922217029064536808  tomorrow some filled feeding league prolly som...   
45100  606913141028836185  alright whos choosing paul mccartney over week...   

      sentiments  
0       positive  
1

In [38]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45101 entries, 0 to 45100
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ids         45101 non-null  object
 1   tweets      45101 non-null  object
 2   sentiments  45101 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [40]:
train_df['tweets'].value_counts

<bound method IndexOpsMixin.value_counts of 0        felt privileged play fighters songs guitar tod...
1        pakistan islamic country true muslims india lo...
2        happy birthday coolest golfer bali become cool...
3                             tmills going tucson thursday
4        hmmmmm where #blacklivesmatter when matters li...
                               ...                        
45096    sunday cinema paul mccartney david gilmour pau...
45097    independence sacrifices muslims victory pakist...
45098    september arrived which means apples iphone ju...
45099    tomorrow some filled feeding league prolly som...
45100    alright whos choosing paul mccartney over week...
Name: tweets, Length: 45101, dtype: object>

In [39]:
# check for class imbalance
train_df['sentiments'].value_counts()

neutral     20789
positive    15986
negative     8326
Name: sentiments, dtype: int64

Note that there are very small number of negative tweets compared to neutral and positive. We need to accurately predict negative classes. So we will group the neutral and positive classes together as 0, and the negative labels will be encoded as 1.

In [41]:
# Tokenisation
tokenized_tweet = train_df['tweets'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [felt, privileged, play, fighters, songs, guit...
1    [pakistan, islamic, country, true, muslims, in...
2    [happy, birthday, coolest, golfer, bali, becom...
3                    [tmills, going, tucson, thursday]
4    [hmmmmm, where, #blacklivesmatter, when, matte...
Name: tweets, dtype: object

In [42]:
# stemming
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

0    [felt, privileg, play, fighter, song, guitar, ...
1    [pakistan, islam, countri, true, muslim, india...
2    [happi, birthday, coolest, golfer, bali, becom...
3                        [tmill, go, tucson, thursday]
4    [hmmmmm, where, #blacklivesmatt, when, matter,...
Name: tweets, dtype: object

In [43]:
# stitch these tokens back together
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

train_df['tidy_tweet'] = tokenized_tweet
train_df

Unnamed: 0,ids,tweets,sentiments,tidy_tweet
0,335104872099066692,felt privileged play fighters songs guitar tod...,positive,felt privileg play fighter song guitar today w...
1,796528524030124618,pakistan islamic country true muslims india lo...,positive,pakistan islam countri true muslim india love ...
2,760964834217238632,happy birthday coolest golfer bali become cool...,positive,happi birthday coolest golfer bali becom coole...
3,147713180324524046,tmills going tucson thursday,negative,tmill go tucson thursday
4,732302280474120023,hmmmmm where #blacklivesmatter when matters li...,negative,hmmmmm where #blacklivesmatt when matter like ...
...,...,...,...,...
45096,660374218263817235,sunday cinema paul mccartney david gilmour pau...,neutral,sunday cinema paul mccartney david gilmour pau...
45097,739323365061217061,independence sacrifices muslims victory pakist...,neutral,independ sacrific muslim victori pakistan proud
45098,681369726697754114,september arrived which means apples iphone ju...,positive,septemb arriv which mean appl iphon just hour ...
45099,922217029064536808,tomorrow some filled feeding league prolly som...,positive,tomorrow some fill feed leagu prolli some skul...


Stemming is not required as it creates some meaningless tokens and changes the meaning for some other tokens.

### Traditional ML models

In [6]:
# training + validation
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

classifier = 'Logistic Regression'
testset = 'twitter-dev-data.txt'

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

document = []
for dataset in ['twitter-training-data.txt', testset] :
    document += tweets_preprocessed[dataset]

# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(document)

# labels
ytrain = tweetgts['twitter-training-data.txt']
ytest = tweetgts[testset]

train_tfidf = tfidf[:45101,:]
test_tfidf = tfidf[45101:,:]

xtrain_tfidf = train_tfidf[range(len(ytrain))]
xtest_tfidf = test_tfidf[range(len(ytest))]

if classifier == 'Logistic Regression':
    lreg = LogisticRegression()
    lreg.fit(xtrain_tfidf, ytrain)

    preds = lreg.predict(xtest_tfidf)
    pred_list = list(preds)
    
elif classifier == 'SVM':
    svc = svm.SVC(kernel='linear', C=1).fit(xtrain_tfidf, ytrain)

    preds = svc.predict(xtest_tfidf)
    pred_list = list(preds)
    
elif classifier == 'Random Forest':
    rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_tfidf, ytrain)
    preds = rf.predict(xtest_tfidf)
    pred_list = list(preds)
    
elif classifier == 'Decision Tree':
    model_dec = DecisionTreeClassifier(max_depth=10, random_state=0)
    model_dec.fit(xtrain_tfidf, y_train)
    preds = model_dec.predict(xtest_tfidf)
    pred_list = list(preds)
    
for i in range(len(pred_list)):
    if pred_list[i]==0:
        pred_list[i]='positive'
    else:
        pred_list[i]='negative'
        
id_preds = dict(zip(tweetids[testset], pred_list))

evaluate(id_preds, testset, classifier)

twitter-dev-data.txt (Logistic Regression): 0.159


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Use gridsearch to perform hyperparameter optimisation
# from sklearn.model_selection import GridSearchCV

# parameters = {'kernel': ('linear', 'rbf'), 'C': (0.1,1,10,100,1000)}

# svc = svm.SVC()
# clf = GridSearchCV(svc, parameters, cv=10, n_jobs=-1) ## `-1` run in parallel
# clf.fit(xtrain_tfidf, ytrain)

### Encoding the datasets

In [8]:
# BOW
# Use the torchtext tokenizer, it builds a vocabulary on the training and test set
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def build_vocab(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)
# in tokens: the words can repeat.
# in the dictionary, there are unique words
# vocab: unique list of words we are going to use
# build the dictionary
# specials token: for all the words that are not in the dictionary
vocab = build_vocab_from_iterator(build_vocab([tweets_preprocessed['twitter-training-data.txt'], tweets_preprocessed['twitter-dev-data.txt']]), specials=["<UNK>"])

# Defaults symbol for unknown words 
vocab.set_default_index(vocab["<UNK>"])

#creating vocabulary
voc = [w for w in vocab.get_stoi()]
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in voc:
    vocab2index[word] = len(words)
    words.append(word)
    
def encode_sentence(text, vocab2index, N=30):
    tokenized = tokenizer(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [9]:
# encode testsets 1,2,3,4,5
tweets_encoded = {}
for dataset in ['twitter-training-data.txt','twitter-dev-data.txt']+testsets:
    tweets_encoded[dataset] = []
    for preprocessed_tweet in tweets_preprocessed[dataset]:
        encoded_tweet = encode_sentence(preprocessed_tweet, vocab2index, N=30)
        tweets_encoded[dataset].append(encoded_tweet)

In [10]:
# create train and val sets
X_train = tweets_encoded['twitter-training-data.txt']
X_valid = tweets_encoded['twitter-dev-data.txt']
y_train = tweetgts_num['twitter-training-data.txt']
y_valid = tweetgts_num['twitter-dev-data.txt']

In [11]:
# make a dictionary of testset sentiments
id_gts = {}
for testset in testsets:
    id_gts[testset] = read_test(testset)
    for id in id_gts[testset]:
        if id_gts[testset][id] == 'positive' or id_gts[testset][id]=='neutral':
            id_gts[testset][id] = 0
        else:
            id_gts[testset][id] = 1

In [12]:
# create test sets
X_test_1 = tweets_encoded['twitter-test1.txt']
X_test_2 = tweets_encoded['twitter-test2.txt']
X_test_3 = tweets_encoded['twitter-test3.txt']
y_test_1 = list(id_gts['twitter-test1.txt'].values())
y_test_2 = list(id_gts['twitter-test2.txt'].values())
y_test_3 = list(id_gts['twitter-test3.txt'].values())

## Neural Network

In [13]:
# BOW
# Use the torchtext tokenizer, it builds a vocabulary on the training and test set
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def build_vocab(datasets):
    for dataset in datasets:
        for text in dataset:
            yield tokenizer(text)
# in tokens: the words can repeat.
# in the dictionary, there are unique words
# vocab: unique list of words we are going to use
# build the dictionary
# specials token: for all the words that are not in the dictionary
vocab = build_vocab_from_iterator(build_vocab([tweets_preprocessed['twitter-training-data.txt'], tweets_preprocessed['twitter-dev-data.txt']]), specials=["<UNK>"])

# Defaults symbol for unknown words 
vocab.set_default_index(vocab["<UNK>"])

#creating vocabulary
voc = [w for w in vocab.get_stoi()]
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in voc:
    vocab2index[word] = len(words)
    words.append(word)

In [14]:
# Create Pytorch dataset
from torch.utils.data import Dataset
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx]
    
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

batch_size = 2000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [15]:
batch_size = 2000
vocab_size = len(words)
X = {}
Y = {}
ds = {}
dl = {}
for dataset in ['twitter-training-data.txt','twitter-dev-data.txt']+testsets:
    X[dataset] = tweets_encoded[dataset]
    Y[dataset] = tweetgts_num[dataset]
    ds[dataset] = ReviewsDataset(X[dataset], Y[dataset])
    dl[dataset] = DataLoader(ds[dataset], batch_size=batch_size)

In [16]:
# Example - 1 batch
for X, Y in dl['twitter-dev-data.txt']:
    print(X.shape, Y.shape)
    break

torch.Size([2000, 30]) torch.Size([2000])


In [17]:
# GLOVE
def load_glove_vectors(glove_file="glove.6B.100d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

def get_emb_matrix(pretrained, word_counts, emb_size = 100):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

voc = [w for w in vocab.get_stoi()]
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, voc)

In [18]:
def TrainingLoop(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    for i in range(1, epochs+1):
        losses = []
        # Cycle over the training examples (using minibatches)
        # X are the examples, Y are the associated labels
        for X, Y in tqdm(train_loader):
            # Make the prediction
            Y_preds = model(X)

            # Compute the loss
            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            # Reset the gradient
            optimizer.zero_grad()
            
            # Compute the gradient
            loss.backward()
            
            # Update the weights
            optimizer.step()

In [19]:
from torch.optim import Adam
 
learning_rate = 1e-3

# Loss Function
loss_fn = nn.CrossEntropyLoss()

# Optimizer
#optimizer = optim.SGD(text_classifier.parameters(), lr=0.01, momentum=0.9)

In [20]:
def Prediction(model, loss_fn, dataset):
    # -- Disable the gradient --
    # saves computations
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in dl[dataset]:
            preds = model(X)
            Y_preds.append(preds.argmax(dim=-1))
            Y_shuffled.append(Y)

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds    = torch.cat(Y_preds)

        return Y_shuffled, Y_preds

## LSTM

In [21]:
setup_seed(42)

class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 2)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
    
lstm_model = LSTM_glove_vecs(vocab_size, 100, 100, pretrained_weights)

In [22]:
optimizer = Adam(lstm_model.parameters(), lr=learning_rate)
TrainingLoop(lstm_model, loss_fn, optimizer, dl['twitter-training-data.txt'], dl['twitter-dev-data.txt'])

100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.94it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.94it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.98it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.95it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.95it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.97it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.93it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.99it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.97it/s]
100%|███████████████████████████████████████████| 23/23 [00:07<00:00,  2.97it/s]


In [23]:
classifier = 'LSTM'
pred_list = Prediction(lstm_model, loss_fn, 'twitter-dev-data.txt')[1].tolist()
for i in range(len(pred_list)):
    if pred_list[i] == 0:
        pred_list[i] ='positive'
    else:
        pred_list[i] ='negative'
        
id_preds = dict(zip(tweetids['twitter-dev-data.txt'], pred_list))
evaluate(id_preds, 'twitter-dev-data.txt', classifier)

twitter-dev-data.txt (LSTM): 0.530


# Bi_LSTM

In [24]:
setup_seed(42)

# basic lstm with glove embedding
class Bi_LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,bidirectional = True)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(hidden_dim, 100)
        self.relu = nn.ReLU()
        self.norm1 = nn.BatchNorm1d(100)
        
        self.fc2 = nn.Linear(100,50)
        self.relu = nn.ReLU()
        self.norm2 = nn.BatchNorm1d(50)
        self.fc3 = nn.Linear(50,25)
        self.relu = nn.ReLU()
        self.norm3 = nn.BatchNorm1d(25)
        self.fc4 = nn.Linear(25,2)
        self.sig = nn.Sigmoid() # reduces performance
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm1_out, (ht1, ct1) = self.lstm1(x)
        out = self.dropout(ht1[-1])
        out = self.fc1(out)
        out = self.relu(out)
        out = self.norm1(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.norm2(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.norm3(out)
        out = self.fc4(out)
        out = self.sig(out)
        return out
        
    
bi_lstm_model = Bi_LSTM(vocab_size, 100, 300, pretrained_weights)

In [25]:
optimizer = Adam(bi_lstm_model.parameters(), lr=learning_rate)
TrainingLoop(bi_lstm_model, loss_fn, optimizer, dl['twitter-training-data.txt'], dl['twitter-dev-data.txt'])

100%|███████████████████████████████████████████| 23/23 [01:10<00:00,  3.07s/it]
100%|███████████████████████████████████████████| 23/23 [01:09<00:00,  3.04s/it]
100%|███████████████████████████████████████████| 23/23 [01:09<00:00,  3.02s/it]
100%|███████████████████████████████████████████| 23/23 [01:08<00:00,  3.00s/it]
100%|███████████████████████████████████████████| 23/23 [01:08<00:00,  2.99s/it]
100%|███████████████████████████████████████████| 23/23 [01:09<00:00,  3.01s/it]
100%|███████████████████████████████████████████| 23/23 [01:09<00:00,  3.00s/it]
100%|███████████████████████████████████████████| 23/23 [01:09<00:00,  3.00s/it]
100%|███████████████████████████████████████████| 23/23 [01:08<00:00,  2.99s/it]
100%|███████████████████████████████████████████| 23/23 [01:08<00:00,  3.00s/it]


In [26]:
classifier = 'Bi_LSTM'
pred_list = Prediction(bi_lstm_model, loss_fn, 'twitter-dev-data.txt')[1].tolist()
for i in range(len(pred_list)):
    if pred_list[i] == 0:
        pred_list[i] ='positive'
    else:
        pred_list[i] ='negative'
        
id_preds = dict(zip(tweetids['twitter-dev-data.txt'], pred_list))
evaluate(id_preds, 'twitter-dev-data.txt', classifier)

twitter-dev-data.txt (Bi_LSTM): 0.507


## Bi_LSTM_GRU

In [27]:
setup_seed(42)

class Bi_LSTM_GRU(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,bidirectional = True)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(hidden_dim, 100)
        self.relu = nn.ReLU()
        self.norm1 = nn.BatchNorm1d(100)
        self.gru = nn.GRU(100,75,batch_first=True)
        self.fc2 = nn.Linear(75,50)
        self.relu = nn.ReLU()
        self.norm2 = nn.BatchNorm1d(50)
        self.fc3 = nn.Linear(50,25)
        self.relu = nn.ReLU()
        self.norm3 = nn.BatchNorm1d(25)
        self.fc4 = nn.Linear(25,2)
        self.sig = nn.Sigmoid() # reduces performance
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm1_out, (ht1, ct1) = self.lstm1(x)
        out = self.fc1(ht1[-1])
        out = self.relu(out)
        out = self.norm1(out)
        out,_ = self.gru(out) #
        out = out.reshape(out.shape[0], -1) #
        out = self.fc2(out)
        out = self.relu(out)
        out = self.norm2(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.norm3(out)
        out = self.fc4(out)
        out = self.sig(out)
        return out
bi_lstm_gru_model = Bi_LSTM_GRU(vocab_size, 100, 300, pretrained_weights)

In [33]:
optimizer = Adam(bi_lstm_gru_model.parameters(), lr=learning_rate)
TrainingLoop(bi_lstm_gru_model, loss_fn, optimizer, dl['twitter-training-data.txt'], dl['twitter-dev-data.txt'])

100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.25s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.25s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]
100%|███████████████████████████████████████████| 23/23 [06:18<00:00, 16.45s/it]
100%|███████████████████████████████████████████| 23/23 [01:14<00:00,  3.24s/it]


In [34]:
classifier = 'Bi_LSTM_GRU'
pred_list = Prediction(bi_lstm_gru_model, loss_fn, 'twitter-dev-data.txt')[1].tolist()
for i in range(len(pred_list)):
    if pred_list[i] == 0:
        pred_list[i] ='positive'
    else:
        pred_list[i] ='negative'
        
id_preds = dict(zip(tweetids['twitter-dev-data.txt'], pred_list))
evaluate(id_preds, 'twitter-dev-data.txt', classifier)

twitter-dev-data.txt (Bi_LSTM_GRU): 0.520


#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

In [31]:
classifier = 'LSTM'
model = bi_lstm_gru_model
testset = 'twitter-dev-data.txt'
pred_list = Prediction(model, loss_fn, testset)[1].tolist()
for i in range(len(pred_list)):
    if pred_list[i] == 0:
        pred_list[i] ='positive'
    else:
        pred_list[i] ='negative'
        
        
id_preds = dict(zip(tweetids[testset], pred_list))
evaluate(id_preds, testset, classifier)

twitter-dev-data.txt (LSTM): 0.327


In [35]:
# Buid traditional sentiment classifiers. An example classifier name 'svm' is given
# in the code below. You should replace the other two classifier names
# with your own choices. For features used for classifier training, 
# the 'bow' feature is given in the code. But you could also explore the 
# use of other features.
for classifier in ['Logistic Regression','Decision Tree','Naive Bayes','LSTM','Bi_LSTM','Bi_LSTM_GRU']:
    for features in ['bow']:
        # Skeleton: Creation and training of the classifiers
        #if classifier == 'svm':
            # write the svm classifier here
            #model = svm.SVC(kernel='linear', C=1)
            #print('Training ' + classifier)
        if classifier == 'Logistic Regression':
            # write the classifier 2 here
            model = LogisticRegression()
            print('Training ' + classifier)
        elif classifier == 'Random Forest':
            # write the classifier 3 here
            model = RandomForestClassifier(n_estimators=400, random_state=11)
            print('Training ' + classifier)
        elif classifier == 'Decision Tree':
            model = DecisionTreeClassifier(max_depth=10, random_state=0)
            print('Training ' + classifier)
        elif classifier == 'Naive Bayes':
            model = GaussianNB()
        elif classifier == 'LSTM':
            # write the LSTM classifier here
            class LSTM_glove_vecs(torch.nn.Module) :
                def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
                    super().__init__()
                    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                    self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
                    self.embeddings.weight.requires_grad = False ## freeze embeddings
                    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
                    self.linear = nn.Linear(hidden_dim, 2)
                    self.dropout = nn.Dropout(0.2)
        
                def forward(self, x):
                    x = self.embeddings(x)
                    x = self.dropout(x)
                    lstm_out, (ht, ct) = self.lstm(x)
                    return self.linear(ht[-1])
            # model = LSTM_glove_vecs(vocab_size, 100, 100, pretrained_weights)
            # learning_rate = 1e-3
            # loss_fn = nn.CrossEntropyLoss() # Loss Function
            #optimizer = optim.SGD(text_classifier.parameters(), lr=0.01, momentum=0.9)
            # optimizer = Adam(model.parameters(), lr=learning_rate) # Optimizer
            # TrainingLoop(model, loss_fn, optimizer, dl['twitter-training-data.txt'], dl['twitter-dev-data.txt'])
            print('Training ' + classifier)
        elif classifier == 'Bi_LSTM':
            class Bi_LSTM(torch.nn.Module) :
                def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
                    super().__init__()
                    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                    self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
                    self.embeddings.weight.requires_grad = False ## freeze embeddings
                    self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,bidirectional = True)
                    self.dropout = nn.Dropout(0.2)
                    self.fc1 = nn.Linear(hidden_dim, 100)
                    self.relu = nn.ReLU()
                    self.norm1 = nn.BatchNorm1d(100)
        
                    self.fc2 = nn.Linear(100,50)
                    self.relu = nn.ReLU()
                    self.norm2 = nn.BatchNorm1d(50)
                    self.fc3 = nn.Linear(50,25)
                    self.relu = nn.ReLU()
                    self.norm3 = nn.BatchNorm1d(25)
                    self.fc4 = nn.Linear(25,2)
                    self.sig = nn.Sigmoid() # reduces performance
        
                def forward(self, x):
                    x = self.embeddings(x)
                    x = self.dropout(x)
                    lstm1_out, (ht1, ct1) = self.lstm1(x)
                    out = self.dropout(ht1[-1])
                    out = self.fc1(out)
                    out = self.relu(out)
                    out = self.norm1(out)
                    out = self.fc2(out)
                    out = self.relu(out)
                    out = self.norm2(out)
                    out = self.fc3(out)
                    out = self.relu(out)
                    out = self.norm3(out)
                    out = self.fc4(out)
                    out = self.sig(out)
                    return out
            print('Training ' + classifier)
        elif classifier == 'Bi_LSTM_GRU':
            class Bi_LSTM_GRU(torch.nn.Module) :
                def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
                    super().__init__()
                    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
                    self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
                    self.embeddings.weight.requires_grad = False ## freeze embeddings
                    self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,bidirectional = True)
                    self.dropout = nn.Dropout(0.2)
                    self.fc1 = nn.Linear(hidden_dim, 100)
                    self.relu = nn.ReLU()
                    self.norm1 = nn.BatchNorm1d(100)
                    self.gru = nn.GRU(100,75,batch_first=True)
                    self.fc2 = nn.Linear(75,50)
                    self.relu = nn.ReLU()
                    self.norm2 = nn.BatchNorm1d(50)
                    self.fc3 = nn.Linear(50,25)
                    self.relu = nn.ReLU()
                    self.norm3 = nn.BatchNorm1d(25)
                    self.fc4 = nn.Linear(25,2)
                    self.sig = nn.Sigmoid() # reduces performance
        
                def forward(self, x):
                    x = self.embeddings(x)
                    x = self.dropout(x)
                    lstm1_out, (ht1, ct1) = self.lstm1(x)
                    out = self.fc1(ht1[-1])
                    out = self.relu(out)
                    out = self.norm1(out)
                    out,_ = self.gru(out) #
                    out = out.reshape(out.shape[0], -1) #
                    out = self.fc2(out)
                    out = self.relu(out)
                    out = self.norm2(out)
                    out = self.fc3(out)
                    out = self.relu(out)
                    out = self.norm3(out)
                    out = self.fc4(out)
                    out = self.sig(out)
                    return out
            print('Training ' + classifier)
        else:
            print('Unknown classifier name' + classifier)
            continue

        # Predition performance of the classifiers
        for testset in testsets:
            id_preds = {}
            # write the prediction and evaluation code here
            if classifier == 'LSTM':
                pred_list = Prediction(lstm_model, loss_fn, testset)[1].tolist()
                for i in range(len(pred_list)):
                    if pred_list[i] == 0:
                        pred_list[i] ='positive'
                    else:
                        pred_list[i] ='negative'
            elif classifier == 'Bi_LSTM':
                pred_list = Prediction(bi_lstm_model, loss_fn, testset)[1].tolist()
                for i in range(len(pred_list)):
                    if pred_list[i] == 0:
                        pred_list[i] ='positive'
                    else:
                        pred_list[i] ='negative'
            elif classifier == 'Bi_LSTM_GRU':
                pred_list = Prediction(bi_lstm_gru_model, loss_fn, testset)[1].tolist()
                for i in range(len(pred_list)):
                    if pred_list[i] == 0:
                        pred_list[i] ='positive'
                    else:
                        pred_list[i] ='negative'
                
            else:
                document = []
                for dataset in ['twitter-training-data.txt', testset] :
                    document += tweets_preprocessed[dataset]
                # TF-IDF feature matrix
                tfidf = tfidf_vectorizer.fit_transform(document)
                ytrain = tweetgts['twitter-training-data.txt']
                ytest = tweetgts[testset]
                train_tfidf = tfidf[:45101,:]
                test_tfidf = tfidf[45101:,:]

                xtrain_tfidf = train_tfidf[range(len(ytrain))]
                xtest_tfidf = test_tfidf[range(len(ytest))]
                if classifier=='Naive Bayes':
                    model.fit(xtrain_tfidf.toarray(), ytrain)
                    preds = model.predict(xtest_tfidf.toarray())
                else:
                    model.fit(xtrain_tfidf, ytrain)
                    preds = model.predict(xtest_tfidf)
                pred_list = list(preds)
            
            
            id_preds = dict(zip(tweetids[testset], pred_list))

            testset_name = testset
            testset_path = join('semeval-tweets', testset_name)
            evaluate(id_preds, testset_path, features + '-' + classifier)

Training Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


semeval-tweets/twitter-test1.txt (bow-Logistic Regression): 0.467


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


semeval-tweets/twitter-test2.txt (bow-Logistic Regression): 0.495


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


semeval-tweets/twitter-test3.txt (bow-Logistic Regression): 0.456
Training Decision Tree
semeval-tweets/twitter-test1.txt (bow-Decision Tree): 0.261
semeval-tweets/twitter-test2.txt (bow-Decision Tree): 0.275
semeval-tweets/twitter-test3.txt (bow-Decision Tree): 0.255
semeval-tweets/twitter-test1.txt (bow-Naive Bayes): 0.487
semeval-tweets/twitter-test2.txt (bow-Naive Bayes): 0.509
semeval-tweets/twitter-test3.txt (bow-Naive Bayes): 0.484
Training LSTM
semeval-tweets/twitter-test1.txt (bow-LSTM): 0.492
semeval-tweets/twitter-test2.txt (bow-LSTM): 0.544
semeval-tweets/twitter-test3.txt (bow-LSTM): 0.490
Training Bi_LSTM
semeval-tweets/twitter-test1.txt (bow-Bi_LSTM): 0.544
semeval-tweets/twitter-test2.txt (bow-Bi_LSTM): 0.566
semeval-tweets/twitter-test3.txt (bow-Bi_LSTM): 0.522
Training Bi_LSTM_GRU
semeval-tweets/twitter-test1.txt (bow-Bi_LSTM_GRU): 0.543
semeval-tweets/twitter-test2.txt (bow-Bi_LSTM_GRU): 0.551
semeval-tweets/twitter-test3.txt (bow-Bi_LSTM_GRU): 0.503


In [36]:
print(classifier)
print(testset)
confusion(id_preds, testset, classifier)

Bi_LSTM_GRU
twitter-test3.txt
            positive  negative  neutral
positive    0.464     0.113     0.423     
negative    0.227     0.427     0.347     
neutral     0.000     0.000     0.000     



### Random Classifier

In [49]:
import random
testset = 'twitter-dev-data.txt'
prediction_list = list(np.zeros(len(tweetids[testset])))
for i in range(len(prediction_list)):
    prediction_list[i]=random.randint(0,1)

In [51]:
for i in range(len(prediction_list)):
    if prediction_list[i] == 0:
        prediction_list[i] ='positive'
    else:
        prediction_list[i] ='negative'

In [53]:
id_preds = dict(zip(tweetids[testset], prediction_list))
evaluate(id_preds, testset, 'Random Classifier')

twitter-dev-data.txt (Random Classifier): 0.376
