In [100]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import emoji
import nltk.tokenize as tk
import gensim as gsm
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import defaultdict
from contraction_map import contraction_map as cm

In [211]:
import random

SEED = 2022

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc4abea8930>

## Data Loading & Preprocessing

In [140]:
data = pd.read_excel('Data/emoji2vec_data/emoji2vec_train.xlsx')[['content', 'label']]
test = pd.read_excel('Data/emoji2vec_data/emoji2vec_test.xlsx')[['content', 'label']]

### Data cleaning functions

In [209]:
# reference: https://www.kaggle.com/code/stoicstatic/twitter-sentiment-analysis-using-word2vec-bilstm 

urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

def is_alnum_or_emoji_or_space(char):
    return char.isalnum() or emoji.is_emoji(char) or char in ('\t', ' ')

def preprocess_apply(tweet):

    tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    # Replace @USERNAME to '<user>'.
    tweet = re.sub(userPattern,'<user>', tweet)
    
    # Replace 3 or more consecutive letters by 2 letter.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    for contraction, replacement in cm.CONTRACTION_MAP.items():
        tweet = tweet.replace(contraction, replacement)

    # Remove non-alphanumeric and symbols
    tweet = ''.join(filter(is_alnum_or_emoji_or_space, tweet))

    # Adding space on either side of '/' to seperate words (After replacing URLS).
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

# End of reference. The following code is wrote by me.

def emoji2description(text):  
    return emoji.replace_emoji(text, replace=lambda chars, data_dict: ' '.join(data_dict['en'].split('_')).strip(':'))

def emoji2concat_description(text):
    emoji_list = emoji.emoji_list(text)
    ret = emoji.replace_emoji(text, replace='').strip()
    for json in emoji_list:
        this_desc = ' '.join(emoji.EMOJI_DATA[json['emoji']]['en'].split('_')).strip(':')
        ret += ' ' + this_desc
    return ret

def extract_emojis(text):
    emoji_list = emoji.emoji_list(text)
#     print(emoji_list)
    ret = []
    for json in emoji_list:
        this_emoji = json['emoji']
        ret.append(this_emoji)
    return ' '.join(ret)

def keep_only_emojis(data):
    cnt = data['content'].apply(emoji.emoji_count)
    return data[cnt >= 1]

In [254]:
data['cleaned_content'] = data.content.apply(preprocess_apply)
test['cleaned_content'] = test.content.apply(preprocess_apply)
X,y = data['cleaned_content'].values, pd.get_dummies(data['label']).values.astype('float')
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=SEED, test_size=0.2)
X_test, y_test = test['cleaned_content'].values, pd.get_dummies(test['label']).values.astype('float')
print(f'shape of train data is {X_train.shape}')
print(f'shape of test data is {X_test.shape}')

shape of train data is (41343,)
shape of test data is (12920,)


In [232]:
class TweetDataset(Dataset):

    def __init__(self, tweets, targets, tokenizer, max_len, e2v, w2v):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.e2v = e2v
        self.w2v = w2v

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        tweet = str(self.tweets[item])
        target = self.targets[item]

        tokens = self.tokenizer.tokenize(tweet)
        
        seq = []
        for t in tokens:
            if t in e2v.key_to_index:
                seq.append(torch.from_numpy(e2v[t]))
            elif t in w2v.key_to_index:
                seq.append(torch.from_numpy(w2v[t]))
        
        
        padding_length = self.max_len - len(seq)
        for _ in range(padding_length):
            seq.append(torch.zeros(300,))
        seq = torch.stack(seq, dim=0)
        
        return seq, target

def create_data_loader(X, y, tokenizer, max_len, batch_size, e2v, w2v):
    ds = TweetDataset(
    tweets=X,
    targets=y,
    tokenizer=tokenizer,
    max_len=max_len,
    e2v=e2v,
    w2v=w2v
    )

    return DataLoader(
    ds,
    batch_size=batch_size)

In [233]:
# Load the word2vec and emoji2vec models
e2v_path = 'Data/emoji2vec_data/emoji2vec.bin'
w2v_path = 'Data/emoji2vec_data/GoogleNews-vectors-negative300.bin.gz'
w2v = gsm.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
e2v = gsm.models.KeyedVectors.load_word2vec_format(e2v_path, binary=True)

In [255]:
TweetTknzr = tk.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
MAX_LEN = 128
BATCH_SIZE = 64

train_data_loader = create_data_loader(X_train, y_train, TweetTknzr, MAX_LEN, BATCH_SIZE, e2v, w2v)
val_data_loader = create_data_loader(X_val, y_val, TweetTknzr, MAX_LEN, BATCH_SIZE, e2v, w2v)

dataiter = iter(train_data_loader)
sample_inputs, sample_targets = dataiter.next()
print("Sample batch shape:", sample_inputs.shape, sample_targets.shape)

Sample batch shape: torch.Size([64, 128, 300]) torch.Size([64, 3])


## Neural Network Building

In [243]:
class BiLSTM_FFF(nn.Module):
    def __init__(self, hidden_dim, bidirectional, embedding_dim):
        super(BiLSTM_FFF, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.bidir = bidirectional
        self.lstm = nn.LSTM(input_size=embedding_dim,
                           hidden_size=self.hidden_dim,
                           num_layers=2,
                           bidirectional=self.bidir,
                           batch_first=True)
        self.drop = nn.Dropout(p=0.25)
        
        self.out = nn.Sequential(
                    nn.Linear(self.hidden_dim * 2, 512), # since it's bidirectional 
                    nn.ReLU(),
                    nn.Linear(512, 3)
                   )
    
    def forward(self, seq):
        _, (out,_) = self.lstm(seq)
        out = torch.cat((out[-2,:,:], out[-1,:,:]), dim = 1)
        out = self.drop(out)
        return self.out(out)

In [256]:
embedding_dim = 300
hidden_dim = 512
bidirectional = True
EPOCHS = 20
lr=0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

decoder = BiLSTM_FFF(hidden_dim, bidirectional, embedding_dim)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)


print(decoder)

cpu
BiLSTM_FFF(
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.25, inplace=False)
  (out): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)


## Training & Evaluating

In [226]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  n_examples
):
    model = model.train()

    losses = []
    correct_predictions = 0

    for inputs, targets in tqdm(data_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = model(inputs)
#         print(outputs.dtype)
        _,preds = torch.max(outputs, dim = 1)
        loss = loss_fn(outputs, targets)
#         prediction_error += torch.sum(torch.abs(targets - outputs))
        correct_predictions += torch.sum(preds == torch.max(targets, dim = 1)[1])
        print(f'Iteration loss: {loss.item()}')
        losses.append(loss.item())

        loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)
#     return np.mean(losses), np.mean(losses)

In [258]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for inputs, targets in data_loader:
            
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
#             prediction_error += torch.sum(torch.abs(targets - outputs))
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == torch.max(targets, dim = 1)[1])
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
#     return nn.functional.cosine_similarity(output_all, target_all, dim=0), np.mean(losses)

In [257]:
history = defaultdict(list)
best_accuracy = 0
decoder.to(device)

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
    decoder,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device,
    len(X_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
    decoder,
    val_data_loader,
    loss_fn, 
    device, 
    len(X_val)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(decoder.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

  0%|          | 0/646 [00:00<?, ?it/s]

Epoch 1/20
----------
Iteration loss: 1.098761623725295


  0%|          | 1/646 [00:02<28:20,  2.64s/it]

Iteration loss: 1.095160878263414


  0%|          | 2/646 [00:05<27:35,  2.57s/it]

Iteration loss: 1.0696086063981056


  0%|          | 3/646 [00:07<27:18,  2.55s/it]

Iteration loss: 1.062403704971075


  1%|          | 4/646 [00:10<26:56,  2.52s/it]

Iteration loss: 1.1182066556066275


  1%|          | 5/646 [00:12<27:21,  2.56s/it]

Iteration loss: 1.0317147057503462


  1%|          | 6/646 [00:15<27:51,  2.61s/it]

Iteration loss: 1.0284478440880775


  1%|          | 7/646 [00:18<28:40,  2.69s/it]

Iteration loss: 1.0852249879390001


  1%|          | 8/646 [00:21<28:49,  2.71s/it]

Iteration loss: 1.0720456540584564


  1%|          | 8/646 [00:23<31:46,  2.99s/it]


KeyboardInterrupt: 