In [100]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import emoji
import nltk.tokenize as tk
import gensim as gsm
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import defaultdict
from contraction_map import contraction_map as cm

In [211]:
import random

SEED = 2022  # one seed shared by every random number generator used below

# Seed Python's and NumPy's global generators.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

# Seed PyTorch last; left as the final expression so the cell still echoes
# the returned torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc4abea8930>

## Data Loading & Preprocessing

In [140]:
# Read the train and test spreadsheets, keeping only the text and label columns.
data, test = (
    pd.read_excel(xlsx_path)[['content', 'label']]
    for xlsx_path in (
        'Data/emoji2vec_data/emoji2vec_train.xlsx',
        'Data/emoji2vec_data/emoji2vec_test.xlsx',
    )
)

### Data cleaning functions

In [209]:
# reference: https://www.kaggle.com/code/stoicstatic/twitter-sentiment-analysis-using-word2vec-bilstm 

# Regular expressions for tweet cleaning. Every pattern is a raw string so that
# backslash sequences such as \s, \. and \1 reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"  # http://…, https://… or www.… up to the next space
userPattern       = r'@[^\s]+'       # '@' followed by non-whitespace characters
hashtagPattern    = r'#[^\s]+'       # '#' followed by non-whitespace characters
alphaPattern      = r"[^a-z0-9<>]"   # anything other than a-z, 0-9, '<' or '>'
sequencePattern   = r"(.)\1\1+"      # the same character three or more times in a row
seqReplacePattern = r"\1\1"          # ...replaced by two copies of that character

def is_alnum_or_emoji_or_space(char):
    """Return a truthy value if ``char`` should survive tweet cleaning.

    A character is kept when it is alphanumeric, is whitespace, or is
    reported as an emoji by ``emoji.is_emoji``.

    Whitespace is detected with ``str.isspace`` instead of comparing against
    tab and space only: newlines used to be rejected, which glued the last
    word of one line onto the first word of the next.

    Args:
        char: A single-character string.

    Returns:
        True when the character should be kept, False otherwise.
    """
    return char.isalnum() or char.isspace() or emoji.is_emoji(char)

def preprocess_apply(tweet):
    """Normalise a raw tweet before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace URLs with '<url>' and @mentions with '<user>';
      3. collapse any character repeated three or more times down to two;
      4. apply every substring replacement in ``cm.CONTRACTION_MAP``;
      5. put spaces around '/' so slash-joined words are split apart;
      6. drop every character rejected by ``is_alnum_or_emoji_or_space``.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    tweet = tweet.lower()

    # Replace all URLs with '<url>'.
    tweet = re.sub(urlPattern, '<url>', tweet)
    # Replace @USERNAME with '<user>'.
    tweet = re.sub(userPattern, '<user>', tweet)

    # Replace 3 or more consecutive identical characters by 2.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    for contraction, replacement in cm.CONTRACTION_MAP.items():
        tweet = tweet.replace(contraction, replacement)

    # Add a space on either side of '/' to separate words (after replacing
    # URLs, so their slashes are already gone). This must run BEFORE the
    # character filter below: '/' is not alphanumeric, so the filter deletes
    # it, and spacing it afterwards was a no-op that left "and/or" as "andor".
    tweet = re.sub(r'/', ' / ', tweet)

    # Remove punctuation and other symbols.
    # NOTE(review): '<' and '>' are not alphanumeric either, so the
    # placeholders above come out as the plain words "url" / "user".
    tweet = ''.join(filter(is_alnum_or_emoji_or_space, tweet))
    return tweet

# End of reference. The following code was written by me.

def emoji2description(text):
    """Replace every emoji in ``text`` with its English description.

    The description is the emoji's ``'en'`` entry with underscores turned
    into spaces and leading/trailing colons trimmed.
    """
    def describe(chars, data_dict):
        # Callback signature expected by emoji.replace_emoji; only the
        # metadata dictionary is needed here.
        return data_dict['en'].replace('_', ' ').strip(':')

    return emoji.replace_emoji(text, replace=describe)

def emoji2concat_description(text):
    """Strip the emojis out of ``text`` and append their descriptions.

    The emoji-free text is whitespace-trimmed, then the English description
    of each emoji found in the original text (underscores as spaces, colons
    trimmed) is appended in order of appearance, each preceded by a space.
    """
    without_emojis = emoji.replace_emoji(text, replace='').strip()
    suffixes = [
        ' ' + emoji.EMOJI_DATA[found['emoji']]['en'].replace('_', ' ').strip(':')
        for found in emoji.emoji_list(text)
    ]
    return without_emojis + ''.join(suffixes)

def extract_emojis(text):
    """Return the emojis found in ``text``, in order, joined by single spaces.

    Yields an empty string when ``emoji.emoji_list`` finds nothing.
    """
    return ' '.join(found['emoji'] for found in emoji.emoji_list(text))

def keep_only_emojis(data):
    """Return the rows of ``data`` whose 'content' holds at least one emoji.

    Args:
        data: Table with a 'content' column (presumably a pandas DataFrame
            of tweets -- confirm against callers).

    Returns:
        ``data`` filtered by a boolean mask built from ``emoji.emoji_count``.
    """
    has_emoji = data['content'].apply(emoji.emoji_count) >= 1
    return data[has_emoji]

In [213]:
# Clean the raw text of both splits and store it next to the original.
data['cleaned_content'] = data['content'].apply(preprocess_apply)
test['cleaned_content'] = test['content'].apply(preprocess_apply)

# Features are the cleaned strings; labels are one-hot encoded per split.
X = data['cleaned_content'].values
y = pd.get_dummies(data['label']).values
X_test = test['cleaned_content'].values
y_test = pd.get_dummies(test['label']).values

# Hold out 20% of the training file for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

print(f'shape of train data is {X_train.shape}')
print(f'shape of test data is {X_test.shape}')

shape of train data is (41343,)
shape of test data is (12920,)


In [214]:
class TweetDataset(Dataset):
    """Dataset that turns each tweet into a fixed-size matrix of embeddings.

    A tweet is tokenised and every token found in the emoji or word
    embeddings is replaced by its vector (emoji embeddings are checked
    first). Tokens found in neither vocabulary are skipped. The result is
    truncated or zero-padded to exactly ``max_len`` rows, so every item has
    the same shape and can be batched by the default collate function.

    Args:
        tweets: Indexable collection of tweet texts.
        targets: Indexable collection of labels aligned with ``tweets``.
        tokenizer: Object exposing ``tokenize(str)`` returning the tokens.
        max_len: Number of rows in every returned tensor.
        e2v: Emoji embeddings supporting ``key_to_index`` and ``[token]``.
        w2v: Word embeddings with the same interface as ``e2v``.
    """

    def __init__(self, tweets, targets, tokenizer, max_len, e2v, w2v):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.e2v = e2v
        self.w2v = w2v
        # Width of one embedding row; 300 is kept as the fallback for models
        # that do not expose ``vector_size``.
        self.embedding_dim = getattr(w2v, 'vector_size', 300)

    def __len__(self):
        return len(self.tweets)

    def _lookup(self, token):
        """Return the embedding of ``token``, or None if it is out of vocabulary."""
        # Use the models passed to this instance, not same-named globals.
        if token in self.e2v.key_to_index:
            return self.e2v[token]
        if token in self.w2v.key_to_index:
            return self.w2v[token]
        return None

    def __getitem__(self, item):
        """Return the ``(max_len, embedding_dim)`` embedding matrix of one tweet.

        NOTE: only the sequence tensor is returned; ``self.targets`` is stored
        but not part of the item.
        """
        tweet = str(self.tweets[item])

        # Start from an all-zero matrix so unfilled rows are already padding.
        seq = torch.zeros(self.max_len, self.embedding_dim)
        filled = 0
        for token in self.tokenizer.tokenize(tweet):
            if filled >= self.max_len:
                # Truncate long tweets; otherwise items would differ in
                # length and could not be stacked into a batch.
                break
            vector = self._lookup(token)
            if vector is None:
                continue
            seq[filled] = torch.from_numpy(vector)
            filled += 1

        return seq

def create_data_loader(X, y, tokenizer, max_len, batch_size, e2v, w2v, shuffle=False):
    """Wrap tweets and labels in a ``TweetDataset`` and return a ``DataLoader``.

    Args:
        X: Indexable collection of tweet texts.
        y: Indexable collection of labels aligned with ``X``.
        tokenizer: Tokeniser handed to the dataset.
        max_len: Sequence length handed to the dataset.
        batch_size: Number of items per batch.
        e2v: Emoji embeddings handed to the dataset.
        w2v: Word embeddings handed to the dataset.
        shuffle: Whether the loader reshuffles the data every epoch.
            Defaults to False, which matches the previous behaviour; pass
            True for a training loader.

    Returns:
        A ``DataLoader`` over the constructed ``TweetDataset``.
    """
    ds = TweetDataset(
        tweets=X,
        targets=y,
        tokenizer=tokenizer,
        max_len=max_len,
        e2v=e2v,
        w2v=w2v,
    )

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Locations of the pretrained emoji2vec and word2vec models.
e2v_path = 'Data/emoji2vec_data/emoji2vec.bin'
w2v_path = 'Data/emoji2vec_data/GoogleNews-vectors-negative300.bin.gz'

# Both files are read as binary word2vec-format vectors; word2vec is loaded
# first, then emoji2vec.
w2v, e2v = (
    gsm.models.KeyedVectors.load_word2vec_format(vector_path, binary=True)
    for vector_path in (w2v_path, e2v_path)
)

In [215]:
# Tokeniser and batching configuration.
TweetTknzr = tk.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
MAX_LEN = 128
BATCH_SIZE = 64

train_data_loader = create_data_loader(X_train, y_train, TweetTknzr, MAX_LEN, BATCH_SIZE, e2v, w2v)
val_data_loader = create_data_loader(X_val, y_val, TweetTknzr, MAX_LEN, BATCH_SIZE, e2v, w2v)

# Sanity check: pull one batch and inspect its shape. The built-in next() is
# used because DataLoader iterators in current PyTorch releases no longer
# provide a `.next()` method.
dataiter = iter(train_data_loader)
sample = next(dataiter)
print("Sample batch shape:", sample.shape)

Sample batch shape: torch.Size([64, 128, 300])


## Neural Network Building

## Training & Evaluating