Place the two data files (multi_class.csv, multi_label.csv) in your drive /577FinalProject, or rename the directory.

In [None]:
# Mounts Google Drive at /content/drive so the files under input_dir are readable.
# Colab only: if this isn't being run on colab you probably want to skip this cell,
# and then change input_dir (hyperparameters cell) accordingly.
from google.colab import drive
drive.mount('/content/drive')

Imports

In [None]:
# If this isn't being run on colab you probably want to skip this cell
!pip install gensim
!pip install nltk
!pip install transformers

In [None]:
import ast
import time

import gensim
import nltk
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

# Fetch the NLTK stopword corpus read later via nltk.corpus.stopwords.words('english')
nltk.download('stopwords')
# Prefer the first CUDA GPU, but fall back to CPU so the notebook still runs without one
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Hyperparameters and Globals

In [None]:
# Classes: emoji labels kept from the 'Multi-Class Annotation' column
# 😂, 🤣, 😅 are included in the report.
# Alternative class set (swap with the active line below to use it):
#classes = ['❤', '😂', '👍', '🤣', '😅']
classes = ['😭', '😂', '😔', '🤣', '😅']

# Word2Vec Embeddings
W2V_VEC_SIZE = 300 # dimensionality of each word / sentence vector
W2V_EPOCHS = 100 # training passes over the corpus
W2V_MIN_COUNT = 3 # minimum word count to be included in final embeddings

# BERT Embeddings
BERT_ENTRIES_PER_CLASS = 1000 # rows kept per class when building partial_dataset_BERT
BERT_CLASS_EQ_FLAG = False # set to True by the "equal entries per class" cell
BERT_BATCH_SIZE = 32 # NOTE(review): not referenced by any cell visible here

# BOW
BOW_ENTRIES_PER_CLASS = 1000 # rows kept per class when building partial_dataset_bow
BOW_VEC_SIZE = 500 # length of each bag-of-words vector; token ids >= this are dropped

# General
# Directory holding the input CSVs and the saved embeddings
input_dir = '/content/drive/MyDrive/577FinalProject'

Identify Classes

In [None]:
# Optional: only needed when you want to revise the class list.
# Prints every label of 'Multi-Class Annotation' with its number of tweets,
# most frequent first, without pandas truncating the output.
full_dataset = pd.read_csv(f"{input_dir}/multi_class.csv")
class_counts = full_dataset['Multi-Class Annotation'].value_counts()
del full_dataset
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(class_counts)

Load and Preprocess Dataset for Word2Vec

In [None]:
# Stopwords
# clean_data() and w2v_sentence_vector() test *lower-cased* tokens against this
# list, and the \w+ tokenizer strips the square brackets, so the tweet placeholders
# ([MENTION], [URL], [HASHTAG]) and the retweet marker only get filtered out when
# their lower-case, bracket-free forms are present as well.
stopwords = [] + nltk.corpus.stopwords.words('english')
stopwords += ['[MENTION]', '[URL]', '[HASHTAG]', 'MENTION', 'URL', 'HASHTAG', 'RT']
stopwords += ['mention', 'url', 'hashtag', 'rt']

In [None]:
def clean_data(dataset):
    """Tokenize and stopword-filter the 'Tweet' column of `dataset` in place.

    Args:
        dataset: DataFrame with a 'Tweet' column. Rows are written back by the
            label ``i``, so the index is expected to be 0..n-1 (the callers in
            this notebook pass a frame fresh from ``reset_index(drop=True)``).

    Returns:
        None. Every string tweet is replaced by a list of lower-cased tokens with
        punctuation and stopwords removed. Cells that are not strings are left
        untouched, so running the function a second time is harmless.
    """
    # Tokenizer removes punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # Set lookup instead of scanning the whole stopword list for every token
    stopword_set = set(stopwords)
    total = len(dataset)
    # Read 'Tweet' by name (not by column position) so the read and the write
    # below always address the same column, whatever the column order is.
    for i, item in enumerate(dataset['Tweet'].tolist()):
        if isinstance(item, str):
            lowered = (token.lower() for token in tokenizer.tokenize(item))
            dataset.at[i, 'Tweet'] = [token for token in lowered if token not in stopword_set]
        if i % 10000 == 0:
            print(f"{i}/{total}")

In [None]:
# Run this cell only if full_dataset_w2v.csv exists
full_dataset_w2v = pd.read_csv(f"{input_dir}/full_dataset_w2v.csv")
# to_csv stored the DataFrame index as an extra unnamed first column; drop it so
# 'Tweet' is back in the column position the Word2Vec cell reads from.
full_dataset_w2v = full_dataset_w2v.loc[:, ~full_dataset_w2v.columns.str.startswith('Unnamed:')]
full_dataset_w2v = (full_dataset_w2v[full_dataset_w2v['Multi-Class Annotation'].isin(classes)]).reset_index(drop=True)
# CSV keeps each token list as its string repr ("['a', 'b']"). Parse it back into a
# real list, otherwise downstream code iterates over characters instead of tokens.
full_dataset_w2v['Tweet'] = full_dataset_w2v['Tweet'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell)
# train_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'train']
# test_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'test']
# val_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'dev']

full_dataset_w2v

In [None]:
# Run this cell if you want to rebuild cleaned csv
# If full_dataset_w2v.csv already exists, run the previous cell to load it

# Read from input_dir like every other cell (the path was hard-coded relative to
# the current working directory, which only works on one particular setup).
full_dataset_w2v = pd.read_csv(f"{input_dir}/multi_class.csv")
full_dataset_w2v = (full_dataset_w2v[full_dataset_w2v['Multi-Class Annotation'].isin(classes)]).reset_index(drop=True)
# Replaces every tweet string with its list of cleaned tokens, in place
clean_data(full_dataset_w2v)
# train_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'train']
# test_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'test']
# val_data = full_dataset_w2v[full_dataset_w2v['Dataset'] == 'dev']

full_dataset_w2v

Save W2V Dataset (preprocessed) to CSV

In [None]:
# NOTE: to_csv also writes the index as an extra unnamed column, and each token
# list in 'Tweet' is stored as its string representation rather than as a list.
full_dataset_w2v.to_csv(f"{input_dir}/full_dataset_w2v.csv")

Word Embeddings (Word2Vec)


In [None]:
# Sentence to vector function
def w2v_sentence_vector(sent, w2v):
    """Average the Word2Vec vectors of the non-stopword words in `sent`.

    Args:
        sent: iterable of word strings.
        w2v: trained model whose ``w2v.wv[word]`` returns the vector of a word and
            raises KeyError for words outside the vocabulary.

    Returns:
        numpy array of length W2V_VEC_SIZE: the mean vector of the words that are
        not stopwords and are known to the model, or all zeros when there are none.
    """
    total = np.zeros(W2V_VEC_SIZE)
    n_found = 0
    for word in sent:
        if word.lower() in stopwords:
            continue
        try:
            word_vec = w2v.wv[word]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the average
            continue
        total += word_vec
        n_found += 1
    if n_found == 0:
        return total
    return total / n_found

# Build corpus: one token list per tweet, taken from 'Tweet' by name so an extra
# leading column in the frame cannot end up being used as the corpus
corpus = full_dataset_w2v['Tweet'].tolist()

# Train w2v model
# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the keyword
# names that match the installed version so the cell runs on either.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_size_kwargs = {'vector_size': W2V_VEC_SIZE, 'epochs': W2V_EPOCHS}
else:
    w2v_size_kwargs = {'size': W2V_VEC_SIZE, 'iter': W2V_EPOCHS}

start_time = time.time()
w2v = gensim.models.word2vec.Word2Vec(
    corpus,
    window=10,
    workers=4,
    min_count=W2V_MIN_COUNT,
    **w2v_size_kwargs)
print(f"Training Time: {(time.time() - start_time):.1f} seconds")

# Change datasets to use embeddings

# train_w2v = train_data.copy()
# test_w2v = test_data.copy()
# val_w2v = val_data.copy()
# train_w2v['Tweet'] = train_w2v['Tweet'].apply(w2v_sentence_vector, args=[w2v])
# test_w2v['Tweet'] = test_w2v['Tweet'].apply(w2v_sentence_vector, args=[w2v])
# val_w2v['Tweet'] = val_w2v['Tweet'].apply(w2v_sentence_vector, args=[w2v])
# Work on a copy so the token lists in full_dataset_w2v stay available
full_w2v = full_dataset_w2v.copy()
full_w2v['Tweet'] = full_w2v['Tweet'].apply(w2v_sentence_vector, args=(w2v,))

Save dataframes after w2v

In [None]:
# Old Code
#train_w2v.to_csv(f"{input_dir}/train_w2v.csv")
#test_w2v.to_csv(f"{input_dir}/test_w2v.csv")
#val_w2v.to_csv(f"{input_dir}/val_w2v.csv")

# Pickle: keeps each 'Tweet' cell as the numpy vector produced by
# w2v_sentence_vector, which a CSV would turn into text
full_w2v.to_pickle(f"{input_dir}/w2v_embeddings.pkl")

In [None]:
full_w2v

Load and Preprocess Dataset for BERT

In [None]:
# Run this cell only if full_dataset_BERT.pkl is in your drive
# (it is read with read_pickle, so it must be the pickled DataFrame, not a CSV)
full_dataset_BERT = pd.read_pickle(f"{input_dir}/full_dataset_BERT.pkl")
# Keep only the rows whose label is one of `classes`, renumbering rows from 0
full_dataset_BERT = (full_dataset_BERT[full_dataset_BERT['Multi-Class Annotation'].isin(classes)]).reset_index(drop=True)
full_dataset_BERT

In [None]:
def tokenize_bert(dataset):
    """Replace every string in the 'Tweet' column of `dataset` with its BERT encoding, in place.

    Uses the module-level `tokenizer_bert`, which has to exist before this is called.

    Args:
        dataset: DataFrame with a 'Tweet' column. Rows are written back by the
            label ``i``, so the index is expected to be 0..n-1.

    Returns:
        None. Each string tweet becomes the result of ``encode_plus`` (padded or
        truncated to 128 tokens, PyTorch tensors). Cells that are not strings are
        left untouched, so running the function a second time is harmless.
    """
    total = len(dataset)
    # Read 'Tweet' by name (not by column position) so the read and the write
    # below always address the same column.
    for i, item in enumerate(dataset['Tweet'].tolist()):
        if isinstance(item, str):
            # return_tensors="pt" causes it to return pytorch tensors instead of lists
            dataset.at[i, 'Tweet'] = tokenizer_bert.encode_plus(
                item, padding='max_length', max_length=128, truncation=True, return_tensors="pt")
        if i % 10000 == 0:
            print(f"{i}/{total}")

In [None]:
# Skip this cell if a saved full_dataset_BERT file exists and you just want to load it.
# To change tokenizer parameters, edit tokenize_bert above, re-run its cell, then this one.
full_dataset_BERT = (
    pd.read_csv(f"{input_dir}/multi_class.csv")
    .loc[lambda frame: frame['Multi-Class Annotation'].isin(classes)]
    .reset_index(drop=True)
)
# tokenize_bert reads this global, so it has to be created before the call
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
tokenize_bert(full_dataset_BERT)
full_dataset_BERT

Select Entries per class (to avoid Memory issues with Colab)

In [None]:
# Optional: build a class-balanced subset for the BERT encodings.
# Keeps the first BERT_ENTRIES_PER_CLASS rows of every label in `classes`
# (both defined in the hyperparameters cell), stacked in the order of `classes`.

BERT_CLASS_EQ_FLAG = True

# One slice per class, concatenated once and renumbered from 0
partial_dataset_BERT = pd.concat(
    [
        full_dataset_BERT[full_dataset_BERT['Multi-Class Annotation'].isin([label])].head(BERT_ENTRIES_PER_CLASS)
        for label in classes
    ]
).reset_index(drop=True)

partial_dataset_BERT

Sentence Embeddings (BERT)

In [None]:
# Initialize pretrained BERT model ('bert-base-cased' weights) and move it to `device`
bert = BertModel.from_pretrained('bert-base-cased').to(device)

Sentence Embeddings (BERTForSequenceClassification)
This model doesn't actually work very well with clustering, for obvious reasons.
But it's less painful to train!

In [None]:
# Do not run this if you want to use the above bert!
#bert = BertModel.from_pretrained('bert-base-cased', num_labels = len(classes)).to(device)

Train BERT

Commented out right now because it requires changing some specific cells earlier on to work.
Since training has no measurable impact, there's no point changing the pipeline to better accommodate it.

In [None]:
# def labels_to_numbers(labels):
#     out = []
#     for item in labels:
#         out.append(classes.index(item))
#     return out

# class Dataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels=None):
#         self.encodings = encodings
#         self.labels = labels
    
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         if self.labels is not None:
#             item["labels"] = torch.tensor(self.labels[idx])
#         return item
    
#     def __len__(self):
#         return len(self.encodings["input_ids"])

# tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
# x_train = partial_dataset_BERT['Tweet'].to_numpy().tolist()
# x_train_tokenized = tokenizer_bert(x_train, padding=True, truncation=True, max_length=128)

# labels = np.array(labels_to_numbers(partial_dataset_BERT['Multi-Class Annotation'].to_numpy()))

# train_dataset = Dataset(x_train_tokenized)#, labels)
# val_dataset = Dataset(x_train_tokenized)#, labels)

In [None]:
# from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# # Define Trainer
# args = TrainingArguments(
#     output_dir="output",
#     evaluation_strategy="steps",
#     eval_steps=500,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     seed=0,
#     load_best_model_at_end=True,)
# trainer = Trainer(
#     model=bert,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],)

# # Train pre-trained model
# trainer.train()

Run BERT

In [None]:
# This is an example. This lets you fetch the vector embedding for any sentence input that you provide
# The sentence input, which is the item in the 'Tweet' column of the tokenized dataframe, contains
#   three parts: the first and last are relevant to actually running BERT on the tokenized sentence
#   This will give you two output tensors, clustering can possibly use both of them
#   Though you can probably get away with using only the second one, thats more relevant for us
## input_id = full_dataset_BERT['Tweet'][0]['input_ids']
## attention_mask = full_dataset_BERT['Tweet'][0]['attention_mask']
## print(bert(input_ids = input_id, attention_mask = attention_mask, return_dict = False))

def quick_bert(item):
    """Return BERT's pooled output for one tokenized tweet as a numpy array.

    Args:
        item: mapping with 'input_ids' and 'attention_mask' tensors (the value
            tokenize_bert stores in the 'Tweet' column).

    Returns:
        numpy array holding the second output of the module-level `bert` model,
        copied back to the CPU.
    """
    # Inference only: no_grad stops autograd from recording a graph for every
    # tweet, which would otherwise waste (GPU) memory for nothing.
    with torch.no_grad():
        _, pooled = bert(
            input_ids=item['input_ids'].to(device),
            attention_mask=item['attention_mask'].to(device),
            return_dict=False)
    return pooled.cpu().detach().numpy()

# Embed every tokenized tweet: the balanced subset when BERT_CLASS_EQ_FLAG is set,
# otherwise the full dataset. The 'Tweet' column is overwritten in place.
start_time = time.time()
bert_target = partial_dataset_BERT if BERT_CLASS_EQ_FLAG else full_dataset_BERT
bert_target['Tweet'] = bert_target['Tweet'].apply(quick_bert)
# Elapsed seconds
print(time.time() - start_time)


Save BERT Dataset to pickle

In [None]:
#train_bert = full_dataset_BERT[full_dataset_BERT['Dataset'] == 'train']
#test_bert = full_dataset_BERT[full_dataset_BERT['Dataset'] == 'test']
#val_bert = full_dataset_BERT[full_dataset_BERT['Dataset'] == 'dev']
#train_bert.to_csv(f"{input_dir}/train_BERT.csv")
#test_bert.to_csv(f"{input_dir}/test_BERT.csv")
#val_bert.to_csv(f"{input_dir}/val_BERT.csv")

#full_dataset_BERT.to_pickle(f"{input_dir}/full_dataset_BERT.pkl")
# Save whichever frame the "Run BERT" cell actually embedded: the balanced subset
# when BERT_CLASS_EQ_FLAG is set, the full dataset otherwise (partial_dataset_BERT
# is not guaranteed to exist, or to be embedded, in that case).
bert_embedded = partial_dataset_BERT if BERT_CLASS_EQ_FLAG else full_dataset_BERT
bert_embedded.to_pickle(f"{input_dir}/BERT_Embeddings.pkl")

Bag of Words

In [None]:
# Raw multi-class data; the next cell takes a per-class subset of it for bag-of-words
full_dataset_bow = pd.read_csv(f"{input_dir}/multi_class.csv")

In [None]:
# Class-balanced subset: the first BOW_ENTRIES_PER_CLASS rows of every label in
# `classes`, stacked in the order of `classes` and renumbered from 0
partial_dataset_bow = pd.concat(
    [
        full_dataset_bow[full_dataset_bow['Multi-Class Annotation'].isin([label])].head(BOW_ENTRIES_PER_CLASS)
        for label in classes
    ]
).reset_index(drop=True)

# Tokenize every tweet with gensim's simple_preprocess
bow_tweets = partial_dataset_bow['Tweet'].to_numpy().tolist()
bow_tweets_tok = list(map(gensim.utils.simple_preprocess, bow_tweets))

# Convert each token list to (token_id, count) pairs; allow_update=True makes the
# dictionary learn new tokens as they are encountered
d = gensim.corpora.Dictionary()
bow_corpus = []
for sent in bow_tweets_tok:
    bow_corpus.append(d.doc2bow(sent, allow_update=True))
bow_corpus

In [None]:
# Largest token id appearing anywhere in bow_corpus (0 when there are no tokens)
max_cl = max((tup[0] for item in bow_corpus for tup in item), default=0)

max_cl

8785

In [None]:
# Vectorize: one dense count vector of length BOW_VEC_SIZE per tweet.
# Only token ids below BOW_VEC_SIZE get a slot; all other tokens are ignored.
vecs = []
for item in bow_corpus:
    kept = [(token_id, count) for token_id, count in item if token_id < BOW_VEC_SIZE]
    vec = np.zeros((BOW_VEC_SIZE))
    if kept:
        token_ids, counts = zip(*kept)
        vec[list(token_ids)] = counts
    vecs.append(vec)
vecs

In [None]:
# Fill embeddings back in their correct place
# Assign the whole column at once. The previous element-wise
# `partial_dataset_bow['Tweet'][i] = ...` is chained assignment, which raises
# SettingWithCopyWarning and does not modify the frame under pandas Copy-on-Write.
assert len(vecs) == len(partial_dataset_bow), "one vector per tweet expected"
# An explicit Series keeps each vector as a single cell
partial_dataset_bow['Tweet'] = pd.Series(vecs, index=partial_dataset_bow.index)

partial_dataset_bow

In [None]:
# Save bow embeddings (pickled so each 'Tweet' cell stays a numpy vector)
partial_dataset_bow.to_pickle(f"{input_dir}/bow_embeddings.pkl")

Load All Embeddings in One Place (for Clustering)

In [None]:
# All three files are the pickles written by the save cells earlier in this notebook.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.

# Load W2V embeddings
w2v_embeddings = pd.read_pickle(f"{input_dir}/w2v_embeddings.pkl")

# Load BERT Embeddings
BERT_embeddings = pd.read_pickle(f"{input_dir}/BERT_Embeddings.pkl")

# Load bow embeddings
bow_embeddings = pd.read_pickle(f"{input_dir}/bow_embeddings.pkl")

K-Means

In [None]:
# Select embedding
data_embeddings = w2v_embeddings
#data_embeddings = BERT_embeddings
#data_embeddings = bow_embeddings

# Number of samples in each cluster
# M = 8000 # w2v only

M = 1000

# Draw M rows per class (fixed seed); rows of one class end up contiguous
df = data_embeddings.groupby('Multi-Class Annotation').apply(lambda x:x.sample(M,random_state=0))
# Cluster on exactly the classes present in the loaded embeddings. A hard-coded
# list can name classes the embeddings were never built for, which leaves fewer
# than len(Classes) * M rows and trips the length assertion in calc_acc.
# To cluster a subset, replace this with an explicit list of labels that are
# present, e.g. ['🤣', '😂', '😅'].
Classes = list(df['Multi-Class Annotation'].unique())
subdf = df[df['Multi-Class Annotation'].apply(lambda x:x in Classes)]
X = subdf['Tweet'].to_numpy()
Y = subdf['Multi-Class Annotation'].to_numpy()
# Stack the per-tweet vectors into one 2D array, one row per tweet
X = np.vstack(X)

from itertools import permutations
from sklearn.metrics import f1_score

def calc_acc(ypred):
    """Score a clustering against the true classes under the best label assignment.

    The reference labelling is built as len(Classes) consecutive blocks of M
    samples each, so `ypred` must follow that layout and have len(Classes) * M
    entries. Every assignment of cluster ids to blocks is tried.

    Args:
        ypred: predicted cluster label per sample; presumably integers in
            range(len(Classes)) - verify against the clustering call.

    Returns:
        [best accuracy, best weighted F1]; each maximum is taken independently
        over all permutations.
    """
    max_acc = 0
    max_f1 = 0
    for perm in permutations(set(range(len(Classes)))):
        # Reference labels for this assignment: perm[k] repeated over block k
        yperm = np.repeat(perm, M)
        assert len(ypred) == len(yperm)
        max_acc = max(max_acc, np.mean(ypred == yperm))
        max_f1 = max(max_f1, f1_score(yperm, ypred, average='weighted'))
    return [max_acc, max_f1]

Hierarchical

In [None]:
# from sklearn.cluster import AgglomerativeClustering
# agglo = AgglomerativeClustering(n_clusters=len(Classes), linkage='average').fit(X)
# print(calc_acc(agglo.labels_))

# ward = AgglomerativeClustering(n_clusters=len(Classes), linkage='ward').fit(X)
# print(calc_acc(ward.labels_))

Spectral

In [None]:
# from sklearn.cluster import SpectralClustering
# # spectral = SpectralClustering(n_clusters=len(Classes), affinity='nearest_neighbors', random_state=0).fit(X)
# spectral = SpectralClustering(n_clusters=len(Classes), gamma=1e-4, random_state=0).fit(X)
# print(calc_acc(spectral.labels_))

Some 2D PCA

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit(X).transform(X)

plt.figure()

# Fixed color per emoji; the two alternative class sets share the last two colors
class_to_color = {
    '😂': "navy",
    '🤣': "turquoise",
    '😅': "green",
    '❤': "darkorange",
    '😭': "darkorange",
    '👍': "red",
    '😔': "red",
}
colors = []
for c in Classes:
    if c not in class_to_color:
        raise Exception("Unexpected class label")
    colors.append(class_to_color[c])
# colors = ["navy", "darkorange", "turquoise", "red", "green"]

# One scatter call per class so each class gets its own legend entry
for color, c in zip(colors, Classes):
    in_class = Y == c
    plt.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=color, label=c, alpha=0.8, s=2)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA")


Some 3D PCA

In [None]:
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features, then keep the first three principal components
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

pca = PCA(n_components=3)
X_pca = pca.fit(X_scaled).transform(X_scaled)

# One coordinate array per principal component
Xax, Yax, Zax = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]

fig = plt.figure(figsize=(7,5))
fig.patch.set_facecolor('white')
ax = fig.add_subplot(111, projection='3d')

# Reuses the `colors` list built in the 2D PCA cell (same order as Classes)
# colors = ["navy", "darkorange", "turquoise", "red", "green"]
for color, c in zip(colors, Classes):
    ix = np.where(Y == c)
    ax.scatter(Xax[ix], Yax[ix], Zax[ix], c=color, label=c, alpha=0.8, s=5)

ax.set_xlabel("PC1", fontsize=10)
ax.set_ylabel("PC2", fontsize=10)
ax.set_zlabel("PC3", fontsize=10)

# ax.legend()
plt.show()
