## Import Libraries

In [2]:
import multiprocessing
import random
import re

import contractions
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import load_npz, save_npz
from sklearn import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import tensorflow.keras.layers as layers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm, trange

import umap
import torch
from functools import reduce
from tensorflow import keras
from tensorflow.keras import layers

# Download required nltk data
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/PoYan1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/PoYan1/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/PoYan1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/PoYan1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/PoYan1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Import Dataset

In [6]:
df = pd.read_csv('./dataset/SmartLocal/smartlocal_text.csv')
rand_state = 42
text_cols = ['title', 'preview', 'content']
df

Unnamed: 0,title,preview,content,num_shares_bin
0,kampong gelam bazaar 2023 light projection sul...,there elevated mezzanine seating area fairy li...,kampong gelam bazaar 2023 every ramadan muslim...,Low Shares
1,headrock vr virtual reality theme park sentosa...,the hyperrealistic virtual skyscraper game sen...,headrock vr singapore looking fun activity ide...,Low Shares
2,30 free steam game add library coop shooter ho...,thankfully great game require portion paycheck...,free steam game play home gamers singapore agr...,High Shares
3,marina south pier ferry ride southern island r...,if looking outdoor adventure beyond location l...,marina south pier guide looking outdoor advent...,High Shares
4,tayo station massive indoor playground multi z...,fan tayo little bus right way please,tayo station downtown east kid want play care ...,Low Shares
...,...,...,...,...
3987,12 best lobangs july 2019 1for1 mookata buffet...,enjoy july 2019 handful lobangs stretch dollar...,best deal singapore july 2019 karaoke manekine...,High Shares
3988,5 bathroom fitting hdb singapore give 5star sh...,how make bathroom luxe would taking extralong ...,bathroom fitting hbd singapore hansgrohe long ...,High Shares
3989,capri fraser new igfriendly hotel cbd gym pool...,the new capri fraser china square staycays res...,the new capri fraser china square staycays res...,High Shares
3990,20 thing july 2019 50cents fest street fighter...,thing july 2019 like half year gone smack midd...,thing july 2019 like half year gone smack midd...,High Shares


In [7]:
def remove_start_end(text):
    text = re.sub('Follow us on Telegram for the latest updates: https://t.me/TSLMedia ', '', text)
    text = re.sub(" Get more stories like this. Drop us your email so you won't miss the latest news. Name Your Name Email Your Email Subscribe", '', text)
    return text

def lemmatize(text: str) -> str:
    """Converts a text into its lemmatized form."""
    wnl = WordNetLemmatizer()
    return ' '.join([wnl.lemmatize(word) for word in text.split()])

def remove_punctuations(text: str) -> str:
    """Removes punctuations and keeps all alphanumerical text."""
    return re.sub(r'[^\w\s]', '', text)

def remove_tags(text):
    # removes social media tags
    return re.sub('@\w+', '', text.lower())

def remove_stopwords(text, stopwords):
    """Removes stopwords from text."""
    for s in stopwords:
        pattern = f' {s} '
        text = re.sub(pattern, ' ', text)
    return text

In [9]:
# Default stop words
stop_words = stopwords.words('english')

# Extend stop words
stop_words = ['cover image adapted from', 'cover image credits',
              'cover image credit', 'image credits', 'image credit',
              'image adapted from', 'photography by', 'ha', 'wa'] + stop_words

# List of default stopwords with punctuations removed, to avoid issues of encoding
stop_words_no_punc = [remove_punctuations(word) for word in stop_words]

functions = [remove_start_end, remove_tags, contractions.fix, remove_punctuations, 
             lambda z: remove_stopwords(text=z, stopwords=stop_words), 
             lemmatize]
mass_apply = lambda x: reduce(lambda y, f: f(y), functions, x)

# Process title, preview and content
for text_col in text_cols:
    df[text_col] \
        .apply(mass_apply) \
        .to_csv(f'./dataset/{text_col}_processed.csv', index=False)

# dataset/text_embedding

In [11]:
text_processed = {}
for text_col in text_cols:
    train_df, test_df = train_test_split(pd.read_csv(f'./dataset/{text_col}_processed.csv'), test_size=0.2, random_state=rand_state)
    text_processed[text_col] = {'train': train_df.to_numpy(), 'test': test_df.to_numpy()}

In [31]:
def avg_word2vec(model, model_vocabs, tokenized_sentences, size):
    try:
        list_of_wv = [[model.wv[token] for token in sentence if token in model_vocabs] for sentence in tokenized_sentences]
    except:
        list_of_wv = [[model.get_vector(token) for token in sentence if token in model_vocabs] for sentence in tokenized_sentences]
    
    list_of_avg_wv = []
    for wvs in list_of_wv:
        wvs = np.array(wvs)
        if len(wvs) > 0:
            list_of_avg_wv.append(wvs.mean(axis=0))
        else:
            list_of_avg_wv.append(np.zeros(size, dtype=float))
    return np.array(list_of_avg_wv)

def tagged_document(post):
    return TaggedDocument(words=post)

def avg_doc2vec(model, tagged_docs, category_dict):
    sents = tagged_docs.values
    targets, regressors = zip(*[(category_dict[doc.tags], model.infer_vector(doc.words)) for doc in sents])
    return np.array(regressors)


def bert_dist_embed(posts, tokenizer, model):
    embedding_res = np.empty(shape=(0, 768))
    for batch_no in trange(0, len(posts), 100):
        tokenized = tokenizer(list(posts[batch_no:batch_no+100]), 
                              padding = True, 
                              truncation = True, 
                              return_tensors = "pt")
    with torch.no_grad():
        hidden = model(**tokenized)
        
    batch = hidden.last_hidden_state[:,0,:].cpu().detach().numpy()
    embedding_res = np.append(embedding_res, batch, axis=0)
    return embedding_res

### 1. Trained Word2Vec (Skip-Gram, CBOW)

In [22]:
for text_col in text_cols:
    train = [sentence[0].split() for sentence in text_processed[text_col]['train']]
    test = [sentence[0].split() for sentence in text_processed[text_col]['test']]
    
    skip_gram_model = Word2Vec(train, sg=1, min_count=1)
    skipgram_wordpool = set(skip_gram_model.wv.index_to_key)
    X_train_sg = avg_word2vec(skip_gram_model, skipgram_wordpool, train, 100)
    X_test_sg = avg_word2vec(skip_gram_model, skipgram_wordpool, test, 100)
    pd.DataFrame(X_train_sg).to_csv(f'./dataset/text_embedding/{text_col}/emb_sg_train.csv', index=False)
    pd.DataFrame(X_test_sg).to_csv(f'./dataset/text_embedding/{text_col}/emb_sg_test.csv', index=False)
    
    cbow_model = Word2Vec(train, sg=0, min_count=1)
    dbow_wordpool = set(cbow_model.wv.index_to_key)
    X_train_cbow = avg_word2vec(cbow_model, dbow_wordpool, train, 100)
    X_test_cbow = avg_word2vec(cbow_model, dbow_wordpool, test, 100)
    pd.DataFrame(X_train_cbow).to_csv(f'./dataset/text_embedding/{text_col}/emb_cbow_train.csv', index=False)
    pd.DataFrame(X_test_cbow).to_csv(f'./dataset/text_embedding/{text_col}/emb_cbow_test.csv', index=False)
    
    google_file = './dataset/Pretrained Embedding Model/GoogleNews-vectors-negative300.bin'
    google_model = KeyedVectors.load_word2vec_format(google_file, binary=True)
    google_wordpool = set(google_model.index_to_key)
    X_train_ggl = avg_word2vec(google_model, google_wordpool, train, 300)
    X_test_ggl = avg_word2vec(google_model, google_wordpool, test, 300)
    pd.DataFrame(X_train_ggl).to_csv(f'./dataset/text_embedding/{text_col}/emb_ggl_train.csv', index=False)
    pd.DataFrame(X_test_ggl).to_csv(f'./dataset/text_embedding/{text_col}/emb_ggl_test.csv', index=False)
    
    glove_file = './dataset/Pretrained Embedding Model/glove.6B.100d.txt'
    glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)
    glove_wordpool = set(glove_model.index_to_key)
    X_train_glove = avg_word2vec(glove_model, glove_wordpool, train, 100)
    X_test_glove = avg_word2vec(glove_model, glove_wordpool, test, 100)
    pd.DataFrame(X_train_glove).to_csv(f'./dataset/text_embedding/{text_col}/emb_glove_train.csv', index=False)
    pd.DataFrame(X_test_glove).to_csv(f'./dataset/text_embedding/{text_col}/emb_glove_test.csv', index=False)
    
    print(X_train_sg.shape, X_test_sg.shape, X_train_cbow.shape, X_test_cbow.shape, X_train_ggl.shape, X_test_ggl.shape, X_train_glove.shape, X_test_glove.shape)

(3193, 100) (799, 100) (3193, 100) (799, 100) (3193, 300) (799, 300) (3193, 100) (799, 100)
(3193, 100) (799, 100) (3193, 100) (799, 100) (3193, 300) (799, 300) (3193, 100) (799, 100)
(3193, 100) (799, 100) (3193, 100) (799, 100) (3193, 300) (799, 300) (3193, 100) (799, 100)


### 2. Trained Doc2Vec

In [19]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=rand_state)
# represent each sentence as a TaggedDocument containing 2 parameters, words=tokenized_sentence and tag=label
train_r = pd.DataFrame({'num_shares_bin': train_df['num_shares_bin'].to_numpy(), 'title': train_df['title'].to_numpy()}, columns=['num_shares_bin', 'title'])
test_r = pd.DataFrame({'num_shares_bin': test_df['num_shares_bin'].to_numpy(), 'title': test_df['title'].to_numpy()}, columns=['num_shares_bin', 'title'])
train_tagged = train_r.apply(lambda r: TaggedDocument(words=r['title'], tags=[r['num_shares_bin']]), axis=1)
test_tagged = test_r.apply(lambda r: TaggedDocument(words=r['title'], tags=[r['num_shares_bin']]), axis=1)

# visualize a TaggedDocument
train_tagged[0]

TaggedDocument(words='qoo10 deal 099 brand like koi shihlin plus chance score new iphone 14', tags=['Low Shares'])

In [34]:
# use multiple cores
cores = multiprocessing.cpu_count()

# Define the category dictionary
category_dict = {'Low Shares': 0, 'Average Shares': 1, 'High Shares': 2}

for text_col in text_cols:
    train = [sentence[0].split() for sentence in text_processed[text_col]['train']]
    test = [sentence[0].split() for sentence in text_processed[text_col]['test']]
    
    # represent each sentence as a TaggedDocument containing 2 parameters, words=tokenized_sentence and tag=label
    train_r = pd.DataFrame({'num_shares_bin': train_df['num_shares_bin'].astype(str), text_col: train}, columns=['num_shares_bin', text_col])
    test_r = pd.DataFrame({'num_shares_bin': test_df['num_shares_bin'].astype(str), text_col: test}, columns=['num_shares_bin', text_col])
    train_tagged = train_r.apply(lambda r: TaggedDocument(words=r[text_col], tags=r['num_shares_bin']), axis=1)
    test_tagged = test_r.apply(lambda r: TaggedDocument(words=r[text_col], tags=r['num_shares_bin']), axis=1)

    # implement Distributed Bag of Words (DBOW) (similar concept to skip-gram)
    model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
    model_dbow.build_vocab([x for x in tqdm(train_tagged.values, disable=True)])
    for epoch in range(30):
        model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values, disable=True)]), total_examples=len(train_tagged.values), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    # implement Distributed Memory (DM) (similar concept to CBOW)
    model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
    model_dm.build_vocab([x for x in tqdm(train_tagged.values, disable=True)])
    for epoch in range(30):
        model_dm.train(utils.shuffle([x for x in tqdm(train_tagged.values, disable=True)]), total_examples=len(train_tagged.values), epochs=1)
        model_dm.alpha -= 0.002
        model_dm.min_alpha = model_dm.alpha
        
    # dataset/text_embedding using dbow
    X_train_dbow = avg_doc2vec(model_dbow, train_tagged, category_dict)
    X_test_dbow = avg_doc2vec(model_dbow, test_tagged, category_dict)

    # dataset/text_embedding using dm
    X_train_dm = avg_doc2vec(model_dm, train_tagged, category_dict)
    X_test_dm = avg_doc2vec(model_dm, test_tagged, category_dict)

    # dataset/text_embedding by combining a paragraph vector from DBOW and DM to improve performance
    model_dbow.wv.fill_norms(force=True)
    model_dm.wv.fill_norms(force=True)
    model_dbow_dm = ConcatenatedDoc2Vec([model_dbow, model_dm])
    X_train_dbow_dm = avg_doc2vec(model_dbow_dm, train_tagged, category_dict)
    X_test_dbow_dm = avg_doc2vec(model_dbow_dm, test_tagged, category_dict)
    
    # export to csv
    pd.DataFrame(X_train_dbow).to_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_train.csv', index=False)
    pd.DataFrame(X_test_dbow).to_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_test.csv', index=False)
    pd.DataFrame(X_train_dm).to_csv(f'./dataset/text_embedding/{text_col}/emb_dm_train.csv', index=False)
    pd.DataFrame(X_test_dm).to_csv(f'./dataset/text_embedding/{text_col}/emb_dm_test.csv', index=False)
    pd.DataFrame(X_train_dbow_dm).to_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_dm_train.csv', index=False)
    pd.DataFrame(X_test_dbow_dm).to_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_dm_test.csv', index=False)
    
    print(X_train_dbow.shape, X_test_dbow.shape, X_train_dm.shape, X_test_dm.shape, X_train_dbow_dm.shape, X_test_dbow_dm.shape)

(3193, 300) (799, 300) (3193, 300) (799, 300) (3193, 600) (799, 600)
(3193, 300) (799, 300) (3193, 300) (799, 300) (3193, 600) (799, 600)
(3193, 300) (799, 300) (3193, 300) (799, 300) (3193, 600) (799, 600)


### 3. TF-IDF

In [35]:
for text_col in text_cols:
    vectorizer = TfidfVectorizer(ngram_range = (2, 2))
    X_train_tfidf = vectorizer.fit_transform(map(lambda x: ' '.join(x), train))
    X_test_tfidf = vectorizer.transform(map(lambda x: ' '.join(x), test))
    
    save_npz(f'./dataset/text_embedding/{text_col}/emb_tfidf_train.npz', X_train_tfidf)
    save_npz(f'./dataset/text_embedding/{text_col}/emb_tfidf_test.npz', X_test_tfidf)

    print(X_train_tfidf.shape, X_test_tfidf.shape)

(3193, 1423534) (799, 1423534)
(3193, 1423534) (799, 1423534)
(3193, 1423534) (799, 1423534)


### 4. BERT

In [36]:
embedder = SentenceTransformer('all-mpnet-base-v2')
for text_col in text_cols:
    train = text_processed[text_col]['train'].reshape(-1)
    test = text_processed[text_col]['test'].reshape(-1)

    X_train_bert = embedder.encode(train)
    X_test_bert = embedder.encode(test)
    X_train_bert = pd.DataFrame(X_train_bert)
    X_test_bert = pd.DataFrame(X_test_bert)
    pd.DataFrame(X_train_bert).to_csv(f'./dataset/text_embedding/{text_col}/emb_bert_train.csv', index=False)
    pd.DataFrame(X_test_bert).to_csv(f'./dataset/text_embedding/{text_col}/emb_bert_test.csv', index=False) 
    
    print(X_train_bert.shape, X_test_bert.shape)

(3193, 768) (799, 768)
(3193, 768) (799, 768)
(3193, 768) (799, 768)


# Evaluation


In [37]:
length = 5

# # word2vec: skip-gram
X_train_sg = []
X_test_sg = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_sg_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_sg_train.csv')
    X_train_sg_text_col = pd.DataFrame(dim.fit_transform(X_train_sg_text_col))
    X_train_sg.append(X_train_sg_text_col)
    
    X_test_sg_test_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_sg_test.csv')
    X_test_sg_test_col = pd.DataFrame(dim.transform(X_test_sg_test_col))
    X_test_sg.append(X_test_sg_test_col)
X_train_sg = pd.concat(X_train_sg, axis=1)
X_test_sg = pd.concat(X_test_sg, axis=1)
    
# word2vec: cbow
X_train_cbow = []
X_test_cbow = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_cbow_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_cbow_train.csv')
    X_train_cbow_text_col = pd.DataFrame(dim.fit_transform(X_train_cbow_text_col))
    X_train_cbow.append(X_train_cbow_text_col)
    
    X_test_cbow_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_cbow_test.csv')
    X_test_cbow_text_col = pd.DataFrame(dim.transform(X_test_cbow_text_col))
    X_test_cbow.append(X_test_cbow_text_col)
X_train_cbow = pd.concat(X_train_cbow, axis=1)
X_test_cbow = pd.concat(X_test_cbow, axis=1)

# doc2vec: dbow
X_train_dbow = []
X_test_dbow = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_dbow_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_train.csv')
    X_train_dbow_text_col = pd.DataFrame(dim.fit_transform(X_train_dbow_text_col))
    X_train_dbow.append(X_train_dbow_text_col)
    
    X_test_dbow_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_test.csv')
    X_test_dbow_text_col = pd.DataFrame(dim.transform(X_test_dbow_text_col))
    X_test_dbow.append(X_test_dbow_text_col)
X_train_dbow = pd.concat(X_train_dbow, axis=1)
X_test_dbow = pd.concat(X_test_dbow, axis=1)

# doc2vec: dm
X_train_dm = []
X_test_dm = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_dm_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dm_train.csv')
    X_train_dm_text_col = pd.DataFrame(dim.fit_transform(X_train_dm_text_col))
    X_train_dm.append(X_train_dm_text_col)
    
    X_test_dm_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dm_test.csv')
    X_test_dm_text_col = pd.DataFrame(dim.transform(X_test_dm_text_col))
    X_test_dm.append(X_test_dm_text_col)
X_train_dm = pd.concat(X_train_dm, axis=1)
X_test_dm = pd.concat(X_test_dm, axis=1)

# doc2vec: dbow + dm
X_train_dbow_dm = []
X_test_dbow_dm = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_dbow_dm_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_dm_train.csv')
    X_train_dbow_dm_text_col = pd.DataFrame(dim.fit_transform(X_train_dbow_dm_text_col))
    X_train_dbow_dm.append(X_train_dbow_dm_text_col)
    
    X_test_dbow_dm_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_dbow_dm_test.csv')
    X_test_dbow_dm_text_col = pd.DataFrame(dim.transform(X_test_dbow_dm_text_col))
    X_test_dbow_dm.append(X_test_dbow_dm_text_col)
X_train_dbow_dm = pd.concat(X_train_dbow_dm, axis=1)
X_test_dbow_dm = pd.concat(X_test_dbow_dm, axis=1)

# tf-idf
X_train_tfidf = []
X_test_tfidf = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_tfidf_text_col = load_npz(f'./dataset/text_embedding/{text_col}/emb_tfidf_train.npz')
    X_train_tfidf_text_col = pd.DataFrame(dim.fit_transform(X_train_tfidf_text_col))
    X_train_tfidf.append(X_train_tfidf_text_col)
    
    X_test_tfidf_text_col = load_npz(f'./dataset/text_embedding/{text_col}/emb_tfidf_test.npz')
    X_test_tfidf_text_col = pd.DataFrame(dim.transform(X_test_tfidf_text_col))
    X_test_tfidf.append(X_test_tfidf_text_col)
X_train_tfidf = pd.concat(X_train_tfidf, axis=1)
X_test_tfidf = pd.concat(X_test_tfidf, axis=1)

# bert
X_train_bert = []
X_test_bert = []
for text_col in text_cols:
    dim = umap.UMAP(n_components=length)
    
    X_train_bert_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_bert_train.csv')
    X_train_bert_text_col = pd.DataFrame(dim.fit_transform(X_train_bert_text_col))
    X_train_bert.append(X_train_bert_text_col)
    
    X_test_bert_text_col = pd.read_csv(f'./dataset/text_embedding/{text_col}/emb_bert_test.csv')
    X_test_bert_text_col = pd.DataFrame(dim.transform(X_test_bert_text_col))
    X_test_bert.append(X_test_bert_text_col)
X_train_bert = pd.concat(X_train_bert, axis=1)
X_test_bert = pd.concat(X_test_bert, axis=1)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [38]:
# labels
y_train = train_df['num_shares_bin']
y_test = test_df['num_shares_bin']

In [99]:
# deep learning model classes
from sklearn.preprocessing import LabelEncoder

class LSTM:
  def __init__(self):
    self.model = None
    self.le = LabelEncoder()

  def fit(self, X_train, y_train):
    y_train_encoded = self.le.fit_transform(y_train)
    self.model = keras.Sequential([
        layers.Input(shape=(X_train.shape[1],1)),
        layers.SpatialDropout1D(0.2),
        layers.LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        layers.LSTM(200, recurrent_dropout=0.2, return_sequences=True),
        layers.GlobalMaxPooling1D(),
        layers.Dense(300, activation='relu'),
        layers.Dense(300, activation='relu'),
        layers.Dense(3, activation='softmax')
      ])
    self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    self.model.fit(X_train, y_train_encoded, batch_size=200, epochs=6, verbose=0)

  def predict(self, X_test):
    y_pred_encoded = self.model.predict(X_test).argmax(axis=-1)
    y_pred = self.le.inverse_transform(y_pred_encoded)
    return y_pred

  def evaluate(self, X_test, y_test):
    y_test_encoded = self.le.transform(y_test)
    return self.model.evaluate(X_test, y_test_encoded, verbose=0)
  

class CNN:
  def __init__(self):
    self.model = None
    self.le = LabelEncoder()

  def fit(self, X_train, y_train):
    y_train = self.le.fit_transform(y_train)
    model = keras.Sequential([
        layers.Input(shape=(X_train.shape[1],1)),
        layers.Conv1D(filters=128, kernel_size=5, strides=1, activation='relu', padding='same'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(500, activation='relu'), # FCNN
        layers.Dropout(0.3),
        layers.Dense(500, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(3, activation='softmax', name = 'Output') # output 
      ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, batch_size=200, epochs=6, verbose=0)
    self.model = model

  def predict(self, X_test):
    y_pred = self.model.predict(X_test)
    return self.le.inverse_transform(np.argmax(y_pred, axis=1))

  def evaluate(self, X_test, y_test):
    y_test = self.le.transform(y_test)
    return self.model.evaluate(X_test, y_test, verbose=0)


In [100]:
# all embeddings
embedding_dict = {
    'Word2Vec: Skip-Gram' : {'X_train': X_train_sg, 'y_train': y_train, 'X_test': X_test_sg, 'y_test': y_test},
    'Word2Vec: CBOW' : {'X_train': X_train_cbow, 'y_train': y_train, 'X_test': X_test_cbow, 'y_test': y_test},
    'Doc2Vec: DBOW' : {'X_train': X_train_dbow, 'y_train': y_train, 'X_test': X_test_dbow, 'y_test': y_test},
    'Doc2Vec: DM' : {'X_train': X_train_dm, 'y_train': y_train, 'X_test': X_test_dm, 'y_test': y_test},
    'Doc2Vec: DBOW+DM' : {'X_train': X_train_dbow_dm, 'y_train': y_train, 'X_test': X_test_dbow_dm, 'y_test': y_test},
    'TF-IDF w/ Bigram': {'X_train': X_train_tfidf, 'y_train': y_train, 'X_test': X_test_tfidf, 'y_test': y_test},
    'Bert': {'X_train': X_train_bert, 'y_train': y_train, 'X_test': X_test_bert, 'y_test': y_test}
}

In [101]:
models_to_evaluate = {
    'LSTM': LSTM(),
    'CNN': CNN(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'SVM': SVC(),
}

In [108]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def embeddings_evaluation(all_embeddings, model, model_name):
    score_df = pd.DataFrame()
    for name, embedding in all_embeddings.items():
        try:
            X_train = embedding.get('X_train')
            y_train = embedding.get('y_train')
            X_test = embedding.get('X_test')
            y_test = embedding.get('y_test')
            model.fit(X_train, y_train)
            y_pred_probs = model.predict(X_test)
            y_pred = np.argmax(y_pred_probs, axis=1)
            if len(y_test) == 0:
                acc, f1, precision, recall = 0, 0, 0, 0
            else:
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='weighted')
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
        except ValueError:
            X_train = embedding.get('X_train')
            y_train = embedding.get('y_train')
            X_test = embedding.get('X_test')
            y_test = embedding.get('y_test')
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            if len(y_test) == 0:
                acc, f1, precision, recall = 0, 0, 0, 0
            else:
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='weighted')
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
        new_row = {
            'Model': model_name,
            'Embedding': name, 
            'Accuracy': acc, 
            'Precision': precision,
            'Recall': recall, 
            'F1-score': f1,
        }

        score_df = pd.concat([score_df, pd.DataFrame(new_row, index=[0])], ignore_index=True)
    return score_df


In [109]:
evaluation_result = pd.DataFrame()
evaluation_result_dic = {}

for model_name, model in models_to_evaluate.items():
  scores = embeddings_evaluation(embedding_dict, model, model_name)
  evaluation_result = pd.concat([evaluation_result, scores])
  evaluation_result_dic[model_name] = scores



  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))




In [110]:
evaluation_result

Unnamed: 0,Model,Embedding,Accuracy,Precision,Recall,F1-score
0,LSTM,Word2Vec: Skip-Gram,0.349186,0.232504,0.349186,0.275927
1,LSTM,Word2Vec: CBOW,0.374218,0.380117,0.374218,0.355789
2,LSTM,Doc2Vec: DBOW,0.322904,0.312784,0.322904,0.217106
3,LSTM,Doc2Vec: DM,0.342929,0.1176,0.342929,0.17514
4,LSTM,Doc2Vec: DBOW+DM,0.32791,0.192975,0.32791,0.16394
5,LSTM,TF-IDF w/ Bigram,0.330413,0.109173,0.330413,0.164119
6,LSTM,Bert,0.326658,0.106706,0.326658,0.160864
0,CNN,Word2Vec: Skip-Gram,0.354193,0.247812,0.354193,0.261028
1,CNN,Word2Vec: CBOW,0.380476,0.384003,0.380476,0.373086
2,CNN,Doc2Vec: DBOW,0.372966,0.384626,0.372966,0.317516
