## **CS3244 Project - Models**

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV

In [2]:
# General function for running any model
def run(X_train, y_train, X_test, y_test, model):
  model.fit(X_train, y_train)
  X_train_predict = model.predict(X_train)
  X_test_predict = model.predict(X_test)
  train_score_roc = roc_auc_score(y_train, X_train_predict)
  test_score_roc = roc_auc_score(y_test, X_test_predict)
  train_score = precision_recall_fscore_support(y_train, X_train_predict, average='micro')
  test_score = precision_recall_fscore_support(y_test, X_test_predict, average='micro')
  print(f"Training performance (ROC-AUC): {train_score_roc}")
  print(f"Test performance (ROC-AUC): {test_score_roc}")
  print(f"Training performance (Precision - Recall - F1): {train_score}")
  print(f"Test performance (Precision - Recall - F1): {test_score}")

In [3]:
# Display best hyperparameters for a given model
def tune(X_train, y_train, X_test, y_test, model, hyperparameters):
  clf = GridSearchCV(model, hyperparameters)
  clf.fit(X_train, y_train)
  print("Best hyperparameters: " + str(clf.best_params_))

  # Run the model with the best params
  model.set_params(**clf.best_params_)
  run(X_train, y_train, X_test, y_test, model)

## Import Data

In [None]:
# Run this only when you are using Google Colab
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download danofer/sarcasm
! unzip sarcasm
! kaggle datasets download chenyiyang/reddit-sarcasm-extracted-features-dataset
! unzip reddit-sarcasm-extracted-features-dataset

Downloading sarcasm.zip to /content
 99% 214M/216M [00:01<00:00, 150MB/s]
100% 216M/216M [00:01<00:00, 149MB/s]
Archive:  sarcasm.zip
  inflating: test-balanced.csv       
  inflating: test-unbalanced.csv     
  inflating: train-balanced-sarc.csv.gz  
  inflating: train-balanced-sarcasm.csv  
Downloading reddit-sarcasm-extracted-features-dataset.zip to /content
100% 14.2M/14.2M [00:00<00:00, 45.4MB/s]

Archive:  reddit-sarcasm-extracted-features-dataset.zip
  inflating: X-test-v1.0.csv         
  inflating: X-train-v1.0.csv        
  inflating: y-test-v1.0.csv         
  inflating: y-train-v1.0.csv        


In [None]:
# Run if you are using this notebook offline
# ! /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install pandas
# ! /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install sklearn
# ! /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install keras
# ! /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install tensorflow

In [4]:
# Read data
df = pd.read_csv('train-balanced-sarcasm.csv')
df['comment'].replace('', np.nan, inplace=True)
df.dropna(subset=['comment'], inplace=True)
X, y = df['comment'].values, df['label'].values

# Split into train (size 808618) and test (size 202155)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Generate Input Vectors

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def to_count(X_train, X_test):
  vectorizer = CountVectorizer()
  return vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

MIN_DF = 2
TOP_K = 1000000
BEST_NGRAM_RANGE = (1, 3)

def to_tfidf(X_train, X_test, ngram_range):
  vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=MIN_DF, 
    dtype=np.float64, strip_accents='unicode', decode_error='replace')
  X_train_tfidf = vectorizer.fit_transform(X_train)
  X_test_tfidf = vectorizer.transform(X_test)

  # Set a limit on the number of features in a vector
  selector = SelectKBest(f_classif, k=min(TOP_K, X_train_tfidf.shape[1]))
  selector.fit(X_train_tfidf, y_train)
  X_train_tfidf = selector.transform(X_train_tfidf).astype('float32')
  X_test_tfidf = selector.transform(X_test_tfidf).astype('float32')
  return X_train_tfidf, X_test_tfidf

In [7]:
# vocab_size = 159401
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def to_keras_embed(X_train, X_test):
  tokenizer = Tokenizer(num_words=160000)
  tokenizer.fit_on_texts(X_train)
  X_train_embed = tokenizer.texts_to_sequences(X_train)
  X_test_embed = tokenizer.texts_to_sequences(X_test)
  vocab_size = len(tokenizer.word_index) + 1

  # Pad with zeros so each embedding vector is equal length
  X_train_embed = pad_sequences(X_train_embed, padding='post', maxlen=200)
  X_test_embed = pad_sequences(X_test_embed, padding='post', maxlen=200)
  return X_train_embed, X_test_embed

In [8]:
import nltk
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

def to_dtv_embed(X_train, X_test):
  # Implemented based on https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
  tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(X_train)]
  model = Doc2Vec(size=250, alpha=0.025, min_alpha=0.00025, min_count=1, dm=1)
  model.build_vocab(tagged_data)
  model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
  model.alpha -= 0.0002
  model.min_alpha = model.alpha
  X_train_dtv = [model.infer_vector(word_tokenize(comment.lower())) for comment in X_train]
  X_test_dtv = [model.infer_vector(word_tokenize(comment.lower())) for comment in X_test]
  return X_train_dtv, X_test_dtv

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
X_train_counts, X_test_counts = to_count(X_train, X_test)
X_train_tfidf, X_test_tfidf = to_tfidf(X_train, X_test, BEST_NGRAM_RANGE)
X_train_keras, X_test_keras = to_keras_embed(X_train, X_test)
X_train_features, X_test_features = np.genfromtxt("X-train-v1.0.csv", delimiter=','), np.genfromtxt("X-test-v1.0.csv", delimiter=',')
# X_train_dtv, X_test_dtv = to_dtv_embed(X_train, X_test)

## **Convolutional Neural Network (CNN)**

We implement a Convolutional Neural Network (CNN) model with reference to [this article](https://analyticsindiamag.com/guide-to-text-classification-using-textcnn/).

In [9]:
# Create CNN model
from keras.models import Sequential
from keras import layers

# Evaluation

In [10]:
# Get a CNN model
def get_cnn_model(epoch, vocab_size, embedding_dim, maxlen):
  textcnnmodel = Sequential()
  textcnnmodel.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
  textcnnmodel.add(layers.Conv1D(256, 5, activation='relu'))
  textcnnmodel.add(layers.GlobalMaxPooling1D())
  textcnnmodel.add(layers.Dense(10, activation='relu'))
  textcnnmodel.add(layers.Dense(1, activation='sigmoid'))
  textcnnmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  textcnnmodel.summary()
  return textcnnmodel

In [11]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec, KeyedVectors

# creating corpus
corpus_text = 'n'.join(df['comment'])
data = []
# iterate through each sentence in the file
for i in sent_tokenize(corpus_text):
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)

# Building word2vec model using Gensim
#CBOW
# model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=0)
#skip-gram
# model2 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Can consider using context + doc2vec as stated in article:
https://medium.com/@namanjain2050/using-deep-learning-to-identify-sarcasm-100a4a4ceaea

In [12]:
# getting Google pretrained W2V
!pip install wget==3.2
from keras.models import load_model

import os
import wget
import gzip
import shutil

gn_vec_path = "GoogleNews-vectors-negative300.bin"
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    if not os.path.exists("../Ch2/GoogleNews-vectors-negative300.bin"):
        #Downloading the reqired model
        if not os.path.exists("../Ch2/GoogleNews-vectors-negative300.bin.gz"):
            if not os.path.exists("GoogleNews-vectors-negative300.bin.gz"):
                wget.download("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz")
            gn_vec_zip_path = "GoogleNews-vectors-negative300.bin.gz"
        else:
            gn_vec_zip_path = "../Ch2/GoogleNews-vectors-negative300.bin.gz"
        #Extracting the required model
        with gzip.open(gn_vec_zip_path, 'rb') as f_in:
            with open(gn_vec_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    else:
        gn_vec_path = "../Ch2/" + gn_vec_path

print(f"Model at {gn_vec_path}")

Model at GoogleNews-vectors-negative300.bin


In [13]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

import gensim
from gensim.models import Word2Vec, KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

train_bal = df.sample(frac=0.8)
remaining = df.drop(train_bal.index)
cv_bal = remaining.sample(frac=0.5)
test_bal = remaining.drop(cv_bal.index)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_bal['comment'].values)

encoded_comments_train = tokenizer.texts_to_sequences(train_bal['comment'])
encoded_comments_cv = tokenizer.texts_to_sequences(cv_bal['comment'])
encoded_comments_test = tokenizer.texts_to_sequences(test_bal['comment'])

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

padded_comments_train = pad_sequences(encoded_comments_train, maxlen=200, padding='post')
padded_comments_cv = pad_sequences(encoded_comments_cv, maxlen=200, padding='post')
padded_comments_test = pad_sequences(encoded_comments_test, maxlen=200, padding='post')

y_train = train_bal['label'].values
y_cv = cv_bal['label'].values
y_test = test_bal['label'].values

y_train = to_categorical(y_train, num_classes=2)
y_cv = to_categorical(y_cv, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
159478


In [14]:
#loading our W2V pre-trained vectors
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# create a weight matrix for words in dictionary
embedding_matrix_w2v = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = w2v_model[word]
    except:
        embedding_vector = [0]*300
    
    if embedding_vector is not None:
        embedding_matrix_w2v[i] = embedding_vector

In [23]:
from keras import Input, layers, Model, callbacks, optimizers
import tensorflow as tf
from keras import backend as K

# model definition - this model is purely based on content

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
        precision = precision_m(y_true, y_pred)
        recall = recall_m(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

input_data = Input(shape=(200,), name='main_input')
embedding_layer = layers.Embedding(vocab_size, 300, weights=[embedding_matrix_w2v])(input_data)
conv_1 = layers.Conv1D(filters=128, kernel_size=4, activation='relu')(embedding_layer)
max_1 = layers.MaxPooling1D(pool_size=2)(conv_1)
conv_2 = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(max_1)
max_2 = layers.MaxPooling1D(pool_size=2)(conv_2)
flatten = layers.Flatten()(max_2)
dense = layers.Dense(100, activation='relu', name='fully_connected')(flatten)
out = layers.Dense(2, activation='softmax')(dense)

model_01 = Model(inputs=[input_data], outputs=[out])

print(model_01.summary())

#defining checkpoints
tensorboard = callbacks.TensorBoard(log_dir='model_01')

reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_f1_m', 
                              mode = 'max', 
                              factor=0.5, 
                              patience=5, 
                              min_lr=0.0001, 
                              verbose=10)

checkpoint = callbacks.ModelCheckpoint("model_01.h5", 
                               monitor="val_f1_m", 
                               mode="max", 
                               save_best_only = True, 
                               verbose=1)

earlystop = callbacks.EarlyStopping(monitor = 'val_f1_m', 
                            mode="max", 
                            min_delta = 0, 
                            patience = 2,
                            verbose=1)

#compiling model
c = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model_01.compile(optimizer=c, loss='categorical_crossentropy', metrics=['acc', f1_m])

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 main_input (InputLayer)     [(None, 200)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 200, 300)          47843400  
                                                                 
 conv1d_8 (Conv1D)           (None, 197, 128)          153728    
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 98, 128)          0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 96, 64)            24640     
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 48, 64)           0         
 1D)                                                       

In [27]:
#training
h1 = model_01.fit(padded_comments_train, y_train, 
               batch_size=64,
               epochs=10, 
               verbose=1, callbacks=[tensorboard, checkpoint, earlystop, reduce_lr], 
               validation_data=(padded_comments_cv, y_cv))

Epoch 1/10
Epoch 00001: val_f1_m did not improve from 0.72577
Epoch 2/10
Epoch 00002: val_f1_m did not improve from 0.72577
Epoch 3/10
Epoch 00003: val_f1_m did not improve from 0.72577
Epoch 00003: early stopping


In [28]:
from sklearn.metrics import classification_report, confusion_matrix

score_1 = model_01.evaluate(padded_comments_test, y_test, verbose=True)
print(score_1)

cnf_mat = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(model_01.predict(padded_comments_test), axis=1))

print(cnf_mat)
# sns.heatmap(cnf_mat, annot=True, fmt='g', linewidths=.5, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
            
# plt.plot(h1.history['f1_m'][1:])
# plt.plot(h1.history['val_f1_m'][1:])
# plt.title('model iou metric')
# plt.ylabel('F1 metric')
# plt.xlabel('epoch')
# plt.legend(['train','Validation'], loc='upper left')
# plt.show()

[0.7139533162117004, 0.6979530453681946, 0.6979132890701294]
[[34411 16293]
 [14237 36136]]
