# Word2Vec Corpus

Reference: https://nlp.stanford.edu/projects/glove/

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from utils.preprocess import getWord2VecCorpus
from utils.utils import getVecForm
from utils.models import simpleNN, simpleLSTM

import tensorflow as tf

In [2]:
# Directory prefix for dataset files; it is concatenated directly with the file name, so keep the trailing slash.
PATH = './Datasets/'
# Embedding dimensionality, passed to getVecForm as `dims`.
DIMS = 300
# Value passed to getVecForm as `preprocess`.
PREPROCESS = 'word2vec'
# Maximum number of whitespace-separated tokens allowed per headline; also passed to getVecForm as `MaxvecLen`.
MAXVECLEN = 30

## Read Word2Vec Data

In [3]:
# Load the embedding lookup once; it is passed as `vec` to the getVecForm calls below.
VEC = getWord2VecCorpus()
# Show the object's repr to confirm what was loaded.
VEC

<gensim.models.keyedvectors.KeyedVectors at 0x1f972b11c40>

## Import Dataset

In [4]:
# Read the headlines CSV, discard rows with missing values, then renumber the index from 0.
csvPath = PATH + 'Sarcasm_Headlines_Detection.csv'
df = pd.read_csv(csvPath)
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,headline,is_sarcastic
0,versace store clerk sue secret black code mino...,0
1,roseanne revival catch thorny political mood w...,0
2,mom start fear son web series close thing gran...,1
3,boehner want wife listen come alternative debt...,1
4,rowling wish snape happy birthday magical way,0
...,...,...
28608,tyson hold contest let fan submit new idea tor...,1
28609,increasingly cocky bernie sander announce will...,1
28610,cash strap zuckerberg force sell million faceb...,1
28611,grocery store bar actually great little happy ...,1


### Remove excessively long texts

In [5]:
# Count whitespace-separated tokens per headline in one vectorised pass.
tokenCounts = df['headline'].str.split().str.len()
keepMask = tokenCounts <= MAXVECLEN
# Index labels of the headlines that exceed the limit (kept for inspection).
tooLong = df.index[~keepMask].tolist()
# Filter with a single boolean mask. Dropping label by label while resetting the
# index after every drop shifts the remaining labels, so from the second drop
# onwards the wrong row would be removed (or a KeyError raised).
df = df[keepMask].reset_index(drop=True)
df

Unnamed: 0,headline,is_sarcastic
0,versace store clerk sue secret black code mino...,0
1,roseanne revival catch thorny political mood w...,0
2,mom start fear son web series close thing gran...,1
3,boehner want wife listen come alternative debt...,1
4,rowling wish snape happy birthday magical way,0
...,...,...
28607,tyson hold contest let fan submit new idea tor...,1
28608,increasingly cocky bernie sander announce will...,1
28609,cash strap zuckerberg force sell million faceb...,1
28610,grocery store bar actually great little happy ...,1


In [6]:
# Raw headline text (model input) and sarcasm labels (target), reused by every section below.
X_ = df['headline']
Y_ = df['is_sarcastic']

## Simple NN

### Convert Dataset into Word Vectors

In [7]:
# Vectorise the headlines with vectype='sum'; the resulting X has one
# DIMS-length row per headline (see X.shape in the next cell).
X, Y = getVecForm(
    X=X_,
    Y=Y_,
    vec=VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='sum',
)

In [8]:
# Sanity check: expect (number of headlines, DIMS).
X.shape

(28612, 300)

In [9]:
# Hold out a test set (train_test_split's default test size is 25 %). A fixed
# random_state makes the split, and therefore the reported metrics, repeatable
# across reruns.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

### Training a Simple NN

In [10]:
# Build the simple NN; X is passed in, presumably so the model can infer its input shape — confirm in utils.models.
snn = simpleNN(X)
# Train for 5 epochs, reserving 30 % of the training data for validation.
snn.fit(x_train, y_train, validation_split=0.3, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1f9123425b0>

In [11]:
# Score the trained model on the held-out test split; the output is presumably [loss, accuracy] — confirm against the metrics set in utils.models.
snn.evaluate(x_test, y_test)



[0.5238915681838989, 0.7476583123207092]

## Simple LSTM (sum)

### Convert Dataset into Word Vectors

In [12]:
# Same vectype='sum' features as the simple NN, reshaped to (samples, 1, DIMS)
# so that each headline is a single-step sequence for the LSTM.
# The feature size comes from DIMS rather than X.shape[1]: this cell overwrites
# X, so on a re-run X.shape[1] would already be 1 and the reshape target would
# be wrong. The row count is still read from the X produced by an earlier cell.
X, Y = getVecForm(
    X=X_,
    Y=Y_,
    vec=VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='sum',
    reshaping=(X.shape[0], 1, DIMS),
)

In [13]:
# Sanity check: expect (number of headlines, 1, DIMS).
X.shape

(28612, 1, 300)

In [14]:
# Hold out a test set (train_test_split's default test size is 25 %). A fixed
# random_state makes the split, and therefore the reported metrics, repeatable
# across reruns.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

### Training a Simple LSTM

In [15]:
# Build the LSTM for the summed-vector input; X is passed in, presumably so the model can infer its input shape — confirm in utils.models.
lstms = simpleLSTM(X)
# Train for 10 epochs, reserving 30 % of the training data for validation.
lstms.fit(x_train, y_train, validation_split=0.3, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1fa64527d00>

In [16]:
# Score the trained model on the held-out test split; the output is presumably [loss, accuracy] — confirm against the metrics set in utils.models.
lstms.evaluate(x_test, y_test)



[0.5010948777198792, 0.7560464143753052]

## Simple LSTM (vector)

### Convert Dataset into Word Vectors

In [17]:
# Vectorise with vectype='vector' and MaxvecLen=MAXVECLEN; the resulting X has
# shape (headlines, MAXVECLEN, DIMS), as checked in the next cell. Presumably
# this keeps one vector per token, padded to MAXVECLEN — confirm in utils.utils.
X, Y = getVecForm(
    X=X_,
    Y=Y_,
    vec=VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='vector',
    MaxvecLen=MAXVECLEN,
)

In [18]:
# Sanity check: expect (number of headlines, MAXVECLEN, DIMS).
X.shape

(28612, 30, 300)

In [19]:
# Hold out a test set (train_test_split's default test size is 25 %). A fixed
# random_state makes the split, and therefore the reported metrics, repeatable
# across reruns.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

### Training a Simple LSTM

In [20]:
# Build the LSTM for the per-token sequence input; X is passed in, presumably so the model can infer its input shape — confirm in utils.models.
lstmv = simpleLSTM(X)
# Train for 10 epochs, reserving 30 % of the training data for validation.
lstmv.fit(x_train, y_train, validation_split=0.3, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1fa7de80b20>

In [21]:
# Score the trained model on the held-out test split; the output is presumably [loss, accuracy] — confirm against the metrics set in utils.models.
lstmv.evaluate(x_test, y_test)



[0.4732193350791931, 0.7807912826538086]