# GloVe Corpus

Reference: https://nlp.stanford.edu/projects/glove/

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from utils.preprocess import getGloveCorpus
from utils.utils import getVecForm
from utils.models import simpleNN, simpleLSTM

import tensorflow as tf

In [2]:
PATH = './Datasets/'
DIMS = 300
PREPROCESS = 'glove'
MAXVECLEN = 30

## Read GloVe Data

In [3]:
VEC = getGloveCorpus(dims=DIMS)
VEC

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
the,0.046570,0.213135,-0.007435,-0.458496,-0.035645,0.236450,-0.288330,0.215210,-0.134888,-1.641602,...,-0.013062,-0.296875,-0.079895,0.194946,0.031555,0.285156,-0.087463,0.009064,-0.209839,0.053925
",",-0.255371,-0.257324,0.131714,-0.042694,0.218140,-0.022705,-0.178589,0.107544,0.058929,-1.385742,...,0.075989,-0.014359,-0.073792,0.221802,0.146484,0.566895,0.053314,-0.232910,-0.122253,0.354980
.,-0.125610,0.013634,0.103088,-0.101257,0.098145,0.136230,-0.107239,0.236938,0.328613,-1.678711,...,0.060150,-0.156250,-0.119507,0.234497,0.081360,0.246216,-0.152466,-0.342285,-0.022400,0.136841
of,-0.076965,-0.021210,0.212769,-0.722168,-0.139893,-0.122314,-0.175171,0.121399,-0.070862,-1.572266,...,-0.366699,-0.385986,0.302979,0.015747,0.340332,0.478516,0.068604,0.183472,-0.291748,-0.046539
to,-0.257568,-0.057129,-0.671875,-0.380859,-0.364258,-0.082153,-0.010956,-0.082031,0.460449,-1.847656,...,-0.012810,-0.597168,0.317383,-0.252686,0.543945,0.062988,-0.049805,-0.160400,0.046753,-0.070618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,0.392578,-0.022507,0.304688,0.187988,0.141235,0.724121,-0.257812,-0.137329,-0.016525,0.596191,...,-0.182983,0.406738,-0.343750,-0.270508,-0.593750,0.016449,0.140747,0.463867,-0.369629,-0.287109
kronik,0.136841,-0.139038,-0.360840,0.079834,0.321533,0.263916,-0.109924,0.044434,0.083862,0.791504,...,0.036407,-0.036835,-0.348145,0.064758,-0.000577,-0.133789,0.428955,-0.023315,0.410156,-0.393066
rolonda,0.075684,-0.040497,0.183472,0.512207,-0.228516,0.838867,0.178833,-0.712891,0.326904,0.695312,...,-0.388428,0.545898,-0.035065,-0.184326,-0.197021,-0.350098,0.160645,0.218384,0.309570,0.437500
zsombor,0.814453,-0.362305,0.311768,0.813965,0.188477,-0.313721,0.827637,0.296631,-0.085510,0.476074,...,0.130859,0.106140,-0.408203,0.313477,-0.430176,0.069824,-0.207642,0.075500,0.284180,-0.175537


## Import Dataset

In [4]:
df = pd.read_csv(PATH+'Sarcasm_Headlines_Detection.csv').dropna().reset_index(drop=True)
df

Unnamed: 0,headline,is_sarcastic
0,versace store clerk sue secret black code mino...,0
1,roseanne revival catch thorny political mood w...,0
2,mom start fear son web series close thing gran...,1
3,boehner want wife listen come alternative debt...,1
4,rowling wish snape happy birthday magical way,0
...,...,...
28608,tyson hold contest let fan submit new idea tor...,1
28609,increasingly cocky bernie sander announce will...,1
28610,cash strap zuckerberg force sell million faceb...,1
28611,grocery store bar actually great little happy ...,1


### Remove excessively long texts

In [5]:
tooLong = []
for i in range(len(df['headline'])):
    if len(df['headline'][i].split()) > MAXVECLEN:
        tooLong.append(i)
for i in tooLong:
    df = df.drop(i, axis=0).reset_index(drop=True)
df

Unnamed: 0,headline,is_sarcastic
0,versace store clerk sue secret black code mino...,0
1,roseanne revival catch thorny political mood w...,0
2,mom start fear son web series close thing gran...,1
3,boehner want wife listen come alternative debt...,1
4,rowling wish snape happy birthday magical way,0
...,...,...
28607,tyson hold contest let fan submit new idea tor...,1
28608,increasingly cocky bernie sander announce will...,1
28609,cash strap zuckerberg force sell million faceb...,1
28610,grocery store bar actually great little happy ...,1


In [6]:
X_, Y_ = df['headline'], df['is_sarcastic']

## Simple NN

### Convert dataset into Word Vector

In [7]:
X, Y = getVecForm(
    X = X_,
    Y = Y_,
    vec = VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='sum',
)

In [8]:
X.shape

(28612, 300)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

### Training a Simple NN

In [10]:
snn = simpleNN(X)
snn.fit(x_train, y_train, validation_split=0.3, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1c1f2c31a60>

In [11]:
snn.evaluate(x_test, y_test)



[0.5218070149421692, 0.7561861872673035]

## Simple LSTM (sum)

### Convert dataset into Word Vector

In [12]:
X, Y = getVecForm(
    X = X_,
    Y = Y_,
    vec = VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='sum',
    reshaping=(X.shape[0], 1, X.shape[1])
)

In [13]:
X.shape

(28612, 1, 300)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

### Training a Simple LSTM

In [15]:
lstms = simpleLSTM(X)
lstms.fit(x_train, y_train, validation_split=0.3, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c20c5567f0>

In [16]:
lstms.evaluate(x_test, y_test)



[0.5419329404830933, 0.7282259464263916]

## Simple LSTM (vector)

### Convert dataset into Word Vector

In [17]:
X, Y = getVecForm(
    X = X_,
    Y = Y_,
    vec = VEC,
    dims=DIMS,
    preprocess=PREPROCESS,
    vectype='vector',
    MaxvecLen=MAXVECLEN
)

In [18]:
X.shape

(28612, 30, 300)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

### Training a Simple LSTM

In [20]:
lstmv = simpleLSTM(X)
lstmv.fit(x_train, y_train, validation_split=0.3, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c1862d0820>

In [21]:
lstmv.evaluate(x_test, y_test)



[0.5272580981254578, 0.7686285376548767]