## Access kaggle dataset through generated API key

In [6]:
# !pip install kaggle
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

## download kaggle dataset once api key is loaded in notebook

In [7]:
# !kaggle datasets download danofer/sarcasm

## unzip zip file if there is

In [8]:
# !unzip sarcasm.zip

## Import libraries

In [9]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Explore dataset

In [10]:
df = pd.read_csv('../data/reddit-sarcasm-data/train-balanced-sarcasm.csv')
df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


In [11]:
# Y = pd.read_csv('./test-balanced.csv')
# Y

## Split input/independent and output/dependent columns/features

In [12]:
X = df['parent_comment']
Y = df['label']
print(X.dtype)
print(Y.dtype)

object
int64


In [13]:
sample = X.loc[0]
sample = sample.encode('utf-8')
sample = sample.split()
sample

[b'Yeah,',
 b'I',
 b'get',
 b'that',
 b'argument.',
 b'At',
 b'this',
 b'point,',
 b"I'd",
 b'prefer',
 b'is',
 b'she',
 b'lived',
 b'in',
 b'NC',
 b'as',
 b'well.']

## Preprocess text
- remove trailing whitespaces
- remove non-alphanumeric characters
- lower sentences
- tokenize
- remove stop words
- lemmatize or stem word

<u>or use gensim.utils.simple_preprocess as callback of self.apply()</u>

In [14]:
import re

def view_sentence(phase, sentences, limit=5):
  for sentence in sentences.iloc[:limit]:
    print(f'{phase} phase:\n{sentence}\n')

def preprocess(comment__df) -> pd.DataFrame:
  # remove whitespaces
  temp = comment__df.apply(lambda sentence: sentence.strip())
  view_sentence('whitespace removal', temp)

  # match all non-alphanumeric characters 
  # not being used by words then remove 
  # e.g. you're uses ' so do not remove '
  temp = temp.apply(lambda sentence: re.sub(r'\b\w*([^\w\s]|_)\w*\b|([^\w\s]|_)', lambda match: match.group(0) if match.group(1) else '', sentence))
  view_sentence('non-alphanumeric char except in words removal', temp, limit=20)

  # turn sentences to lowercase
  temp = temp.apply(lambda sentence: sentence.lower())
  view_sentence('to lowercase',temp)

  # tokenize sentences and encode as well in unicode format
  temp = temp.apply(lambda sentence: sentence.split(' '))
  view_sentence('tokenization', temp)

  # remove stop words
  stop_words = stopwords.words('english')
  print(stop_words)
  temp = temp.apply(lambda words: [word for word in words if not word in stop_words])
  view_sentence('stop word removal', temp)

  # lemmatize or stem words/tokens in each row
  # ps = PorterStemmer()
  wordnet = WordNetLemmatizer()
  temp = temp.apply(lambda words: [wordnet.lemmatize(word) for word in words])
  view_sentence('lemmatization', temp)

  # encode to utf-8
  temp = temp.apply(lambda words: [word.encode('utf-8') for word in words])


  return temp

In [15]:
X = preprocess(X)

whitespace removal phase:
Yeah, I get that argument. At this point, I'd prefer is she lived in NC as well.

whitespace removal phase:
The blazers and Mavericks (The wests 5 and 6 seed) did not even carry a good enough record to make the playoffs in the east last year.

whitespace removal phase:
They're favored to win.

whitespace removal phase:
deadass don't kill my buzz

whitespace removal phase:
Yep can confirm I saw the tool they use for that. It was made by our boy EASports_MUT

non-alphanumeric char except in words removal phase:
Yeah I get that argument At this point I'd prefer is she lived in NC as well

non-alphanumeric char except in words removal phase:
The blazers and Mavericks The wests 5 and 6 seed did not even carry a good enough record to make the playoffs in the east last year

non-alphanumeric char except in words removal phase:
They're favored to win

non-alphanumeric char except in words removal phase:
deadass don't kill my buzz

non-alphanumeric char except in words

In [43]:
X.shape

(1010826,)

## model architecture and initialization

here the window size or the amount of words to use as context and target are indicated, as well as the min_count which indicates if a word length lower than its value is still to be considered part of the window, and workers which represent the number of threads to use in training the model

In [28]:
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)


### building the vocabulary

In [29]:
# progress_per arg is the number of words 
# to proecss before a status update is given
model.build_vocab(X, progress_per=1000)

In [30]:
# see also corpus count and epochs
print(model.corpus_count)
print(model.epochs)
# print(model.vocab)

1010826
5


### training the model

In [31]:
model.train(X, total_examples=model.corpus_count, epochs=30)

(377414408, 397167900)

In [34]:
vocab, vectors = model.wv.key_to_index, model.wv.vectors

In [35]:
vocab

{b'like': 0,
 b'': 1,
 b'people': 2,
 b'get': 3,
 b'would': 4,
 b'one': 5,
 b"i'm": 6,
 b'think': 7,
 b'game': 8,
 b'time': 9,
 b'know': 10,
 b'make': 11,
 b'really': 12,
 b'thing': 13,
 b'want': 14,
 b'even': 15,
 b'year': 16,
 b'good': 17,
 b'go': 18,
 b'say': 19,
 b'guy': 20,
 b'see': 21,
 b'could': 22,
 b'way': 23,
 b'right': 24,
 b'much': 25,
 b'need': 26,
 b'going': 27,
 b'new': 28,
 b'still': 29,
 b"that's": 30,
 b'also': 31,
 b'got': 32,
 b'well': 33,
 b'u': 34,
 b'something': 35,
 b'day': 36,
 b"can't": 37,
 b'first': 38,
 b'back': 39,
 b'look': 40,
 b'never': 41,
 b'work': 42,
 b'fuck': 43,
 b'take': 44,
 b"i've": 45,
 b'someone': 46,
 b'play': 47,
 b'actually': 48,
 b'every': 49,
 b'team': 50,
 b'mean': 51,
 b'better': 52,
 b'2': 53,
 b'lot': 54,
 b'shit': 55,
 b'use': 56,
 b'sure': 57,
 b'pretty': 58,
 b'best': 59,
 b'said': 60,
 b'feel': 61,
 b'come': 62,
 b'point': 63,
 b'anyone': 64,
 b"he's": 65,
 b'player': 66,
 b'many': 67,
 b'woman': 68,
 b'last': 69,
 b'bad': 70,
 b

In [36]:
vectors

array([[ 0.1305678 , -1.4795077 , -1.291518  , ..., -0.91057694,
        -0.4490916 , -1.4907101 ],
       [-2.1058288 ,  0.54606575,  1.6236777 , ...,  0.94598925,
        -0.35143653,  1.077399  ],
       [ 1.8619215 , -2.598588  , -3.520356  , ..., -2.2271786 ,
        -1.7678174 , -1.2654538 ],
       ...,
       [-0.14822592,  0.21522756,  0.34172982, ..., -0.20467222,
         0.22629166,  0.17647523],
       [ 0.06941286,  0.3047846 , -0.08562076, ...,  0.03844226,
        -0.30563766,  0.10357574],
       [ 0.19999166, -0.01482904,  0.2361833 , ..., -0.0603235 ,
        -0.07554152,  0.00681239]], dtype=float32)

In [37]:
len(vocab)

122279

In [38]:
vectors.shape

(122279, 100)

In [39]:
word_vec = dict(zip(vocab.keys(), vectors))

In [40]:
word_vec

{b'like': array([ 0.1305678 , -1.4795077 , -1.291518  ,  1.6021726 , -0.07052449,
        -0.6800591 ,  1.1354449 , -1.7840444 , -0.12750578, -0.89600044,
         2.8071449 ,  1.020868  , -1.3665366 , -1.5652127 ,  0.8435186 ,
        -0.94978887,  0.9320372 ,  3.8791893 ,  0.02904781, -1.5906495 ,
         0.4817242 , -0.34361956,  0.39970955,  0.9616012 ,  0.5195217 ,
        -2.385638  , -0.8076533 ,  0.36176324, -0.8143242 , -0.4752503 ,
        -1.6284542 ,  1.2153149 ,  1.0676671 ,  0.51977897,  3.2445948 ,
        -0.19675052,  3.5195057 , -1.6882236 ,  3.9437032 , -0.21116677,
        -1.9568038 ,  0.6234452 , -0.41429305, -0.65101177, -2.684504  ,
         0.09009135, -0.8219574 ,  1.873883  , -0.91190696,  0.5239097 ,
        -1.0164098 ,  0.01596094,  2.5076952 , -2.1543229 ,  3.5054915 ,
         0.7367361 ,  2.374128  ,  0.70493937, -1.0386759 ,  2.8043497 ,
        -0.8172049 , -0.64914566,  0.7478308 , -0.6790006 ,  2.0965352 ,
         1.5845705 , -2.2523556 , -1.96718

In [41]:
len(word_vec)

122279

In [44]:
X.dtype

dtype('O')

In [45]:
X.to_csv('../data/reddit-sarcasm-data-cleaned.csv')