# DATA EXPLORATION AND WORD VECTORIZATION

In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
# Importing Basic Packages
import numpy as np
import pandas as pd

# Language Processing
import word2vec as w2v
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import nltk
import string
import text_unidecode as unidecode
from nltk.stem import WordNetLemmatizer 

# Environment Variables
import json

# LOADING DATA

In [8]:
df = pd.read_json('../raw_data/sarcasm_headlines_v2.json', lines=True)

## BASIC CLEANING

In [9]:
df.drop(columns='article_link', inplace=True)

In [10]:
df.isna().sum()

is_sarcastic    0
headline        0
dtype: int64

In [11]:
sentence = df.headline[67]

In [12]:
# Downloading nltk content
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/louis_gokelaere/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/louis_gokelaere/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/louis_gokelaere/nltk_data...


True

In [13]:
def clean_text(sentence):
    # Return a sentenced cleaned and ready to be vectorized

    # Lowercase the sentence
    sentence = sentence.lower()

    # Removes digit
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Removes punctuation and symbols
    for punct in string.punctuation:
        sentence = sentence.replace(punct, '')

    # Strip white spaces at the beginning and end of sentence
    sentence = sentence.strip()
    
    # Tokenizing
    sentence = word_tokenize(sentence)
    
    # Removing stopwords - probably not doing it for sentiment analysis
    # stop_words = set(stopwords.words('english'))
    # sentence = [word for word in sentence if not word in stop_words]

    # Lemmatize verbs and nouns
    lemmatizer = WordNetLemmatizer()
    sentence = [lemmatizer.lemmatize(word, pos='v') for word in sentence]
    sentence = [lemmatizer.lemmatize(word, pos='n') for word in sentence]

    # Returning sentence as a string
    cleaned_sentence = ' '.join(sentence)
    return cleaned_sentence

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
df['cleaned_headline'] = df['headline'].apply(clean_text)

In [16]:
df.head()

Unnamed: 0,is_sarcastic,headline,cleaned_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientist unveil doomsday cloc...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nail why congress be fall shor...
2,0,eat your veggies: 9 deliciously different recipes,eat your veggie deliciously different recipe
3,1,inclement weather prevents liar from getting t...,inclement weather prevent liar from get to work
4,1,mother comes pretty close to using word 'strea...,mother come pretty close to use word stream co...


In [58]:
tfid_vectorizer = TfidfVectorizer(max_df = 0.75, max_features = 7000, ngram_range=(1,2))

In [59]:
weighted_words = pd.DataFrame(tfid_vectorizer.fit_transform(df['cleaned_headline']).toarray(), 
                              columns=tfid_vectorizer.get_feature_names_out())

In [60]:
weighted_words.shape

(28619, 7000)

In [61]:
X = weighted_words
y = df['is_sarcastic']

In [62]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.3)

In [64]:
log_reg = LogisticRegression(n_jobs=-1)

cv_result = cross_validate(log_reg, X, y, cv=10, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.8s finished


In [65]:
cv_result

{'fit_time': array([14.4591229 , 21.1808455 , 13.39034772, 18.81727743, 13.49623251,
        13.7804172 , 13.58053923, 14.23547029, 11.60020208, 10.99576235]),
 'score_time': array([0.13377023, 0.14270449, 0.13839531, 0.13934922, 0.13610959,
        0.13668537, 0.14140248, 0.13696408, 0.05854416, 0.05653048]),
 'test_score': array([0.83403215, 0.83647799, 0.83542977, 0.83822502, 0.84067086,
        0.84626136, 0.85255066, 0.84661076, 0.83892383, 0.84061517])}

In [66]:
cv_result['test_score'].mean()

np.float64(0.8409797559458253)