# DATA EXPLORATION AND WORD VECTORIZATION

In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Importing Basic Packages
import numpy as np
import pandas as pd

# Language Processing
import word2vec as w2v
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import nltk
import string
import text_unidecode as unidecode
from nltk.stem import WordNetLemmatizer 

# Environment Variables
from dotenv import load_dotenv
import os
import json
load_dotenv()

True

# LOADING DATA

In [3]:
# Dataset location, taken from the 'DATA' environment variable.
# NOTE(review): `data` and `data_v2` both resolve the same variable — confirm
# whether `data_v2` was meant to read a different one.
data = data_v2 = os.getenv('DATA')

In [4]:
# lines=True: the input is read as one JSON object per line.
df = pd.read_json(path_or_buf=data_v2, lines=True)

## BASIC CLEANING

In [5]:
# Remove the article link column.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column has already been dropped.
df = df.drop(columns='article_link', errors='ignore')

In [6]:
# Count the missing values in each remaining column.
df.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

In [7]:
# Pull out a single headline (row label 67) as a sample string —
# presumably for trying the cleaning steps by hand.
sentence = df['headline'][67]

In [8]:
# Downloading the nltk resources needed by the text-cleaning step.
# `nltk` is already imported in the imports cell; the repeated import is harmless.
# NOTE(review): some newer NLTK releases load the tokenizer from 'punkt_tab'
# rather than 'punkt' — confirm against the installed version.
import nltk
nltk.download('punkt')      # tokenizer data for word_tokenize
nltk.download('stopwords')  # only needed by the commented-out stop-word filter in clean_text
nltk.download('wordnet')    # lexical database for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /home/louis_gokelaere/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/louis_gokelaere/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/louis_gokelaere/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
def clean_text(sentence):
    """Normalise a raw sentence into a cleaned string ready to be vectorized.

    Steps, in order: lowercase, remove digit characters, remove every
    character in ``string.punctuation``, trim surrounding whitespace,
    tokenize with ``word_tokenize``, then lemmatize each token first as a
    verb and then as a noun. Stop words are kept: removing them was
    considered but left out for sentiment analysis.

    Args:
        sentence: The text to clean (a ``str``).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Case-fold first, then keep only the non-digit characters.
    text = ''.join(ch for ch in sentence.lower() if not ch.isdigit())

    # Delete all punctuation and symbol characters in one pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Trim the edges and split into word tokens.
    tokens = word_tokenize(text.strip())

    # Verb lemma first, noun lemma second, applied token by token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos='v'), pos='n')
        for token in tokens
    ]

    # Hand the result back as a single string.
    return ' '.join(lemmas)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Run every headline through clean_text and store the result in a new column.
df['cleaned_headline'] = df['headline'].map(clean_text)

In [42]:
# Preview the first rows to compare raw and cleaned headlines side by side.
df.head()

Unnamed: 0,headline,is_sarcastic,cleaned_headline
0,former versace store clerk sues over secret 'b...,0,former versace store clerk sue over secret bla...
1,the 'roseanne' revival catches up to our thorn...,0,the roseanne revival catch up to our thorny po...
2,mom starting to fear son's web series closest ...,1,mom start to fear son web series closest thing...
3,"boehner just wants wife to listen, not come up...",1,boehner just want wife to listen not come up w...
4,j.k. rowling wishes snape happy birthday in th...,0,jk rowling wish snape happy birthday in the mo...


In [None]:
# TF-IDF settings: ignore terms that occur in more than 75% of the documents,
# cap the vocabulary at 5000 features, and extract unigrams and bigrams.
tfid_vectorizer = TfidfVectorizer(
    max_df=0.75,
    max_features=5000,
    ngram_range=(1, 2),
)

In [None]:
# Fit the vectorizer on the cleaned headlines and expose the weights as a
# DataFrame with one column per extracted term.
# NOTE(review): .toarray() densifies the sparse result; at the displayed shape
# of (26709, 5000) this is a large in-memory array — confirm it fits comfortably.
weighted_words = pd.DataFrame(
    data=tfid_vectorizer.fit_transform(df['cleaned_headline']).toarray(),
    columns=tfid_vectorizer.get_feature_names_out(),
)

In [None]:
# Check the matrix dimensions: (number of headlines, number of TF-IDF features).
weighted_words.shape

(26709, 5000)