# Sentiment Analysis : EDA and model testing

Our project aims at deploying a sentiment prediction API

In this notebook we will first conduct our EDA and then evaluate different approaches

In [111]:
# Common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import shutil

# NLTK imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# sklearn import 
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn import cluster
from sklearn.model_selection import train_test_split

# Libraries for NLP
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer

import gensim

# Bert
import transformers
from transformers import TFAutoModel
from transformers import AutoTokenizer

In [112]:
# Data documentation from the data source (Kaggle):
#
#   target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   ids:    the id of the tweet (2087) -- loaded as column "id" in this notebook
#   date:   the date of the tweet (Sat May 16 23:58:44 UTC 2009)
#   flag:   the query (lyx). If there is no query, then this value is NO_QUERY.
#   user:   the user that tweeted (robotickilldozr)
#   text:   the text of the tweet (Lyx is cool)
#
# Written as comments (not a bare string literal) so the cell produces no
# output and no dummy print() is needed to hide it.




In [113]:
# The CSV has no usable header row, so the column names are supplied explicitly
# and header=None keeps the first record from being consumed as a header.
columns_names = ["target", "id", "date", "flag", "user", "text"]
data = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    names=columns_names,
    header=None,
    encoding='latin-1',
)
data.shape

(1600000, 6)

In [114]:
# Summarize column dtypes, non-null counts and memory usage of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [115]:
# Preview the first rows of the raw frame
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


The dataset seems clean (no missing values). \
Moving on to cleaning the tweet text.

## Text preprocessing

In [118]:
# To understand what we are dealing with, let's look at the text of the first tweet
desc = data.text[0]
desc

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

The very first tweet is an interesting case: it is a reply that includes a link to a picture. \
Such links should be ignored when analyzing a tweet. \
NOTE TO SELF: prepare a RegEx to ignore or clean out image links

In [120]:
# Define a RegExp tokenizer that keeps only runs of word characters (`\w+`).
# Punctuation is dropped, so URLs and @mentions are split into bare pieces
# (e.g. "http", "twitpic", "com") rather than removed.
tokenizer = nltk.RegexpTokenizer(r'\w+')
list_of_words = tokenizer.tokenize(desc)
list_of_words

This will not exclude images

In [122]:
# Normalise every token of the example tweet to lower case
lowered_list = list(map(str.lower, list_of_words))
lowered_list

Do we really want to lowercase all characters? Emphasis can be conveyed by words written in all caps, so maybe the original case should be kept.

In [124]:
# stopwords exploration: size of NLTK's English stop-word list
# NOTE: this is not the last expression of the cell, so the count is computed but not displayed
len(nltk.corpus.stopwords.words('english'))

# stopwords suppression
def stop_word_filter(list_words) :
    """Drop English stop words and very short tokens from a list of words.

    Parameters
    ----------
    list_words : iterable of str
        Tokens to filter. They are compared as-is against NLTK's English
        stop-word list.

    Returns
    -------
    list of str
        The tokens, in their original order, that are not stop words and are
        longer than two characters.
    """
    # Load the stop-word list only once and keep it on the function object:
    # this function is called once per tweet, and re-reading the corpus plus
    # scanning a list for every token is wasted work. A frozenset gives O(1)
    # membership tests.
    stop_w = getattr(stop_word_filter, "_stop_w", None)
    if stop_w is None:
        stop_w = frozenset(nltk.corpus.stopwords.words('english'))
        stop_word_filter._stop_w = stop_w

    # Single pass: keep tokens that are neither stop words nor 1-2 characters long
    return [w for w in list_words if w not in stop_w and len(w) > 2]

# Apply the stop-word / short-token filter to the lower-cased example tokens
filtered_list = stop_word_filter(lowered_list)
filtered_list
#len(filtered_list)

We will use lemmatization here. \
Tweets are already quite short: stemming them would be faster, but it would cost us precious information.

In [126]:
def lemmat(list_words) :
    """Lemmatize every word of a list with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    list_words : iterable of str
        Words to lemmatize (each is passed to `lemmatize` with its default arguments).

    Returns
    -------
    list of str
        One lemma per input word, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, list_words))

# Lemmatize the filtered example tokens and check how many remain
lemmat_list = lemmat(filtered_list)
#lemmat_list
len(lemmat_list)

In [127]:
# Re-assemble the processed tokens into a single space-separated string
transf_text = ' '.join(lemmat_list)
transf_text

'switchfoot http twitpic com 2y1zl awww bummer shoulda got david carr third day'

In [130]:
# Define a treatment function that will be usable on the whole dataset

def cleaning_description_lemmat(text) :
    """Turn a raw tweet into a cleaned, space-separated string of lemmas.

    Pipeline, in order:
      1. split `text` with the notebook-level `tokenizer`,
      2. lower-case every token,
      3. filter the tokens through `stop_word_filter`,
      4. lemmatize the survivors with `lemmat`,
      5. join the result with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    lowered_tokens = [token.lower() for token in tokenizer.tokenize(text)]
    kept_lemmas = lemmat(stop_word_filter(lowered_tokens))
    return ' '.join(kept_lemmas)

In [144]:
# Sanity-check the cleaning function on the tweet at index 1599999 (the last label of the RangeIndex)
text = data["text"][1599999]
test = cleaning_description_lemmat(text)
test

'happy charitytuesday thenspcc sparkscharity speakinguph4h'

In [148]:
from joblib import Parallel, delayed

# Function to process a chunk of data
def process_chunk(chunk):
    """Return a copy of `chunk` with a `cleaned_text` column added.

    The input frame is left untouched: `assign` builds a new DataFrame, which
    avoids writing into a slice of the parent frame (SettingWithCopyWarning)
    and keeps the function free of side effects.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a `text` column holding raw tweets.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus `cleaned_text`, obtained by
        applying `cleaning_description_lemmat` to each value of `text`.
    """
    return chunk.assign(cleaned_text=chunk['text'].apply(cleaning_description_lemmat))

# Break the data into contiguous row chunks and process them in parallel.
# Chunks are sliced with .iloc instead of np.array_split(DataFrame): the latter
# goes through the deprecated DataFrame.swapaxes and emits a FutureWarning on
# recent pandas versions.
n_chunks = 10
chunk_bounds = np.linspace(0, len(data), n_chunks + 1, dtype=int)
data_chunks = [
    data.iloc[start:stop]
    for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])
]

# Use parallel processing to clean the text (n_jobs=-1: use all available cores)
results = Parallel(n_jobs=-1)(delayed(process_chunk)(chunk) for chunk in data_chunks)

# Combine the results back into a single dataframe; results come back in
# submission order, so the original row order is preserved
cleaned_data = pd.concat(results)
#took 321 seconds

  return bound(*args, **kwds)


In [None]:
# Single-process alternative, kept for reference (adds the column to `data` itself):
#data['cleaned_text'] = data['text'].apply(lambda x : cleaning_description_lemmat(x))

In [150]:
# Preview the cleaned frame, including the new cleaned_text column
cleaned_data.head()

Unnamed: 0,target,id,date,flag,user,text,cleaned_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com 2y1zl awww bummer ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


## Bag of Words

In [152]:
# How large is the bag of words?

#x = data['cleaned_text'].apply(lambda x : len(word_tokenize(x)))
#print("max length bow : ", x)

In [153]:
# Create the TF-IDF bag of words.
# fit_transform learns the vocabulary / IDF weights and builds the
# document-term matrix in one pass over the corpus, instead of tokenizing
# every tweet twice (once in fit, once in transform).
ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=1)

ctf_transform = ctf.fit_transform(cleaned_data['cleaned_text'])

# Kept so existing references still work: `fit` returns the vectorizer itself,
# so the fitted object is simply `ctf`.
ctf_fit = ctf

In [154]:
# Inspect the shape and number of stored elements of the sparse TF-IDF matrix
ctf_transform

<1600000x670610 sparse matrix of type '<class 'numpy.float64'>'
	with 10272535 stored elements in Compressed Sparse Row format>

In [None]:
# PCA to reduce dimensions.
# n_components is set explicitly: with svd_solver='arpack' and n_components
# left at None, scikit-learn requests min(n_samples, n_features) - 1
# components, which is intractable for a matrix of this size.
# 50 is only a starting point -- TODO tune using pca.explained_variance_ratio_.
# random_state makes the arpack run reproducible.
# NOTE(review): sparse input to PCA needs a recent scikit-learn release
# (arpack solver) -- confirm the installed version supports it.
pca = PCA(n_components=50, svd_solver='arpack', random_state=42)
pca.fit(ctf_transform)

In [None]:
# t-SNE projection of the TF-IDF matrix.
# random_state pins the random initialisation (init='random') so that the
# embedding, and therefore the plot built from it, is reproducible.
tsne = TSNE(init='random', random_state=42)

TSNE_ctf_transform_2 = tsne.fit_transform(ctf_transform)
TSNE_ctf_transform_2.shape

In [None]:
# Scatter plot of the t-SNE embedding, coloured by the true sentiment label
sentiment = cleaned_data['target']

# One row per tweet: the two t-SNE coordinates and the sentiment label
df_real_sentiment = pd.DataFrame({
    'tsne_1': TSNE_ctf_transform_2[:, 0],
    'tsne_2': TSNE_ctf_transform_2[:, 1],
    'sentiment': sentiment
})

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='tsne_1', y='tsne_2',
    hue='sentiment',
    # one colour per distinct sentiment value
    palette=sns.color_palette('hls', len(df_real_sentiment['sentiment'].unique())),
    # was `df_real_categories`, a name never defined in this notebook (NameError)
    data=df_real_sentiment,
    legend='full',
    alpha=0.8
)
plt.title('2D t-SNE Representation Colored by Sentiment')
plt.show()