# Part 1

# Loading Dataset

In [1]:
# Imports
import pandas as pd
# Optional: uncomment to display every column of wide DataFrames
# pd.set_option('display.max_columns', None)

# Read the reviews CSV straight from the GitHub repository
df = pd.read_csv(
    filepath_or_buffer=r'https://github.com/AlvaroMatsuda/Sentiment_Analysis/blob/main/data/reviews.csv?raw=true'
)
df

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,
...,...,...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,1,6,
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",1,0,
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,2,10,
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,2,1,


# Tokenization

In [2]:
# Pick the text of the review stored at index label 0 as a working example
sentence = df.loc[0, 'Review']
sentence

'Great music service, the audio is high quality and the app is easy to use. Also very quick and friendly support.'

In [3]:
# Imports
from nltk.tokenize import word_tokenize

# Split the example review into word and punctuation tokens
tokens = word_tokenize(sentence, language='english')
tokens

['Great',
 'music',
 'service',
 ',',
 'the',
 'audio',
 'is',
 'high',
 'quality',
 'and',
 'the',
 'app',
 'is',
 'easy',
 'to',
 'use',
 '.',
 'Also',
 'very',
 'quick',
 'and',
 'friendly',
 'support',
 '.']

In [4]:
# Tokenize every review: each row becomes a list of tokens
review_tokens = df['Review'].map(word_tokenize)
review_tokens

0        [Great, music, service, ,, the, audio, is, hig...
1        [Please, ignore, previous, negative, rating, ....
2        [This, pop-up, ``, Get, the, best, Spotify, ex...
3        [Really, buggy, and, terrible, to, use, as, of...
4        [Dear, Spotify, why, do, I, get, songs, that, ...
                               ...                        
61589    [Even, though, it, was, communicated, that, ly...
61590    [Use, to, be, sooo, good, back, when, I, had, ...
61591    [This, app, would, be, good, if, not, for, it,...
61592    [The, app, is, good, hard, to, navigate, and, ...
61593    [Its, good, but, sometimes, it, doesnt, load, ...
Name: Review, Length: 61594, dtype: object

# Removing Punctuation

In [5]:
# Imports
import string

# Translation table mapping every punctuation character to None (i.e. delete it)
translator = str.maketrans(dict.fromkeys(string.punctuation))

# Strip punctuation from the example sentence
punct_removed = sentence.translate(translator)

# Show the cleaned sentence next to the original one
(punct_removed, sentence)

('Great music service the audio is high quality and the app is easy to use Also very quick and friendly support',
 'Great music service, the audio is high quality and the app is easy to use. Also very quick and friendly support.')

In [6]:
# The set of punctuation characters that the translator above removes
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# Removing punctuation of all rows
# Build the deletion table once instead of once per row inside a lambda,
# then apply it with the vectorized string accessor.
punct_table = str.maketrans('', '', string.punctuation)
reviews_punct_removed = df['Review'].str.translate(punct_table)
reviews_punct_removed

0        Great music service the audio is high quality ...
1        Please ignore previous negative rating This ap...
2        This popup Get the best Spotify experience on ...
3          Really buggy and terrible to use as of recently
4        Dear Spotify why do I get songs that I didnt p...
                               ...                        
61589    Even though it was communicated that lyrics fe...
61590    Use to be sooo good back when I had it and whe...
61591    This app would be good if not for it taking ov...
61592    The app is good hard to navigate and wont just...
61593    Its good but sometimes it doesnt load the musi...
Name: Review, Length: 61594, dtype: object

In [8]:
# First review after punctuation removal
reviews_punct_removed.loc[0]

'Great music service the audio is high quality and the app is easy to use Also very quick and friendly support'

# Lower Casing

In [9]:
# Convert every review to lower case
reviews_lower = df.loc[:, 'Review'].str.lower()
reviews_lower

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: Review, Length: 61594, dtype: object

In [10]:
# Original (mixed-case) reviews, shown for comparison with the lower-cased version
df.loc[:, 'Review']

0        Great music service, the audio is high quality...
1        Please ignore previous negative rating. This a...
2        This pop-up "Get the best Spotify experience o...
3          Really buggy and terrible to use as of recently
4        Dear Spotify why do I get songs that I didn't ...
                               ...                        
61589    Even though it was communicated that lyrics fe...
61590    Use to be sooo good back when I had it, and wh...
61591    This app would be good if not for it taking ov...
61592    The app is good hard to navigate and won't jus...
61593    Its good but sometimes it doesnt load the musi...
Name: Review, Length: 61594, dtype: object

# Removing Stopwords

In [11]:
# Importing
from nltk.corpus import stopwords

# Getting list of english stopwords
ENG_STOPWORDS = stopwords.words('english')

# Set copy of the list so each membership test is a hash lookup, not a list scan
stopword_set = set(ENG_STOPWORDS)

# Removing stopwords from all rows
# The comparison is case-sensitive and runs on the raw text, so capitalised
# words such as "This" or "I" are kept here; the full pipeline lower-cases first.
review_wo_stopwords = df['Review'].apply(
    lambda review: ' '.join(word for word in review.split() if word not in stopword_set)
)
review_wo_stopwords

0        Great music service, audio high quality app ea...
1        Please ignore previous negative rating. This a...
2        This pop-up "Get best Spotify experience Andro...
3                       Really buggy terrible use recently
4        Dear Spotify I get songs I put playlist??? And...
                               ...                        
61589    Even though communicated lyrics feature availa...
61590    Use sooo good back I it, I downloaded (free ve...
61591    This app would good taking device on. I start ...
61592    The app good hard navigate let play song click...
61593    Its good sometimes doesnt load music plays lik...
Name: Review, Length: 61594, dtype: object

In [12]:
# Comparing the first review with and without stopwords
(df.loc[0, 'Review'], review_wo_stopwords.loc[0])

('Great music service, the audio is high quality and the app is easy to use. Also very quick and friendly support.',
 'Great music service, audio high quality app easy use. Also quick friendly support.')

# Stemming

In [13]:
# Imports
from nltk.stem.porter import PorterStemmer

# Porter stemmer instance
stemmer = PorterStemmer()

# Inflected forms of the same word, used to illustrate stemming
list_words = ['study', 'studied', 'studying', 'studies']

# Stem of each word
list(map(stemmer.stem, list_words))

['studi', 'studi', 'studi', 'studi']

In [14]:
# Stem every whitespace-separated word of every review and re-join with spaces
review_stemmed = df['Review'].apply(lambda x: ' '.join(map(stemmer.stem, x.split())))
review_stemmed

0        great music service, the audio is high qualiti...
1        pleas ignor previou neg rating. thi app is sup...
2        thi pop-up "get the best spotifi experi on and...
3             realli buggi and terribl to use as of recent
4        dear spotifi whi do i get song that i didn't p...
                               ...                        
61589    even though it wa commun that lyric featur is ...
61590    use to be sooo good back when i had it, and wh...
61591    thi app would be good if not for it take over ...
61592    the app is good hard to navig and won't just l...
61593    it good but sometim it doesnt load the music o...
Name: Review, Length: 61594, dtype: object

# Lemmatization

In [15]:
# Imports
from nltk.stem import WordNetLemmatizer

# WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Inflected forms of the same word, used to illustrate lemmatization
list_words = ['study', 'studied', 'studying', 'studies']

# Lemma of each word
list(map(lemmatizer.lemmatize, list_words))

['study', 'studied', 'studying', 'study']

In [16]:
# Lemmatize every whitespace-separated word of every review and re-join with spaces
review_lemmed = df['Review'].apply(lambda x: ' '.join(map(lemmatizer.lemmatize, x.split())))
review_lemmed

0        Great music service, the audio is high quality...
1        Please ignore previous negative rating. This a...
2        This pop-up "Get the best Spotify experience o...
3           Really buggy and terrible to use a of recently
4        Dear Spotify why do I get song that I didn't p...
                               ...                        
61589    Even though it wa communicated that lyric feat...
61590    Use to be sooo good back when I had it, and wh...
61591    This app would be good if not for it taking ov...
61592    The app is good hard to navigate and won't jus...
61593    Its good but sometimes it doesnt load the musi...
Name: Review, Length: 61594, dtype: object

In [17]:
# Original text of the first review, for comparison with the lemmatized output
df.loc[0, 'Review']

'Great music service, the audio is high quality and the app is easy to use. Also very quick and friendly support.'

# Full Code Example

In [18]:
# Importing 
import pandas as pd
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Loading Dataset
# Read from the public repository URL (the same source used at the top of the
# notebook) instead of a machine-specific absolute path, so the example runs anywhere.
df = pd.read_csv(r'https://github.com/AlvaroMatsuda/Sentiment_Analysis/blob/main/data/reviews.csv?raw=true')

# Translation table that deletes punctuation; built once rather than once per row
punct_table = str.maketrans('', '', string.punctuation)

# Removing punctuation of all rows
df['review_pp'] = df['Review'].str.translate(punct_table)

# Lower Casing all rows
df['review_pp'] = df['review_pp'].str.lower()

# Getting list of english stopwords
ENG_STOPWORDS = stopwords.words('english')

# Set copy of the list so each membership test is a hash lookup, not a list scan
stopword_set = set(ENG_STOPWORDS)

# Removing stopwords from all rows
df['review_pp'] = df['review_pp'].apply(
    lambda review: ' '.join(word for word in review.split() if word not in stopword_set)
)

# Stemmer object
stemmer = PorterStemmer()

# Stemming all rows
df['review_pp_stem'] = df['review_pp'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x.split()))

# Lemmatizer object
lemmatizer = WordNetLemmatizer()

# Lemmatization on all rows
df['review_pp_lem'] = df['review_pp'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

# Tokenizing reviews
tokens_lem = df['review_pp_lem'].apply(word_tokenize)
tokens_stem = df['review_pp_stem'].apply(word_tokenize)


# Part 2

# Bag Of Words

In [19]:
# CountVectorizer builds the bag-of-words representation
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: the reviews preprocessed with lemmatization
corpus_lem = df.loc[:, 'review_pp_lem']

# Vectorizer with default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix
bow = vectorizer.fit_transform(raw_documents=corpus_lem)

In [20]:
# Shape of the bag-of-words sparse matrix next to the shape of the corpus
(bow.shape, corpus_lem.shape)

((61594, 28034), (61594,))

In [21]:
# Dense view of the first two reviews, one column per vocabulary term
example_bow = pd.DataFrame(
    data=bow[:2].toarray(),
    columns=vectorizer.get_feature_names_out(),
)

example_bow.iloc[:2]

Unnamed: 0,00,000,0000,0000000001,0001,0003,000348,0005,0009,001,...,𝚎𝚟𝚎𝚛,𝚑𝚊𝚟𝚎,𝚒𝚜,𝚕𝚘𝚟𝚎𝚍,𝚖𝚘𝚜𝚝,𝚖𝚞𝚜𝚒𝚌,𝚝𝚑𝚎,𝚝𝚑𝚒𝚗𝚔,𝚝𝚑𝚒𝚜,𝟐𝐱
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Getting only columns where there are words present in those reviews
# Test for a positive count rather than exactly 1, so a word that occurs more
# than once in a review is still selected.
list_columns = [label for label, content in example_bow.items() if (content > 0).any()]

# Printing the results
example_bow[list_columns]

Unnamed: 0,also,app,audio,easy,five,friendly,give,great,high,ignore,...,please,previous,quality,quick,rating,service,star,super,support,use
0,1,1,1,1,0,1,0,1,1,0,...,0,0,1,1,0,1,0,0,1,1
1,0,1,0,0,1,0,1,1,0,1,...,1,1,0,0,1,0,1,1,0,0


In [23]:
# Preprocessed text of the first two reviews
(corpus_lem.loc[0], corpus_lem.loc[1])

('great music service audio high quality app easy use also quick friendly support',
 'please ignore previous negative rating app super great give five star')

# TF-IDF

In [24]:
# TfidfVectorizer builds the TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings
tfidfvec = TfidfVectorizer()

# Fit and transform on the first 100 reviews only
tfidf = tfidfvec.fit_transform(corpus_lem.iloc[:100])

In [25]:
# Dense view of the first ten reviews, one column per vocabulary term
example_tfidf = pd.DataFrame(
    data=tfidf[:10].toarray(),
    columns=tfidfvec.get_feature_names_out(),
)

example_tfidf

Unnamed: 0,10,100,12,34ths,990,ability,able,acc,acceptable,access,...,write,xbox,yeah,year,yes,yo,youre,youth,youtube,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.339939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.437184,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175124


In [26]:
# Keep the columns that have a non-zero weight in at least one of the ten
# reviews held in example_tfidf
has_weight = (example_tfidf > 0).any(axis=0).to_numpy()
list_columns = example_tfidf.columns[has_weight].tolist()

# Show those columns for rows 0 through 2
example_tfidf.loc[0:2, list_columns]

Unnamed: 0,12,ad,add,already,also,android,annoying,app,appear,audio,...,upon,use,used,want,way,whether,wonderful,write,youre,youve
0,0.0,0.0,0.0,0.0,0.312021,0.0,0.0,0.133074,0.0,0.292147,...,0.0,0.228845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137793,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.339939,0.0,0.0,0.0,0.0,0.292066,0.292066,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Preprocessed text of the first three reviews
(corpus_lem.loc[0], corpus_lem.loc[1], corpus_lem.loc[2])

('great music service audio high quality app easy use also quick friendly support',
 'please ignore previous negative rating app super great give five star',
 'popup get best spotify experience android 12 annoying please let get rid')

# N-grams

In [28]:
# CountVectorizer also builds n-gram counts
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: the reviews preprocessed with lemmatization
corpus_lem = df.loc[:, 'review_pp_lem']

# ngram_range=(2, 2) restricts the vocabulary to bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Learn the bigram vocabulary and build the document-bigram count matrix
bongram = vectorizer.fit_transform(raw_documents=corpus_lem)

In [29]:
# Shape of the bigram sparse matrix next to the shape of the corpus
(bongram.shape, corpus_lem.shape)

((61594, 328380), (61594,))

In [30]:
# Dense view of the first two reviews, one column per bigram
example_bongram = pd.DataFrame(
    data=bongram[:2].toarray(),
    columns=vectorizer.get_feature_names_out(),
)

example_bongram.iloc[:2]

Unnamed: 0,00 anyway,000 audio,000 letting,000 showing,0000 eventhough,0000 please,0001 cent,0001 star,0003 0005,0003 per,...,𝚊𝚙𝚙 𝚑𝚊𝚟𝚎,𝚎𝚟𝚎𝚛 𝚕𝚘𝚟𝚎𝚍,𝚑𝚊𝚟𝚎 𝚎𝚟𝚎𝚛,𝚒𝚜 𝚝𝚑𝚎,𝚖𝚘𝚜𝚝 𝚊𝚖𝚊𝚣𝚒𝚗𝚐,𝚖𝚞𝚜𝚒𝚌 𝚊𝚙𝚙,𝚝𝚑𝚎 𝚖𝚘𝚜𝚝,𝚝𝚑𝚒𝚗𝚔 𝚝𝚑𝚒𝚜,𝚝𝚑𝚒𝚜 𝚒𝚜,𝟐𝐱 𝐄𝐱𝐭𝐫𝐚
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Keep the bigram columns that have a positive count in at least one row of example_bongram
has_count = (example_bongram > 0).any(axis=0).to_numpy()
list_columns = example_bongram.columns[has_count].tolist()

# Show those columns
example_bongram.loc[0:2, list_columns]

Unnamed: 0,also quick,app easy,app super,audio high,easy use,five star,friendly support,give five,great give,great music,...,music service,negative rating,please ignore,previous negative,quality app,quick friendly,rating app,service audio,super great,use also
0,1,1,0,1,1,0,1,0,0,1,...,1,0,0,0,1,1,0,1,0,1
1,0,0,1,0,0,1,0,1,1,0,...,0,1,1,1,0,0,1,0,1,0


In [32]:
# Preprocessed text of the first two reviews
(corpus_lem.loc[0], corpus_lem.loc[1])

('great music service audio high quality app easy use also quick friendly support',
 'please ignore previous negative rating app super great give five star')