# Import all needed libraries

In [1]:
# Data handling
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Text processing
import re
import string
import emoji
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Overview of the dataset

In [24]:
df = pd.read_csv("netflix_reviews.csv")

In [25]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,NR Bharadwaj,Plsssss stoppppp giving screen limit like when...,2,0,8.120.0 build 10 50712,2024-07-02 17:17:53,8.120.0 build 10 50712
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Maxwell Ntloko,Good,5,1,,2024-06-26 15:38:06,
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,Dilhani Mahanama,👍👍,5,0,,2024-06-24 15:29:54,
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Karen Gulli,Good,3,0,,2024-06-22 15:41:54,
4,8be10073-2368-4677-b828-9ff5d06ea0b7,Ronny Magadi,"App is useful to certain phone brand ,,,,it is...",1,0,8.105.0 build 15 50626,2024-06-22 05:16:03,8.105.0 build 15 50626


In [26]:
df.describe()

Unnamed: 0,score,thumbsUpCount
count,113610.0,113610.0
mean,2.812613,10.434724
std,1.700543,101.013411
min,1.0,0.0
25%,1.0,0.0
50%,3.0,0.0
75%,5.0,1.0
max,5.0,8032.0


In [27]:
print("Shape of the dataset:", df.shape)

Shape of the dataset: (113610, 8)


In [28]:
print("Columns in the dataset:", df.columns)

Columns in the dataset: Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount',
       'reviewCreatedVersion', 'at', 'appVersion'],
      dtype='object')


# Preprocessing

From these columns only the content and the score will be of use in the end so we drop the rest.

In [29]:
df = df[['reviewId','content','score']]
df.head()

Unnamed: 0,reviewId,content,score
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1


With the following function we can check the number and the percentage of the missing and duplicated values in comparison with the whole dataset.

In [30]:
def show_details(dataset):
    missed_values = dataset.isnull().sum()
    missed_values_percent = (dataset.isnull().sum()) / len(dataset)
    duplicated_values = dataset.duplicated().sum()
    duplicated_values_percent = (dataset.duplicated().sum()) / len(dataset)
    info_frame = pd.DataFrame({'Missing_Values' : missed_values , 
                              'Missing_Values %' :missed_values_percent,
                              'Duplicated values' :duplicated_values,
                              'Duplicated values %':duplicated_values_percent})
    return info_frame.T

In [31]:
show_details(df)

Unnamed: 0,reviewId,content,score
Missing_Values,0.0,2.0,0.0
Missing_Values %,0.0,1.8e-05,0.0
Duplicated values,316.0,316.0,316.0
Duplicated values %,0.002781,0.002781,0.002781


We notice that there are no missing values, while the duplicated values are a tiny percentage, so we decide to drop them.

In [32]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
show_details(df)

Unnamed: 0,reviewId,content,score
Missing_Values,0.0,0.0,0.0
Missing_Values %,0.0,0.0,0.0
Duplicated values,0.0,0.0,0.0
Duplicated values %,0.0,0.0,0.0


We need to translate the scores/ratings into an output value which will serve as the class in our classification problem. We decide to do so by dividing the scores into 3 ranges:
- negative if score = [1,2]
- neutral if score = 3
- positive if score = [4,5]

Later on we will use a LabelEncoder before passing them as labels into our models.

In [33]:
df['sentiment_label'] = df['score'].apply(lambda x: 'positive' if x > 3 else ('negative' if x < 3 else 'neutral'))
df.head()

Unnamed: 0,reviewId,content,score,sentiment_label
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2,negative
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5,positive
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5,positive
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3,neutral
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1,negative


Then we will start handling the reviews in the "content" column.

First, we define a function that turns emojis into their respective name. Emojis can have great significance in a review, as they can showcase the feelings of the writer. Therefore, we decided to keep them translated into text instead of just removing them.

Then, we define the text_cleaner function, which turns the characters into lowercase, replaces the emojis, fixes basic grammatical errors and removes urls, html tags, numbers, punctuations and extra spaces. 

In [34]:
def emoji_replacer(text):
    # Function to replace emoji with its name
    def emoji_replacer(match):
        emoji_char = match.group(0)
        emoji_name = emoji.demojize(emoji_char)
        # Removing colons from the emoji name
        return emoji_name.replace(':', ' ')

    # Regular expression pattern to match emojis
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)
    
    # Replace emojis with their respective names
    return emoji_pattern.sub(emoji_replacer, text)

In [35]:
def text_cleaner(text):
    
    # Convert text to lowercase
    text = text.lower()
    
    # Replace emojis
    text = emoji_replacer(text)
    
    # Fix possible grammatical problems
    text = re.sub(r"won\'t", "would not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"don\'t", "do not", text)
    text = re.sub(r"shouldn\'t", "should not", text)
    text = re.sub(r"needn\'t", "need not", text)
    text = re.sub(r"hasn\'t", "has not", text)
    text = re.sub(r"haven\'t", "have not", text)
    text = re.sub(r"weren\'t", "were not", text)
    text = re.sub(r"mightn\'t", "might not", text)
    text = re.sub(r"didn\'t", "did not", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)
    # Remove URLs
    text = re.sub('https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove numbers and punctuations
    text = re.sub(r'[^a-zA-Z0-9\!\?\.\@]',' ' , text)
    text = re.sub(r'[!]+' , '!' , text)
    text = re.sub(r'[?]+' , '?' , text)
    text = re.sub(r'[.]+' , '.' , text)
    text = re.sub(r'[@]+' , '@' , text)
    text = re.sub(r'unk' , ' ' , text)
    
    # Replace newline and tabs with a space
    text = re.sub(r'[\n\t]+', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text            

In [36]:
# Cleaning up the text. Applying it to the dataset. Creating a new column with the cleaned up review.

df['content_cleaned'] = df['content'].apply(text_cleaner)
df.head()

Unnamed: 0,reviewId,content,score,sentiment_label,content_cleaned
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp giving screen limit like when...
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5,positive,good
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5,positive,thumbs up thumbs up
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3,neutral,good
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...


Below we see an example before and after applying the text_cleaner function

In [37]:
df['content'][4]

'App is useful to certain phone brand ,,,,it is not excepted to all the phone ,,,,I have tried so many phone to use but no results yet to be determined by me'

In [38]:
df['content_cleaned'][4]

'app is useful to certain phone brand it is not excepted to all the phone i have tried so many phone to use but no results yet to be determined by me'

Now the text is ready for the next steps of preprocessing.

Next, we will remove the stop words from the text and lemmatize it.

Stop words are words which are very common in a language, such as "the", "is" and "a". Usually they are deleted, since they increase the volume of the text without adding any value to it. We use the NLTK stopwords package in English, which contains 179 stop words.

Lemmatization is a text normalization process in natural language processing (NLP) that reduces words to their base or root form, known as the lemma. Unlike stemming, which does a similar job by often crudely cutting off word endings to achieve this, lemmatization uses linguistic knowledge about a word's morphology and context to ensure that the base form is a valid word. Some examples are:
- "dogs" -> "dog"
- "worse" -> "bad"
- "playing" -> "play"

We use the spacy library lemmatization function to achieve this.

In [39]:
stop_words = set(stopwords.words('english'))

# Remove specific words from the stopwords list
words_to_remove = ['up', 'down']  # Example words to remove
stop_words = [word for word in stop_words if word not in words_to_remove]


def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
df['content_cleaned'] = df['content_cleaned'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,reviewId,content,score,sentiment_label,content_cleaned
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp giving screen limit like ur w...
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5,positive,good
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5,positive,thumbs up thumbs up
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3,neutral,good
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand excepted phone ...


In [40]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df["content_cleaned"] = df["content_cleaned"].apply(lambda text: lemmatize_text(text))
df.head()

Unnamed: 0,reviewId,content,score,sentiment_label,content_cleaned
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp give screen limit like ur wat...
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5,positive,good
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5,positive,thumb up thumb up
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3,neutral,good
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...


Below we see an example before and after removing the stop words and lemmatizing

In [41]:
df['content'][4]

'App is useful to certain phone brand ,,,,it is not excepted to all the phone ,,,,I have tried so many phone to use but no results yet to be determined by me'

In [42]:
df['content_cleaned'][4]

'app useful certain phone brand except phone try many phone use result yet determine'

Now our dataframe preprocessing is finished and the reviews are ready to be passed on to the next step which is vectorising

# Vectorization