# Import all needed libraries

In [1]:
# Data handling
import numpy as np
import pandas as pd

# Text processing
import re
import string
import emoji
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In this notebook we take the cleaned-up dataset and perform all the text preprocessing it needs.

In [2]:
# Load the cleaned-up review dataset that the rest of this notebook preprocesses.
# NOTE(review): the frame displayed below carries an 'Unnamed: 0' column --
# presumably a row index written by an earlier to_csv call; confirm whether it
# is still needed downstream.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Content,Score,Sentiment
0,Plsssss stoppppp giving screen limit like when...,2,negative
1,Good,5,positive
2,👍👍,5,positive
3,Good,3,neutral
4,"App is useful to certain phone brand ,,,,it is...",1,negative


In [4]:
#df.fillna('', inplace=True) # Filling the 2 missing values

#df['at'] = pd.to_datetime(df['at']) # Making pandas understand the date value

# There shouldn't be any duplicate

Next, we handle the reviews in the "Content" column.

First, we define a function that turns emojis into their respective names. Emojis can carry great significance in a review, as they can showcase the feelings of the writer. Therefore, we decided to keep them, translated into text, instead of just removing them.

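Then, we define the text_cleaner function, which converts the text to lowercase, replaces the emojis, expands common English contractions (e.g. "don't" → "do not") and removes URLs, HTML tags, special characters and extra spaces. Digits and the punctuation marks `!`, `?`, `.` and `@` are kept, with repeated marks collapsed into a single one.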

In [5]:
# Character class covering the code-point ranges treated as emoji.
# Compiled once when the cell runs instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
    "\U0001F900-\U0001FAFF"  # supplemental symbols & pictographs, extended-A
    "]+",
    flags=re.UNICODE,
)


def emoji_replacer(text):
    """Replace every run of emoji characters in ``text`` with the emoji names.

    Each matched run is passed to ``emoji.demojize`` and the colons of the
    resulting ``:name:`` codes are turned into spaces, so the names stay
    separated from each other and from neighbouring words. Underscores
    inside a name are left untouched. Text without any matching character
    is returned unchanged.

    Parameters
    ----------
    text : str
        Text that may contain emoji characters.

    Returns
    -------
    str
        ``text`` with the emoji runs replaced by their space-delimited names.
    """
    def _run_to_names(match):
        # A distinct name for the callback avoids shadowing emoji_replacer.
        return emoji.demojize(match.group(0)).replace(':', ' ')

    return _EMOJI_PATTERN.sub(_run_to_names, text)

In [6]:
# (pattern, replacement) pairs applied in order by text_cleaner.
# The irregular negations must run before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"shan't", "shall not"),
    (r"ain't", "is not"),
    (r"n't", " not"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'m", " am"),
    (r"'re", " are"),
)


def text_cleaner(text):
    """Normalise one raw review into lowercase, lightly punctuated plain text.

    Steps, in order:

    1. lowercase the text;
    2. replace emojis with their names via ``emoji_replacer``;
    3. expand English contractions ("won't" -> "will not", "isn't" ->
       "is not", "it's" -> "it is", ...), accepting both straight and
       curly apostrophes;
    4. remove URLs and HTML tags;
    5. replace every character other than ASCII letters, digits and
       ``! ? . @`` with a space (digits are kept);
    6. collapse repeated ``!``, ``?``, ``.`` and ``@`` into a single mark;
    7. drop the stand-alone token ``unk`` (presumably an unknown-word
       placeholder); words that merely contain it, such as "junk", are kept;
    8. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and NaN are treated as an empty review;
        any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    # pandas hands missing cells over as None or a float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = emoji_replacer(text)

    # Curly apostrophes would otherwise bypass every contraction rule below.
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    for pattern, expansion in _CONTRACTION_RULES:
        text = re.sub(pattern, expansion, text)

    # Remove URLs, then HTML tags.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)

    # Keep only letters, digits and ! ? . @ ; everything else (including
    # newlines, tabs and the underscores in emoji names) becomes a space.
    text = re.sub(r'[^a-zA-Z0-9!?.@]', ' ', text)

    # Collapse a repeated punctuation mark ("!!!" -> "!").
    text = re.sub(r'([!?.@])\1+', r'\1', text)

    # Word boundaries keep words like "junk" or "chunk" intact.
    text = re.sub(r'\bunk\b', ' ', text)

    return re.sub(r'\s+', ' ', text).strip()

In [7]:
# Run every raw review through text_cleaner and keep the result in a new
# 'Content_cleaned' column next to the original text.
df['Content_cleaned'] = df['Content'].map(text_cleaner)
df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned
0,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp giving screen limit like when...
1,Good,5,positive,good
2,👍👍,5,positive,thumbs up thumbs up
3,Good,3,neutral,good
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app is useful to certain phone brand it is not...


Below we see an example before and after applying the text_cleaner function

In [8]:
# Raw text of the review labelled 4.
df.loc[4, 'Content']

'App is useful to certain phone brand ,,,,it is not excepted to all the phone ,,,,I have tried so many phone to use but no results yet to be determined by me'

In [9]:
# The same review after text_cleaner.
df.loc[4, 'Content_cleaned']

'app is useful to certain phone brand it is not excepted to all the phone i have tried so many phone to use but no results yet to be determined by me'

Now the text is ready for the next steps of preprocessing.

Next, we will remove the stop words from the text and lemmatize it.

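Stop words are words which are very common in a language, such as "the", "is" and "a". Usually they are deleted, since they increase the volume of the text without adding much information to it. We use the NLTK stopwords package in English, which contains 179 stop words. We take "up" and "down" out of that list, so that, for example, the "thumbs up" produced by the emoji translation is not reduced to "thumbs".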

Lemmatization is a text normalization process in natural language processing (NLP) that reduces words to their base or dictionary form, known as the lemma. Stemming does a similar job, but often by crudely cutting off word endings; lemmatization instead uses linguistic knowledge about a word's morphology and context, so that the base form is a valid word. Some examples are:
- "dogs" -> "dog"
- "worse" -> "bad"
- "playing" -> "play"

We use the lemmatizer of the spaCy library (with its `en_core_web_sm` English model) to achieve this.

In [10]:
# Words taken out of NLTK's stop-word list so they survive stop-word removal
# (presumably so that "thumbs up" / "thumbs down" stay distinguishable).
words_to_remove = ['up', 'down']

# Kept as a set: remove_stopwords tests membership once per token, and a set
# lookup is constant-time whereas a list would be scanned on every test.
stop_words = set(stopwords.words('english')) - set(words_to_remove)


def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``stop_words``.

    The input is converted with ``str`` and split on whitespace; surviving
    tokens are joined back with single spaces in their original order.
    """
    surviving_tokens = filter(
        lambda token: token not in stop_words,
        str(text).split(),
    )
    return " ".join(surviving_tokens)
# Strip the stop words from every cleaned review, overwriting the column.
df['Content_cleaned'] = df['Content_cleaned'].apply(remove_stopwords)
df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned
0,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp giving screen limit like ur w...
1,Good,5,positive,good
2,👍👍,5,positive,thumbs up thumbs up
3,Good,3,neutral,good
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand excepted phone ...


In [11]:
# Load spaCy's small English pipeline. This notebook only reads lemmas from
# the processed documents, so the dependency parser and the named-entity
# recogniser are switched off; the lemmatizer does not use their output and
# skipping them makes the per-review call noticeably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_text(text):
    """Return the lemmas of ``text``, joined by single spaces.

    ``text`` is processed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of every resulting token is collected in order.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Lemmatize every stop-word-free review, overwriting the column.
df["Content_cleaned"] = df["Content_cleaned"].apply(lemmatize_text)
df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned
0,Plsssss stoppppp giving screen limit like when...,2,negative,plsssss stoppppp give screen limit like ur wat...
1,Good,5,positive,good
2,👍👍,5,positive,thumb up thumb up
3,Good,3,neutral,good
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...


Below we see an example before and after removing the stop words and lemmatizing

In [12]:
# Raw text of the review labelled 4.
df.loc[4, 'Content']

'App is useful to certain phone brand ,,,,it is not excepted to all the phone ,,,,I have tried so many phone to use but no results yet to be determined by me'

In [13]:
# The same review after cleaning, stop-word removal and lemmatization.
df.loc[4, 'Content_cleaned']

'app useful certain phone brand except phone try many phone use result yet determine'

In [14]:
# Persist the preprocessed frame without writing the row index.
df.to_csv(path_or_buf='preprocessed_text.csv', index=False)

Our dataframe preprocessing is now finished, and the reviews are ready to be passed on to the next step, which is vectorization.