# NLP Project Steps

In [None]:
import pandas as pd
from textblob.en import positive

## 1. Data Collection

The data has already been downloaded from Kaggle and stored in the `DataSets` folder.

In [None]:
df = pd.read_csv('../DataSets/IMDB Dataset 50k review.csv')

In [None]:
df.head()

In [None]:
df.info()
# no missing values are present

In [None]:
df.duplicated().sum()
# so we have 418 duplicate rows

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
# df = df.iloc[:1000]     # taking only 1000 rows for faster processing .

## 2. Data Cleaning

* We will perform the following steps to clean the data:
    - Lower Case
    - Remove trailing and leading spaces
    - Remove HTML tags
    - Remove URLs
    - Expanding Abbreviations
    - Spelling Correction
    - Remove Punctuation

In [None]:
# Lower Case
df['review'] = df['review'].str.lower()
df.head()

In [None]:
# Remove trailing and leading spaces
df['review'] = df['review'].str.strip()

In [None]:
# Remove HTML tags

# method 1
# import re                               # regular expression library
# def remove_html_tags(data):
#     data = re.sub(r'<.*?>','', data)    #
#     return data

# remove_html_tags('<p>This is a <b>bold</b> paragraph.</p>')

# df['review'] = df['review'].apply(remove_html_tags)
# df.head()

# method 2 ( Better method )
# Replace each HTML tag with a single space instead of deleting it.
# Tags such as <br /> often sit directly between two words ("end.<br /><br />start");
# deleting them would glue those words together once punctuation is removed later.
# regex=True is required because the pattern is a regular expression, not a literal string.
df['review'] = df['review'].str.replace(r'<.*?>', ' ', regex=True)
df.head()

In [None]:
# finding reviews with one or more url in it
df[df['review'].str.contains(r"https?://\S+|www\.\S+")].iloc[1].values

In [None]:
# Remove URLs
# http://google.in      www.google.com      www.india.gov.in
# so we have multiple types of urls

# Method 1
import re

def remove_url(data):
    """Return ``data`` with every URL removed.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", data)

# remove_url("Check out this link: https://example.com and also visit www.example.org for more info and http://test.com")

# df['review'] = df['review'].apply(remove_url)

# Method 2 ( Better method )
df['review'] = df['review'].str.replace(r"https?://\S+|www\.\S+", "", regex=True)
df.head()

In [None]:
# expanding abbreviations
# he's  -> he is and so on
# we will use regex for this task
# we will create a function to do this task and then apply it to the dataframe

import re

# Apostrophe variants accepted inside a contraction: straight ('), curly (’),
# and the mis-encoded sequences found in scraped text (e.g. "\x89Ûª", "â'").
_APOSTROPHE = r"(?:'|’|\x89Û[ªÏ÷a]|â')"
_APOSTROPHE_RE = re.compile(_APOSTROPHE, flags=re.IGNORECASE)

# Lower-case contraction -> lower-case expansion.
# "'s" is always expanded to "is" and "'d" to "would".
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'll": "he will",
    "he's": "he is",
    "here's": "here is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "y'all": "you all",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}


def _contraction_to_pattern(contraction):
    """Turn a table key into a regex fragment that accepts any apostrophe variant."""
    head, tail = contraction.split("'")
    return head + _APOSTROPHE + tail


# One alternation for the whole table, longest keys first, anchored on word
# boundaries so a contraction is never matched inside a longer word.
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(
        _contraction_to_pattern(key)
        for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r")\b",
    flags=re.IGNORECASE,
)


def _expand_contraction(match):
    """Return the expansion for one regex match, keeping the match's capitalisation."""
    found = match.group(0)
    key = _APOSTROPHE_RE.sub("'", found).lower()
    expansion = _CONTRACTIONS.get(key)
    if expansion is None:
        # Case-insensitive matching can accept characters that do not
        # lower-case to a table key; leave such text untouched.
        return found
    if found[0].isupper():
        expansion = expansion[0].upper() + expansion[1:]
    return expansion


def remove_abb(data):
    """Expand English contractions in ``data`` ("he's" -> "he is", "can't" -> "cannot").

    Matching is case-insensitive and accepts straight, curly and mis-encoded
    apostrophes. The expansion follows the case of the matched text:
    lower-case input stays lower-case ("i'm" -> "i am"), so text that was
    lower-cased earlier in the pipeline is not given capital letters again,
    and a capitalised contraction yields a capitalised expansion
    ("Don't" -> "Do not").

    Parameters
    ----------
    data : str
        Text to process.

    Returns
    -------
    str
        ``data`` with every known contraction expanded; all other text is unchanged.
    """
    return _CONTRACTION_RE.sub(_expand_contraction, data)

In [None]:
remove_abb("He's going to the park. There's a dog. I'm happy. Don't worry!")

In [None]:
df['review'] = df['review'].apply(remove_abb)
df.head()

In [None]:
# Spelling Correction
# we will use textblob library for this task
# !pip install textblob
from textblob import TextBlob

# text = "hi i can drve at nigt with no ligts"
# TextBlob(text).correct().string

def correct_spell(data):
    """Return ``data`` after running it through TextBlob's ``correct()``."""
    blob = TextBlob(data)
    corrected_blob = blob.correct()
    return corrected_blob.string

In [None]:
# df['review'] = df['review'].apply(correct_spell)    # this will take some time to execute

In [None]:
# Remove Punctuation
import string
string.punctuation

In [None]:
def remove_punctuation(data):
    """Return ``data`` with every character listed in ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return data.translate(deletion_table)

In [None]:
# remove_punctuation("Hello, world! is this a test?")

In [None]:
df['review'] = df['review'].apply(remove_punctuation)
df.head()

## 3. Text Preprocessing

- we will perform the following steps to preprocess the text:
    - Tokenization: the process of breaking a text down into smaller units called tokens. Tokens can be words, phrases, or sentences.
    - Stop Words Removal: stop words are the most common words in a language and add little meaning to a sentence, for example: is, am, are, the, a, an, in, on, at, for, to, and, but, or, etc.
    - Stemming: reduces words to their root form; the root form may not be a valid word.
    - Lemmatization: reduces words to their base form; the base form is always a valid word.

In [None]:
# Tokenization
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import word_tokenize
df['Tokenized_review'] = df['review'].apply(word_tokenize)
df.head()

In [None]:
# Stop Words Removal
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
len(stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stop words, in their original order.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The tokens that do not appear in the module-level ``stop_words``.

    Notes
    -----
    ``stop_words`` is a list, so testing each token against it directly costs
    O(k) per token (k = number of stop words). Converting it to a set once per
    call makes each membership test O(1), so a review of m tokens costs
    O(k + m) instead of O(k * m).
    """
    stop_word_set = set(stop_words)
    return [word for word in text if word not in stop_word_set]


In [None]:
# remove_stopwords(['i', 'thought', 'this', 'was', 'a' , 'wonderful', 'way'])

In [None]:
df['Tokenized_review'] = df['Tokenized_review'].apply(remove_stopwords)      # removing stop words from the tokenized review .
df.head()

## 4.  EDA ( Exploratory Data Analysis ) and Feature Engineering

- we will perform the following steps to analyze the data:
    - distribution of text length / word count
    - common unigrams/bigrams/trigrams
    - wordcloud

In [None]:
# Distribution of text length / word count
# we will create two new columns in the dataframe one for character length and other for word length to analyze the distribution of text length / word count .

In [None]:
df['review'] = df['Tokenized_review'].apply(lambda x : " ".join(x))
df.head()

In [None]:
df['char_len'] = df['review'].str.len()

In [None]:
df

In [None]:
df['word_len'] = df['Tokenized_review'].apply(len)

In [None]:
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.displot(df['char_len'])

In [None]:
df[ df['sentiment'] == 'positive' ]

In [None]:
sns.displot(df[ df['sentiment'] == 'positive' ]['char_len'])
sns.displot(df[ df['sentiment'] == 'negative' ]['char_len'])

In [None]:
# sns.displot creates a new figure for each call, so each sentiment is plotted separately.
# To show both distributions on a single plot, use sns.histplot and plot both series together, specifying the hue parameter.
import seaborn as sns

sns.histplot(data=df, x='char_len', hue='sentiment', kde=True)

Comparing the two distributions, positive reviews appear to be slightly longer than negative reviews, but the curves overlap so much that the difference is hard to see. Character length is therefore not a good feature for classifying the reviews.

In [None]:
sns.histplot(data=df, x='word_len', hue='sentiment', kde=True)

Word count, too, is not a good feature for classifying the reviews.

In [None]:
# Common unigrams/bigrams/trigrams and so on .
    # unigrams : single words in the text
    # bigrams : two consecutive words in the text
    # trigrams : three consecutive words in the text

In [None]:
df

Computing n-grams over all of the roughly 50k reviews takes too long, so we keep only the first 10,000 reviews for faster processing.

In [None]:
# Keep the full dataframe in `temp` and continue with only the first SAMPLE_SIZE reviews,
# so the full data can be restored without running the whole notebook again.
SAMPLE_SIZE = 10000

# Take the snapshot only while `df` is still the full frame. If this cell is run a second
# time, `df` is already the sample, and copying it would overwrite the full data in `temp`.
if 'temp' not in globals() or len(df) >= len(temp):
    temp = df.copy()
df = df.iloc[:SAMPLE_SIZE]

In [None]:
from itertools import chain
from nltk import ngrams

# Build the trigrams review by review and chain them together.
# - df['Tokenized_review'].sum() would also give one list of all tokens, but it concatenates
#   the lists one by one (quadratic time), which is what makes this step so slow.
# - Working per review also avoids trigrams that span the end of one review and the
#   start of the next.
all_trigrams = chain.from_iterable(ngrams(tokens, 3) for tokens in df['Tokenized_review'])
pd.Series(list(all_trigrams)).value_counts().head(10)

In [None]:
from itertools import chain


def top_trigrams(token_lists, top_n=10):
    """Return the ``top_n`` most frequent trigrams as a Series (trigram tuple -> count).

    Trigrams are built separately for each token list, so none of them spans two
    reviews, and the lists are chained lazily instead of being concatenated.
    """
    trigrams = chain.from_iterable(ngrams(tokens, 3) for tokens in token_lists)
    return pd.Series(list(trigrams)).value_counts().head(top_n)


# analyzing positive reviews in trigrams
positive_3grams = top_trigrams(df.loc[df['sentiment'] == 'positive', 'Tokenized_review'])
# analyzing negative reviews in trigrams
negative_3grams = top_trigrams(df.loc[df['sentiment'] == 'negative', 'Tokenized_review'])

# Show the most frequent trigrams of each sentiment side by side.
# A bar chart is used because a histogram of these Series would bin the count values
# themselves rather than show which trigram occurs how often.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
panels = [
    (axes[0], positive_3grams, 'blue', 'Positive Reviews'),
    (axes[1], negative_3grams, 'red', 'Negative Reviews'),
]
for ax, counts, color, title in panels:
    labels = [' '.join(gram) for gram in counts.index]
    # Reverse the order so the most frequent trigram is drawn at the top.
    ax.barh(labels[::-1], counts.values[::-1], color=color)
    ax.set(title=title, xlabel='count', ylabel='trigram')
fig.tight_layout()

In [None]:
# Wordcloud
# !pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

plt.figure(figsize = (20,20)) # Positive Review Text
wc = WordCloud(width = 1600, height = 800).generate(" ".join(df[df['sentiment'] == 'positive']['review']))
plt.imshow(wc)

In [None]:
plt.figure(figsize = (20,20)) # Negative Review Text
wc = WordCloud(width = 1600, height = 800).generate(" ".join(df[df['sentiment'] == 'negative']['review']))
plt.imshow(wc)

## 5. Vectorization
- we will perform the following steps to vectorize the text:
    - Bag of words: a simple and commonly used method for text vectorization. It represents a text as a bag (multiset) of its words, disregarding grammar and word order but keeping multiplicity.

In [None]:
df

In [None]:
!pip install scikit-learn

In [None]:
# BoW (bag of words)
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(3,3) builds the vocabulary from trigrams only; use (1,1) for unigrams, (2,2) for bigrams, and so on.
# max_features=5000 keeps only the 5000 most frequent n-grams (trigrams here) as features.
count_vectorizer = CountVectorizer(max_features=5000 , ngram_range=(3,3))
# ngram_range=(1,3) would consider unigrams, bigrams and trigrams together, at the cost of a much larger candidate vocabulary.
# Learn the vocabulary from the cleaned reviews and count each feature per review (returned as a sparse matrix).
bag_of_words = count_vectorizer.fit_transform(df['review'])
# Convert to a dense DataFrame with one column per n-gram.
bag_of_words = pd.DataFrame(bag_of_words.toarray(), columns=count_vectorizer.get_feature_names_out())

In [None]:
bag_of_words

In [None]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(bag_of_words.values)

In [None]:
pca_result.shape

In [None]:
sns.scatterplot(x=pca_result[:,0], y=pca_result[:,1], hue=df['sentiment'])