In [1]:
#import libraries
import pandas as pd
import numpy as np
import re 
import string 
import nltk

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import textblob
from textblob import TextBlob

In [2]:
#load the review data from the "John_Wick_4" CSV file into a DataFrame
John_wick_data = pd.read_csv("John_Wick_4")
#preview the first two rows
John_wick_data.head(2)

Unnamed: 0.1,Unnamed: 0,title,review,rating,date
0,0,A new standard has been set for fight scenes,Half of this review will be me gushing about t...,9/10,24 March 2023
1,1,Yeah,By now you know what to expect from a John Wic...,9/10,23 March 2023


## Data Overview

In [3]:
#shape of the data as (number of rows, number of columns)
John_wick_data.shape

(866, 5)

In [4]:
#checking the data makeup: column names, non-null counts and dtypes
John_wick_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  866 non-null    int64 
 1   title       866 non-null    object
 2   review      866 non-null    object
 3   rating      866 non-null    object
 4   date        866 non-null    object
dtypes: int64(1), object(4)
memory usage: 34.0+ KB


In [5]:
#checking for missing values: number of nulls in each column
John_wick_data.isnull().sum()

Unnamed: 0    0
title         0
review        0
rating        0
date          0
dtype: int64

In [6]:
#checking for duplicates: number of rows that exactly repeat an earlier row
John_wick_data.duplicated().sum()

0

In [7]:
#checking the number of distinct values in each column
John_wick_data.nunique()

Unnamed: 0    866
title         859
review        866
rating         10
date           42
dtype: int64

## Data Cleaning

In [8]:
#drop the unnamed column (it appears to be a saved row index - see the preview above)
#reassigning instead of inplace=True, together with errors="ignore", keeps this cell
#safe to re-run: a second run no longer raises KeyError once the column is gone
John_wick_data = John_wick_data.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
#remove the '/10' in the rating column so only the score itself is left (e.g. '9/10' -> '9')
John_wick_data['rating'] = John_wick_data['rating'].str.replace("/10", "")

In [10]:
#convert the rating column to a numeric dtype and the date column to a datetime dtype
John_wick_data['rating'] = pd.to_numeric(John_wick_data['rating'])

John_wick_data['date'] = pd.to_datetime(John_wick_data['date'])

In [11]:
#Convert the review text to lowercase so later text matching is case-insensitive
John_wick_data['review'] = John_wick_data['review'].str.lower()

## Data Preprocessing

In [33]:
#fetch the tokenizer data needed by word_tokenize; recent NLTK releases look up
#'punkt_tab' instead of 'punkt', so request both to keep a fresh-kernel run working
#NOTE(review): on older NLTK the 'punkt_tab' request should just report an unknown
#package without raising - confirm against the installed version
nltk.download('punkt')
nltk.download('punkt_tab')

#remove tags
def remove_tags(review):
    """Return ``review`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is cut individually and the text
    between two tags is kept.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', review)
#run the tag stripper over every review
John_wick_data['review'] = John_wick_data['review'].map(remove_tags)


#remove @mentions, #hashtags and digits
def remove_characters(review):
    """Return ``review`` without ``@word`` mentions, ``#word`` hashtags and digit runs."""
    noise_pattern = r'\@\w+|\#\w+|\d+'
    return re.sub(noise_pattern, '', review)
#strip mentions, hashtags and digits from every review
John_wick_data['review'] = John_wick_data['review'].map(remove_characters)


#remove punctuations
def remove_punctuation(review):
    """Return ``review`` with every character of ``string.punctuation`` deleted.

    Characters are deleted, not replaced by a space, so words joined only by
    punctuation end up fused (e.g. 'action-packed' becomes 'actionpacked').
    """
    # mapping a code point to None tells str.translate to drop that character
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return review.translate(drop_table)
#delete punctuation from every review
John_wick_data['review'] = John_wick_data['review'].map(remove_punctuation)


#remove URL
def remove_url(review):
    """Return ``review`` without URL-like runs.

    A run is 'http', 'www' or 'https' followed by at least one non-space
    character; it is removed up to the next whitespace.
    """
    url_pattern = re.compile(r"http\S+|www\S+|https\S+", flags = re.MULTILINE)
    return url_pattern.sub('', review)
#drop URL-like text from every review
John_wick_data['review'] = John_wick_data['review'].map(remove_url)


#tokenize the reviews
def tokenize_rev(review):
    """Split ``review`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(review)
#from here on each review is stored as its tokens rather than as one string
John_wick_data['review'] = John_wick_data['review'].map(tokenize_rev)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iwuan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
#remove stop words from the reviews
nltk.download("stopwords")
#keep the English stop words in a set: remove_stop_words tests membership once per
#token, and a set lookup avoids scanning the whole list every time
#NOTE(review): punctuation was stripped from the reviews earlier, so apostrophe
#forms in this list (e.g. "don't") will presumably never match - verify if that matters
stop_words = set(stopwords.words('english'))

def remove_stop_words(clean_review_token):
    """Return a list of the tokens that do not appear in ``stop_words``, order preserved."""
    def is_kept(token):
        return token not in stop_words

    return list(filter(is_kept, clean_review_token))
#filter the stop words out of every tokenized review
John_wick_data['review'] = John_wick_data['review'].map(remove_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iwuan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
#lemmatization of the tokens
#download the 'wordnet' and 'omw-1.4' NLTK packages before creating the lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

#one shared lemmatizer instance, used by lemmatize_tokens for every token
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(clean_review_token):
    """Return a list with each token passed through ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, clean_review_token))

#lemmatize the tokens of every review
John_wick_data['review'] = John_wick_data['review'].map(lemmatize_tokens)


#Join the tokens back into text strings
def join_tokens(clean_review_token):
    """Return the tokens glued into one string with a single space between them."""
    return str.join(" ", clean_review_token)
#each review becomes a single cleaned string again
John_wick_data['review'] = John_wick_data['review'].map(join_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iwuan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\iwuan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Sentiment Analysis

In [36]:
def polarity(review):
    """Return the polarity score TextBlob assigns to ``review``."""
    blob = TextBlob(review)
    return blob.sentiment.polarity

In [37]:
#score every cleaned review and keep the result in a new 'polarity' column
John_wick_data['polarity'] = John_wick_data['review'].map(polarity)

In [38]:
def SentimentTextBlob(polarity):
    """Turn a polarity score into a sentiment label.

    Returns "Negative" for a score below zero, "Neutral" for exactly zero and
    "Positive" for anything else.
    """
    if polarity == 0:
        return "Neutral"
    return "Negative" if polarity < 0 else "Positive"

In [39]:
#label every review from its polarity score, then preview the first three rows
John_wick_data['Sentiment'] = John_wick_data['polarity'].map(SentimentTextBlob)
John_wick_data.head(3)

Unnamed: 0,title,review,rating,date,polarity,Sentiment
0,A new standard has been set for fight scenes,half review gushing action wow wow complete aw...,9,2023-03-24,0.191175,Positive
1,Yeah,know expect john wick movie thought franchise ...,9,2023-03-23,0.127741,Positive
2,"Not Just The Best John Wick, But Possibly One ...",ever since original john wick franchise set st...,10,2023-03-17,0.16463,Positive


In [40]:
#Count how many reviews carry each sentiment label (non-null polarity values per label)
John_wick_data.groupby("Sentiment")["polarity"].count().reset_index()

Unnamed: 0,Sentiment,polarity
0,Negative,98
1,Neutral,3
2,Positive,765


In [41]:
#Save the cleaned data to a CSV file for visualization
#index=False keeps the DataFrame's row index out of the written file
John_wick_data.to_csv('cleaned_john_wick_data.csv', index=False)