# Text Pre-Processing

#### Step 1: Import necessary libraries 

In [34]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import string # For string related operations
import seaborn as sns  # For data visualization
import emoji # For emoji analysis
from nltk.corpus import wordnet
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # For lemmatization of words
from nltk.tokenize import word_tokenize  # For tokenization and n-grams generation
from nltk.corpus import stopwords  # For stopwords
import re  # Regular expressions for text cleaning
import matplotlib.pyplot as plt  # For creating visualizations
from sklearn.feature_extraction.text import CountVectorizer  # For converting text data to numerical format
from wordcloud import WordCloud # For generating word clouds
from nltk import pos_tag
import contractions 

#### Step 2: Load the dataset

In [2]:
movie_re = pd.read_csv('IMDB Dataset.csv')
movie_re.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


#### Step 3: Lower Case

In [3]:
movie_re["review_transformed"] = movie_re["review"].str.lower()
movie_re.head()

Unnamed: 0,review,sentiment,review_transformed
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"petter mattei's ""love in the time of money"" is..."


In [4]:
movie_re["review_transformed"][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

#### Step 4: URL Removal

In [5]:
pattern = r'(https://\S+|www\.\S+)' 
movie_re['urls'] = movie_re['review_transformed'].str.extract(pattern)
movie_re['review_transformed']=movie_re['review_transformed'].str.replace(pattern, '', regex=True)

In [6]:
movie_re['urls'].unique()

array([nan, 'www.cei.org.', 'www.invocus.net)', 'www.softfordigging.com',
       'www.petitiononline.com/19784444/petition.html',
       'www.comingsoon.net/films.php?id=36310', 'www.residenthazard.com)',
       'www.zonadvd.com', 'www.nixflix.com', 'www.abc.net.au/chaser.',
       'www.lovetrapmovie.com', 'www.thepetitionsite.com',
       'www.petitiononline.com/gh1215/petition.html',
       'www.johntopping.com/harvey%20perr/war%20widow/war_widow.html',
       'www.mediasickness.com', 'www.imdb.com/title/tt0073891/',
       'www.imdb.com/title/tt0363163/<br', 'www.poffysmoviemania.com)',
       'www.gutenberg.org/ebooks/18137', 'www.reel13.org)',
       'www.cinemablend.com/feature.php?id=209',
       'www.youtube.com/watch?v=rmb4-hyet_y',
       'www.dvdbeaver.com/film/dvdcompare2/kingofmasks.htm<br',
       'www.helium.com/items/1433421-sydney-white-review',
       'www.imdb.com/title/tt0962736/awards',
       'www.screendaily.com/screendailyarticle.aspx?intstoryid=39811',
       '

In [7]:
movie_re['review_transformed'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.<br /><br />it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />i would say the main appeal of the show is due to the fa

#### Step 5: HTML Tags

In [8]:
html_pattern = re.compile(r'<.*?>')
movie_re['review_transformed']=movie_re['review_transformed'].str.replace(html_pattern, ' ', regex=True)

In [9]:
movie_re['review_transformed'][0]

"one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.  the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word.  it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.  i would say the main appeal of the show is due to the fact that it goes where other sh

#### Step 6:  Contractions 

In [10]:
movie_re["review_transformed"]=movie_re['review_transformed'].apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))


In [11]:
movie_re["review_transformed"][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you will be hooked. they are right, as this is exactly what happened with me. the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs, sex or violence. its is hardcore, in the classic use of the word. it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. them city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away. i would say the main appeal of the show is due to the fact that it goes where other s

#### Step 7:  Punctuation Marks


In [12]:
movie_re['review_transformed'] = movie_re['review_transformed'].str.translate(str.maketrans(' ',' ',string.punctuation))
movie_re.head()

Unnamed: 0,review,sentiment,review_transformed,urls
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,
3,Basically there's a family where a little boy ...,negative,basically there is a family where a little boy...,
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,


In [13]:
movie_re["review_transformed"][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you will be hooked they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda them city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows would not dare forget pre

#### Step 8: Stop Words

In [14]:
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
movie_re["review_transformed"] = movie_re['review_transformed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [16]:
movie_re["review_transformed"][0]

'one reviewers mentioned watching 1 oz episode hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows would dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz mess around first episode ever saw struck nasty surreal could say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitches due l

#### Step 9: Numbers

In [17]:
movie_re['review_transformed'] = movie_re['review_transformed'].str.replace('\d+', '')

  movie_re['review_transformed'] = movie_re['review_transformed'].str.replace('\d+', '')


In [18]:
movie_re.review_transformed[0]

'one reviewers mentioned watching  oz episode hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows would dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz mess around first episode ever saw struck nasty surreal could say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitches due la

#### Step 10: Emoji 

In [19]:
movie_re['emojis'] = movie_re['review_transformed'].apply(lambda row: ''.join(c for c in row if c in emoji.EMOJI_DATA))

In [20]:
movie_re[movie_re['emojis']!='']

Unnamed: 0,review,sentiment,review_transformed,urls,emojis
13735,I checked this movie out based on a favorable ...,negative,checked movie based favorable review page slow...,,®
21141,"""In April 1946, the University of Chicago agre...",positive,april university chicago agreed operate argon...,,®
22728,That's the sound of Stan and Ollie spinning in...,negative,sound stan ollie spinning graves bother listin...,,®
36507,Klatret©ªsen(Catch That Girl) is really great ...,positive,klatret©ªsencatch girl really great movie happ...,,©
46515,I chuckled a few times during this movie. I la...,negative,chuckled times movie laughed loud notarizing m...,,®


# Data Transformation

#### Tokenization

In [21]:
movie_re['tokenized'] = movie_re['review_transformed'].apply(lambda x: word_tokenize(x))

In [22]:
movie_re['tokenized']

0        [one, reviewers, mentioned, watching, oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, creative, o...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [going, disagree, previous, comment, side, mal...
49999    [one, expects, star, trek, movies, high, art, ...
Name: tokenized, Length: 50000, dtype: object

#### Lemmatization

In [28]:
def get_pos_tag(word):
    pos_tag = nltk.pos_tag([word])[0][1][0].upper()
    tags_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }

    return tags_dict.get(pos_tag)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    tokens = nltk.word_tokenize(text)
    for token in tokens:
        token_tag = get_pos_tag(token)
        if token_tag is None:
            lemmatized_tokens.append(token)
        else:
            lemma = lemmatizer.lemmatize(token, token_tag)
            lemmatized_tokens.append(lemma)
    return ' '.join(lemmatized_tokens)

In [29]:
movie_re['lemmatized_tokens'] = movie_re['review_transformed'].apply(lemmatize)

In [31]:
movie_re['lemmatized_tokens'][0]

'one reviewer mention watch oz episode hooked right exactly happen first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show would dare forget pretty picture paint mainstream audience forget charm forget romanceoz mess around first episode ever saw struck nasty surreal could say ready watch developed taste oz get accustom high level graphic violence violence injustice crooked guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison experience watch oz may bec

#### Stemming 

In [47]:
stemmer = SnowballStemmer("english")
movie_re['stemmed'] = movie_re['tokenized'].apply(lambda tokens: ' '.join([stemmer.stem(token) for token in tokens])) 

In [48]:
movie_re['stemmed'][0]

'one review mention watch oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show would dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz mess around first episod ever saw struck nasti surreal could say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort viewingthat get touch darker 

In [49]:
movie_re['review_transformed'][0]

'one reviewers mentioned watching  oz episode hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows would dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz mess around first episode ever saw struck nasty surreal could say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitches due la