In [15]:
import pandas as pd
import re

In [16]:
df = pd.read_csv("IMDB Dataset.csv")


In [17]:
df.shape

(50000, 2)

In [18]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Lowercasing

In [19]:
# a single review (one cell of the column)
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [20]:
# whole column
df['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

Remove HTML tags

In [21]:
# Strip HTML tags from a piece of text
def striphtml(data):
    """Return `data` with every `<...>` span deleted.

    A tag is matched non-greedily from `<` to the nearest `>`; because `.`
    does not match a newline, a span that crosses a line break is left alone.
    Matches are replaced with the empty string, so the text on either side of
    a tag ends up directly adjacent.
    """
    return re.sub(r'<.*?>', '', data)

To make a preprocessing step permanent, its result has to be assigned back to the column:

df['review'] = df['review'].apply(striphtml)

In [48]:
df['review'].apply(striphtml)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [46]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

Remove URLs

In [23]:
def remove_url(text):
    """Return `text` with URLs deleted.

    A URL is anything starting with `http://`, `https://` or `www.` and
    running up to the next whitespace character. The match is case-sensitive
    and the URL is replaced with the empty string.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [24]:
df['review'].apply(remove_url)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

Remove Punctuation

In [None]:
import string, time

In [None]:
# the punctuation characters that will be stripped; characters can be dropped from this set if they should be kept
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
exclude = string.punctuation

In [None]:
# This version is slow (one full str.replace pass over the text per
# punctuation character), so it is not used for the actual preprocessing.

def remove_punc(text, chars=None):
    """Return `text` with every character found in `chars` deleted.

    Parameters
    ----------
    text : str
        The string to clean.
    chars : iterable of str, optional
        Characters to delete. Defaults to the notebook-level `exclude`
        set, looked up at call time.

    Returns
    -------
    str
        `text` without any of the given characters.
    """
    if chars is None:
        chars = exclude
    for char in chars:
        text = text.replace(char, '')
    return text

In [30]:
text = "string. with. punctuation?"

In [None]:
# Time a single call; the cost adds up quickly on large data files.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring short durations.
start = time.perf_counter()
print(remove_punc(text))
time1 = time.perf_counter() - start
print(time1)

string with punctuation
0.00013518333435058594


In [None]:
# So we'll use an alternate method to remove the punctuation from the data set.
# The author reports it as about 18 times faster than remove_punc
# (that measurement is not shown in this notebook).

def remove_punc1(text, chars=None):
    """Return `text` with every character found in `chars` deleted.

    All deletions happen in a single `str.translate` pass.

    Parameters
    ----------
    text : str
        The string to clean.
    chars : str, optional
        Characters to delete. Defaults to the notebook-level `exclude`
        set, looked up at call time.

    Returns
    -------
    str
        `text` without any of the given characters.
    """
    if chars is None:
        chars = exclude
    return text.translate(str.maketrans('', '', chars))

In [36]:
df['review'].apply(remove_punc1)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production br br The filmin...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis Love in the Time of Money is a ...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    Im going to have to disagree with the previous...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

Chat word treatment: replace commonly used chat abbreviations with their full form

In [43]:
# Chat abbreviation -> full expansion; the list was collected from an online source.
# Keys are upper-case because chat_conversation upper-cases each word before the lookup.

chat_abbreviations = {
    "LOL": "Laughing Out Loud",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "IDK": "I Don't Know",
    "IMO": "In My Opinion",
    "IMHO": "In My Humble Opinion",
    "TTYL": "Talk To You Later",
    "FYI": "For Your Information",
    "OMG": "Oh My God",
    "BFF": "Best Friends Forever",
    "GTG": "Got To Go",
    "G2G": "Got To Go",
    "LMAO": "Laughing My A** Off",
    "ROFL": "Rolling On the Floor Laughing",
    "SMH": "Shaking My Head",
    "TBH": "To Be Honest",
    "NP": "No Problem",
    "DM": "Direct Message",
    "TY": "Thank You",
    "THX": "Thanks",
    "NVM": "Never Mind",
    "OMW": "On My Way",
    "AFK": "Away From Keyboard",
    "TMI": "Too Much Information",
    "YOLO": "You Only Live Once",
    "FOMO": "Fear Of Missing Out",
    "IDC": "I Don't Care",
    "IKR": "I Know, Right?",
    "BAE": "Before Anyone Else",
    "ASAP": "As Soon As Possible",
    "JK": "Just Kidding"
}

In [41]:
def chat_conversation(text, abbreviations=None):
    """Expand chat abbreviations in `text` to their full form.

    The text is split on whitespace. For each word, leading and trailing
    non-word characters (punctuation, brackets, quotes) are set aside, the
    remaining core is upper-cased and looked up, and on a hit the expansion
    is re-wrapped in the original punctuation. This lets "lol," or "(IMO)"
    be expanded, which an exact match on the raw word would miss.

    Parameters
    ----------
    text : str
        The text to process.
    abbreviations : mapping of str to str, optional
        Upper-case abbreviation -> expansion. Defaults to the notebook-level
        `chat_abbreviations` dict, looked up at call time.

    Returns
    -------
    str
        The words joined with single spaces (runs of whitespace in the
        input are therefore collapsed), with known abbreviations expanded.
    """
    if abbreviations is None:
        abbreviations = chat_abbreviations

    expanded = []
    for token in text.split():
        # Separate surrounding punctuation from the word itself; the pattern
        # always matches because a token contains no whitespace.
        lead, core, trail = re.match(r'^(\W*)(.*?)(\W*)$', token).groups()
        full_form = abbreviations.get(core.upper())
        if full_form is None:
            expanded.append(token)
        else:
            expanded.append(lead + full_form + trail)

    return " ".join(expanded)

In [42]:
chat_conversation("JK I know IDC ASAP")

"Just Kidding I know I Don't Care As Soon As Possible"

In [44]:
df['review'].apply(chat_conversation)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

Correcting Spelling mistakes

In [49]:
from textblob import TextBlob

  from scipy.stats import fisher_exact


In [None]:
# The following line corrects spelling with TextBlob; it is left commented out because it takes a long time to run

# df['review'].apply(lambda x: str(TextBlob(x).correct()))


Stemming (reducing each word to its stem by stripping suffixes; the stem is not always a dictionary word)

In [54]:
import nltk
from nltk.stem.porter import PorterStemmer

In [57]:
ps = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of `text` with the Porter stemmer.

    The stemmed words are joined back together with single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [58]:
sample = "walk walk walking walked"
stem_words(sample)

'walk walk walk walk'

Lemmatization

In [74]:
# Tokenize a sample sentence into words using spaCy

import spacy

# Load the "en_core_web_sm" pipeline; `nlp` is reused by the lemmatization cell below
nlp = spacy.load("en_core_web_sm")

# Note: this rebinds `text`, which an earlier cell used for the punctuation example
text = "He was running to save his life and then He met with someone very dangerouns who could even kill him"
doc = nlp(text)

# Collect the text of every token, skipping tokens whose `is_space` flag is set
words = [token.text for token in doc if not token.is_space]
print("Words:", words)

Words: ['He', 'was', 'running', 'to', 'save', 'his', 'life', 'and', 'then', 'He', 'met', 'with', 'someone', 'very', 'dangerouns', 'who', 'could', 'even', 'kill', 'him']


In [76]:
# Performing lemmatization
#
# `nlp` (the loaded spaCy pipeline) and `text` (the sample sentence) both come
# from the tokenization cell above, so the pipeline is not imported or loaded
# a second time here.

doc = nlp(text)

# `lemma_` is the lemma string that spaCy assigned to each token
lemmatized = [token.lemma_ for token in doc]
print(lemmatized)

['he', 'be', 'run', 'to', 'save', 'his', 'life', 'and', 'then', 'he', 'meet', 'with', 'someone', 'very', 'dangeroun', 'who', 'could', 'even', 'kill', 'he']
