In [116]:
#pip install --upgrade numpy

In [117]:
import pandas as pd

In [118]:
df = pd.read_csv("imdb-dataset.csv")
df.head(20)
df.shape

(50000, 2)

In [119]:
# change all texts to lowercase
df['review'] = df['review'].str.lower()
df.head(20)
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [120]:
# Remove all HTML tags
import re

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [121]:
df['review'] = df['review'].apply(remove_html_tags)
#df.head(10)
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [122]:
# remove all URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [123]:
df['review'] = df['review'].apply(remove_url)
df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


<h1>Punctuation handling</h1>

In [124]:
import string, time

exclude = string.punctuation
print(exclude)

def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [125]:
#print(df['review'][3])
df['review'] = df['review'].apply(remove_punc)
#df['review'][3]

In [126]:
# Social media chat words
chat_words = {
    'FYI':'For Your Information',
    'ASAP': 'As Soon As Possible',
    'BRB': 'Be Right Back',
    'BTW':'By The Way',
    'OMG':'Oh My God',
    'IMO':'In My Opinion',
    'LOL':'Laugh Out Loud',
    'TTYL':'Talk To You Later',
    'GTG':'Got To Go',
    'TTYT':'Talk To You Tomorrow',
    'IDK':"I Don't Know",
    'TMI':'Too Much Information',
    'IMHO':'In My Humble Opinion',
    'ICYMI':'In Case You Missed It',
    'AFAIK':'As Far As I Know',
    'BTW':'By The Way',
    'FAQ':'Frequently Asked Questions',
    'TGIF':"Thank God It's Friday",
    'FYA':'For Your Action',
    'ICYMI':'In Case You Missed It',
}

In [127]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [128]:
# Example
chat_conversion('Do this work ASAP')

'Do this work As Soon As Possible'

In [129]:
df['review'] = df['review'].apply(chat_conversion)

In [130]:
# Incorrect text handling
from textblob import TextBlob

def correct_words(text):
    textBlb = TextBlob(text)
    return textBlb.correct().string

# example
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the same manner'
correct_words(incorrect_text)

'certain conditions during several generations are modified in the same manner'

In [131]:
# remove grammartically incorrect words
#df['review'] = df['review'].apply(correct_words)

In [132]:
from nltk.corpus import stopwords
import nltk

#nltk.download('stopwords')
len(stopwords.words('english'))

179

In [133]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

# example
remove_stopwords("probably my all-time favourite movie, a story of selflessness, sacrifice and dedication")

'probably  all-time favourite movie,  story  selflessness, sacrifice  dedication'

In [134]:
# remove stop words
#df['review'] = df['review'].apply(remove_stopwords)

**Remove Emoji handle**

In [135]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [136]:
remove_emoji('I love you😘')

'I love you'

In [137]:
remove_emoji('LMAO😀. This is a joke😒')

'LMAO. This is a joke'

<h1>Emoji Handling</h1>

In [138]:
#pip install emoji  #interpret emoji meanings

In [139]:
# import emoji
# print(emoji.demojize('Python is 🔥'))

<h1>TOKENIZATION</h1>

**1. Using split method**

In [140]:
# word tokenization
sentence1 = "I am going to be a billionaire"
sentence1.split()

['I', 'am', 'going', 'to', 'be', 'a', 'billionaire']

In [141]:
# sentence tokenization
sentence2 = "I am going to be a billionaire. Change peoples lives. See you at the top"
sentence2.split('.')

['I am going to be a billionaire',
 ' Change peoples lives',
 ' See you at the top']

**2. Using Regular Expression**

In [142]:
import re
sent3 = 'I am going to the moon!'
tokens = re.findall("[\w']+", sent3)
tokens

  tokens = re.findall("[\w']+", sent3)


['I', 'am', 'going', 'to', 'the', 'moon']

In [143]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry?\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

**3. Using NLTK**

In [144]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [145]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [146]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

**4. Using Spacy**

In [147]:
#pip install spacy

In [148]:
#pip install blis

In [149]:
#import spacy
#nlp = spacy.load('en_core_web_sm')

In [150]:
# doc1 = nlp(sent5)
# doc2 = nlp(sent6)
# doc3 = nlp(sent7)
# doc4 = nlp(sent1)

<h1>Stemmer</h1>
<p>text preprocessing technique used to reduce words to their root or base form.</p>

In [151]:
from nltk.stem import PorterStemmer

In [152]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [153]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [154]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

<h1>Lemmatization</h1>
<p>Lemmatization is the process of grouping together different inflected forms of the same word. </p>

In [155]:
import nltk
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [156]:
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


NOTE: Stemming & lamatization are same to retrieve root words but lamatization is worked good. Lamatization is slow & stemming is fast