In [1]:
import pandas as pd
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 1 - Lowercasing

In [2]:
df['review'][3].lower() # lowercasing 3rd row of review column

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [3]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## 2 - Remove HTML Tags

In [4]:
import re # regular expression
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [5]:
df['review'] = df['review'].apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## 3 - Remove URLs 

In [6]:
text1 = 'Check out my notebook www.google.com https://www.kaggle.com/tutor/python'

In [7]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)
    
                         

In [8]:
remove_url(text1)

'Check out my notebook  '

## 4 - Remove Punctuations

In [9]:
import string
import time
print(string.punctuation)
exclude = string.punctuation

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
# This method takes more time if we have large data. So this method is only good for small data.
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [11]:
text = 'You,are.not;good:'
print(remove_punc(text))

Youarenotgood


In [12]:
print(df['review'].apply(remove_punc))
start = time.time()
time1 = time.time() - start
print(time1)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object
0.0


In [13]:
# This method is good for large data.
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [14]:
print(df['review'].apply(remove_punc1))

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object


## 5 -  Chat Word Treatment

In [15]:
chat_words = {'AFAIK':'As Far As I Know',
'AFK': 'Away From Keyboard',
'ASAP':'As Soon As Possible',
'ATK': 'At The Keyboard',
'ATM':'At The Moment',
'A3': 'Anytime, Anywhere, Anyplace',
'BAK':'Back At Keyboard',
'BBL':'Be Back Later',
'BBS':'Be Back Soon',
'BFN':'Bye For Now',
'B4N':'Bye For Now',
'BRB':'Be Right Back',
'BRT':'Be Right There',
'BTW':'By The Way',
'B4':'Before',
'B4N':'Bye For Now',
'CU':'See You',
'CUL8R':'See You Later',
'CYA':'See You',
'FAQ':'Frequently Asked Questions',
'FC':'Fingers Crossed',
'FWIW':'For What It is Worth',
'FYI':'For Your Information',
'GAL':'Get a Life',
'GG':'Good Game',
'GN':'Good Night',
'GMTA':'Great Minds Think Alike',
'GR8':'Great!',
'G9':'Genius',
'IC':'I See',
'ICQ':'I Seek you (also a chat program)',
'ILU':'I Love You',
'IMHO':'In My Honest/Humble Opinion',
'IMO':'In My Opinion',
'IOW':'In Other Words',
'IRL':'In Real Life',
'KISS':'Keep It Simple, Stupid',
'LDR':'Long Distance Relationship',
'LMAO':'Laugh My A.. Off',
'LOL':'Laughing Out Loud',
'LTNS':'Long Time No See',
'L8R':'Later',
'MTE':'My Thoughts Exactly',
'M8':'Mate',
'NRN':'No Reply Necessary',
'OIC':'Oh I See',
'PITA':'Pain In The A..',
'PRT':'Party',
'PRW':'Parents Are Watching',
'QPSA?':'Que Pasa?',
'ROFL':'Rolling On The Floor Laughing',
'ROFLOL':'Rolling On The Floor Laughing Out Loud',
'ROTFLMAO':'Rolling On The Floor Laughing My A.. Off',
'SK8':'Skate',
'STATS':'Your sex and age',
'ASL':'Age, Sex, Location',
'THX':'Thank You',
'TTFN':'Ta-Ta For Now!',
'TTYL':'Talk To You Later',
'U':'You',
'U2':'You Too',
'U4E':'Yours For Ever',
'WB':'Welcome Back',
'WTF':'What The F...',
'WTG':'Way To Go!',
'WUF':'Where Are You From?',
'W8':'Wait...',
'7K':'Sick:-D Laugher'}

In [16]:
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': 'For What It is Worth',
 'FYI': 'For Your Information',
 'GAL': 'Get a Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LMAO':

In [17]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [18]:
chat_conversion('FAQ : What you want to do?')

'Frequently Asked Questions : What you want to do?'

## 6 - Spelling Correction

In [19]:
from textblob import TextBlob

In [20]:
incorrect_text = 'Certan condishons are not favorabal here'
text = TextBlob(incorrect_text)
text.correct().string

'Certain conditions are not favorable here'

In [21]:
def correct_text(text):
    text = TextBlob(text)
    text.correct().string

In [22]:
 # df['review']=df['review'].apply(correct_text)

## 7 - Removing Stop Words

<b>The Words which help in only sentence formation but not in sentence meaning. <br>
    Examples: a, the, of, are, is, my, etc. <br>
    In Parts of Speech tagging, we don't remove stop words.

In [23]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [25]:
def remove_stopwords(text):
    new_text = []
    for word in text.split(): # breaking text into words
        if word in stopwords.words('english'): # Checking whether word is in English language
            new_text.append(' ') # Then we will replace empty in new_text in the place of that word
        else:
            new_text.append(word) # if that is not stopword, then add word in new_text list 
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [26]:
remove_stopwords('probably my all issues are resolved')

'probably     issues   resolved'

In [27]:
# df['review'] = df['review'].apply(remove_stopwords)

## 7 - Handling Emojis

In [28]:
import re
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emotion
                               u"\U0001F300-\U0001F5FF"  # symbols and pictographs
                               u"\U0001F680-\U0001F6FF"  # transport and map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (ios)
                               u"\U00002702-\U00002780"
                               u"\U000024C2-\U0001F251"
                               "]+",flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [29]:
remove_emojis("You are smily.😃")

'You are smily.'

In [30]:
import emoji
print(emoji.demojize('Python is 😃'))

Python is :grinning_face_with_big_eyes:


## 8 - Tokenization

<b> Breaking text into smaller parts (tokens). Smaller parts can be words, sentences, phrases.

### i - Using Split Function

In [31]:
# Word Tokenization
sent1 = 'I am going to Lahore'
sent1.split() # Splitting on the basis of spaces

['I', 'am', 'going', 'to', 'Lahore']

In [32]:
# Sentence Tokenization
sent2 = 'I am going to Lahore. You are going to Karachi'
sent2.split('.') # Splitting on the basis of full stop

['I am going to Lahore', ' You are going to Karachi']

In [33]:
# Problem Faced by Split Function
sent3 = 'You are very good!'
print(sent3.split())

sent4 = 'I am good person'
print(sent4.split())

['You', 'are', 'very', 'good!']
['I', 'am', 'good', 'person']


<b> The above result shows that good and good! are two different words

In [34]:
sent = "I live in Lahore.You live in Islamabad. What do u do? I am farmer"
sent.split('.')

['I live in Lahore', 'You live in Islamabad', ' What do u do? I am farmer']

<b> The above result shows that sentences have not been tokenized by '?'

### ii-Using Regular Expression

In [35]:
import re
sent = 'I am going to Delhi!'
tokens = re.findall("[\w']+", sent)
tokens

['I', 'am', 'going', 'to', 'Delhi']

In [36]:
# Splitting on the basis of your choice
text1 = 'I am good.You are not good! Where do u live? I live in Lahore: I eat a lot of vegetables; You eat some' 
sentences = re.compile('[.?;]').split(text1)
sentences

['I am good',
 'You are not good! Where do u live',
 ' I live in Lahore: I eat a lot of vegetables',
 ' You eat some']

### iii-Using NLTK Library

In [37]:
# Not perfect 100% result. But better results
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [38]:
text = 'You are good. I am also good. Where do u live? I live in Lahore: I live in Karachi; But you live in Islamabad.'
sent_tokenize(text)

['You are good.',
 'I am also good.',
 'Where do u live?',
 'I live in Lahore: I live in Karachi; But you live in Islamabad.']

In [39]:
text1 = 'I have a Ph.D in A.i'
text2 = "We're here to help! Mail us at 123@gmail.com"
text3 = 'A 5km ride cost $20.5'
print(word_tokenize(text1))
print(word_tokenize(text2))
print(word_tokenize(text3))

['I', 'have', 'a', 'Ph.D', 'in', 'A.i']
['We', "'re", 'here', 'to', 'help', '!', 'Mail', 'us', 'at', '123', '@', 'gmail.com']
['A', '5km', 'ride', 'cost', '$', '20.5']


### iv-Spacy

In [40]:
#Better than NLTK Library
!pip install spacy
import spacy





In [41]:
nlp = spacy.load('en_core_web_sm')

In [42]:
# converting sentences into documents
doc1=nlp(text1)
doc2=nlp(text2)
doc3=nlp(text3)
doc4=nlp(text)

In [43]:
for token in doc3:
    print(token)

A
5
km
ride
cost
$
20.5


In [44]:
for token in doc4:
    print(token)

You
are
good
.
I
am
also
good
.
Where
do
u
live
?
I
live
in
Lahore
:
I
live
in
Karachi
;
But
you
live
in
Islamabad
.


## 9 - Stemming

<b> To retrieve root word. Mostly used in Information Retrieval Systems.Multiple stemmers are used.<br>
    i - Porter Stemmer - For English Language<br>
    ii - snowball stemmer - For Other Languages

### i - Porter Stemmer

In [45]:
# When to show user output, then lemmatization is better. Output of Lemmatization will always be English language word

In [46]:
from nltk.stem.porter import PorterStemmer

In [47]:
ps = PorterStemmer()

def stem_words(text):
    return " " .join([ps.stem(word) for word in text.split()])

In [48]:
text1 = 'Walking is good, He has walked, walk and walks'
stem_words(text1)

'walk is good, he ha walked, walk and walk'

In [49]:
# df['review'].apply(stem_words)

In [50]:
text2 = 'Smoking is dangerous probably due to its starting university time'
stem_words(text2)

'smoke is danger probabl due to it start univers time'

## 10 - Lemmatization

<b> We use WordNet Lemmatizer

In [51]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = 'Smoking is dangerous probably due to its starting university time'
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)
sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
Smoking             Smoking             
is                  is                  
dangerous           dangerous           
probably            probably            
due                 due                 
to                  to                  
its                 it                  
starting            starting            
university          university          
time                time                


<b>Difference Between Stemming and Lemmatization<br>
Both work same - to retrive root word. <br>
 When to show user output, then lemmatization is suitable. Because output of Stemming is sometimes is not English Language word (e.g. Probabl). Output of Lemmatization will always be English language word.<br>
Lemmatization is slower than Stemming

In [52]:
import pandas as pd
import requests

In [53]:
response = requests.get('https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page=1')

In [54]:
response.json()['results']

[{'adult': False,
  'backdrop_path': '/rSPw7tgCH9c6NqICZef4kZjFOQ5.jpg',
  'genre_ids': [18, 80],
  'id': 238,
  'original_language': 'en',
  'original_title': 'The Godfather',
  'overview': 'Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on his life, his youngest son, Michael steps in to take care of the would-be killers, launching a campaign of bloody revenge.',
  'popularity': 92.658,
  'poster_path': '/3bhkrj58Vtu7enYsRolD1fZdja1.jpg',
  'release_date': '1972-03-14',
  'title': 'The Godfather',
  'video': False,
  'vote_average': 8.7,
  'vote_count': 16739},
 {'adult': False,
  'backdrop_path': '/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg',
  'genre_ids': [18, 80],
  'id': 278,
  'original_language': 'en',
  'original_title': 'The Shawshank Redemption',
  'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy D