In [None]:
import pandas as pd
import numpy as np
import seaborn as sn

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/My Drive/datasets_files/IMDB_Dataset.csv')

In [None]:
df.shape

(50000, 2)

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df["review"][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [None]:
df["review"][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

**Converting to lowercase**

In [None]:
df["review"]=df["review"].str.lower()

In [None]:
df["review"]

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

**Removing HTML tags**

In [None]:
# using regex
import re

# Compiled once so that applying the function to every review reuses the
# same pattern object. The non-greedy ".*?" stops at the first ">" so each
# tag is matched separately.
_HTML_TAG_RE = re.compile(r"<.*?>")

def remove_html_tags(text, replacement=''):
  """Replace every ``<...>`` span in ``text``.

  Args:
    text: string that may contain HTML tags.
    replacement: text substituted for each tag. The default ``''`` deletes
      the tag outright, which joins the words on either side of it
      (``"time.<br />this"`` -> ``"time.this"``); pass ``' '`` to keep them
      separated. The value is used as a regex replacement template, so
      backslashes in it are interpreted by ``re``.

  Returns:
    The string with each tag replaced. Tags that contain a newline are not
    matched because ``.`` does not match ``\\n``.
  """
  return _HTML_TAG_RE.sub(replacement, text)

In [None]:
text = "<html><body><p> Movie </p><p> actor</p> <h1>Prabhas</h1></body></html>"

In [None]:
remove_html_tags(text)

' Movie  actor Prabhas'

In [None]:
df["review"]=df["review"].apply(remove_html_tags)

In [None]:
df["review"][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

**Removing URLs**

In [None]:
text1 = "check out my https://dialogflow.cloud.google.com/"
text2 = "check google www.google.com"

In [None]:
def remove_url(text):
  """Return ``text`` with every ``http://``/``https://`` or ``www.`` token deleted.

  A match runs up to the next whitespace character, so the whitespace
  surrounding a removed URL is left in place.
  """
  url_regex = r'https?://\S+|www\.\S+'
  return re.sub(url_regex, r'', text)

In [None]:
remove_url(text1)

'check out my '

In [None]:
remove_url(text2)

'check google '

**Removing Punctuation**

In [None]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
exclude = string.punctuation

In [None]:
# Baseline approach - it is very slow: one full str.replace pass per
# character in `exclude`.
def remove_punc(text):
  """Return ``text`` with every character found in ``exclude`` deleted."""
  cleaned = text
  for symbol in exclude:
    cleaned = cleaned.replace(symbol, '')
  return cleaned

In [None]:
text = "string. with.- punctuation?"

In [None]:
text

'string. with.- punctuation?'

In [None]:
import time

In [None]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse (or
# jump) when timing a single call.
start = time.perf_counter()
remove_punc(text)
time1 = time.perf_counter()-start
print(time1)

9.441375732421875e-05


In [None]:
print(time1*500000)

47.206878662109375


In [None]:
# another technique: a single translate() pass instead of repeated replace()
def remove_punc1(text):
  """Return ``text`` with every character found in ``exclude`` deleted."""
  # The third argument of str.maketrans lists the characters to drop.
  deletion_table = str.maketrans('', '', exclude)
  stripped = text.translate(deletion_table)
  return stripped

In [None]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse (or
# jump) when timing a single call.
start = time.perf_counter()
remove_punc1(text)
time2 = time.perf_counter()-start
print(time2)

6.29425048828125e-05


In [None]:
print(time2*500000)

31.47125244140625


**chat word treatment**

In [None]:
# Chat/SMS abbreviations (like gm, hlo, hi, dm, gn) mapped to their expansions.
# Keys are upper-case, so a token must be upper-cased before it is looked up.
# Each abbreviation appears exactly once: a repeated key in a dict literal is
# silently overwritten by the later entry.
chat_words = {
    "LOL": "Laughing Out Loud",
    "BRB": "Be Right Back",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "ASAP": "As Soon As Possible",
    "TBH": "To Be Honest",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "WTF": "What The F***",
    "AFK": "Away From Keyboard",
    "FYI": "For Your Information",
    "IRL": "In Real Life",
    "BTW": "By The Way",
    "G2G": "Got To Go",
    "ROFL": "Rolling On the Floor Laughing",
    "NVM": "Never Mind",
    "ICYMI": "In Case You Missed It",
    "JK": "Just Kidding",
    "LMAO": "Laughing My A** Off",
    "OOTD": "Outfit Of The Day",
    "TGIF": "Thank God It's Friday",
    "YOLO": "You Only Live Once",
    "SMH": "Shaking My Head",
    "NSFW": "Not Safe For Work",
    "HBD": "Happy Birthday",
    "IMHO": "In My Humble Opinion",
    "BFF": "Best Friends Forever",
    "NBD": "No Big Deal",
    "PPL": "People",
    "TTYL": "Talk To You Later",
    "IKR": "I Know Right",
    "CBA": "Can't Be Arsed",
    "FOMO": "Fear Of Missing Out",
    "KYS": "Kill Yourself",
    "TY": "Thank You",
    "WCW": "Woman Crush Wednesday",
    "MCM": "Man Crush Monday",
    "AFAIK": "As Far As I Know",
    "SFW": "Safe For Work",
    "BRUH": "Brother",
    "GG": "Good Game",
    "TFW": "That Feeling When",
    "TBF": "To Be Fair",
    "OMW": "On My Way",
    "MFW": "My Face When"
}


In [None]:
def chat_conversion(text):
  """Expand chat abbreviations found in ``text``.

  The text is split on whitespace; a token whose upper-cased form is a key
  of ``chat_words`` is swapped for its expansion, any other token is kept
  unchanged. The tokens are re-joined with single spaces.
  """
  expanded = [chat_words.get(token.upper(), token) for token in text.split()]
  return " ".join(expanded)

In [None]:
chat_conversion("TY ra")

'Thank You ra'

In [None]:
chat_conversion("IMHO he is the best")

'In My Humble Opinion he is the best'

**spelling correction**

In [None]:
from textblob import TextBlob

In [None]:
incorrect_text = "ceeritain conditionas negetive impression"
textblb = TextBlob(incorrect_text)
textblb.correct().string

'certain conditions negative impression'

**remove stopwords**

In [None]:
# remove stop words
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Stop-word sets already loaded from NLTK, keyed by language, so the corpus
# is read once instead of once per word.
_STOPWORD_SETS = {}

def remove_stopwords(text, stop_words=None):
  """Blank out the stop words in ``text``.

  Args:
    text: string to filter; it is split on whitespace.
    stop_words: optional collection of words to blank out. When omitted,
      NLTK's English stop-word list is used (loaded on first use, then
      reused from ``_STOPWORD_SETS``).

  Returns:
    The tokens re-joined with single spaces. A stop word is replaced by an
    empty string rather than dropped, so runs of spaces appear where stop
    words used to be; this keeps the output identical to what the function
    has always produced.
  """
  if stop_words is None:
    if "english" not in _STOPWORD_SETS:
      # frozenset gives constant-time membership tests; the NLTK list would
      # be scanned linearly for every word.
      _STOPWORD_SETS["english"] = frozenset(stopwords.words("english"))
    stop_words = _STOPWORD_SETS["english"]
  return " ".join('' if word in stop_words else word for word in text.split())

In [None]:
remove_stopwords("the sun rises in east direction,so i am very happy")

' sun rises  east direction,so    happy'

In [None]:
df["review"]=df["review"].apply(remove_stopwords)

**Handling Emojis**

In [None]:
import re

# Compiled once. The class is limited to symbol/emoji blocks and the invisible
# code points that glue emoji sequences together; it deliberately does NOT
# span U+24C2..U+1F251 or the whole U+10000..U+10FFFF range, because those
# ranges also contain CJK ideographs, kana, Hangul and other real text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"  # tiles/cards, enclosed characters, flags, pictographs, emoticons, transport
    "\U000E0020-\U000E007F"  # tag characters used inside some flag sequences
    "\u2500-\u2BEF"          # box drawing .. misc symbols and arrows (covers U+2600-U+2B55 and the dingbats)
    "\u24C2"                 # circled latin capital M
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "]+"
)

def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Only the matched characters are removed; surrounding whitespace and all
    letters (including non-Latin scripts) are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)


In [None]:
remove_emojis("loved the film, it was 😀 😁 😂 🤣 😃 ")

'loved the film, it was      '

In [None]:
pip install emoji --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=7f2893be928828b89894ff207a194df428ac918037c861cb83e9c30326572521
  Stored in directory: /root/.cache/pip/wheels/02/3d/88/51a592b9ad17e7899126563698b4e3961983ebe85747228ba6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0


In [None]:
import emoji
print(emoji.demojize("java is 😋"))

java is :face_savoring_food:


**Tokenization**

In [None]:
# 1.Using split function
# word tokenization
sent1 = "i am in hyd"
sent1.split()

['i', 'am', 'in', 'hyd']

In [None]:
# sentence tokenization

sent2 = "i im in hyd.Tommorrow i will go to my home"
sent2.split('.')

['i im in hyd', 'Tommorrow i will go to my home']

In [None]:
# problems with split

sent3 = "i am going to hyd!!"
sent3.split()

['i', 'am', 'going', 'to', 'hyd!!']

In [None]:
sent4 = "I am in U.S"
sent4.split('.')

['I am in U', 'S']

In [None]:
# 2 regular expression
import re
sent3 = "I am in hyd"
# Raw string: in a normal literal "\w" is an invalid escape sequence, which
# Python warns about. The pattern keeps runs of word characters and
# apostrophes together as one token.
tokens = re.findall(r"[\w']+",sent3)
tokens

['I', 'am', 'in', 'hyd']

In [None]:
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit.
 Nunc et aliquam metus, sed pellentesque augue. Donec auctor nisl quis eros dictum,
  ac aliquam nisi venenatis. Fusce semper orci a urna aliquet, eu dignissim massa egestas.
  Sed tincidunt dolor sit amet nisl vestibulum, ac dapibus odio consequat.
  Etiam rutrum metus eu sapien pellentesque, a faucibus leo ultrices.
  Praesent et dui sed augue sagittis interdum a ac quam.
  Sed eget nisi ac augue maximus pretium nec in metus."""
sentences = re.compile('[.!?]').split(text)
sentences

['Lorem ipsum dolor sit amet, consectetur adipiscing elit',
 '\n Nunc et aliquam metus, sed pellentesque augue',
 ' Donec auctor nisl quis eros dictum,\n  ac aliquam nisi venenatis',
 ' Fusce semper orci a urna aliquet, eu dignissim massa egestas',
 ' \n  Sed tincidunt dolor sit amet nisl vestibulum, ac dapibus odio consequat',
 ' \n  Etiam rutrum metus eu sapien pellentesque, a faucibus leo ultrices',
 ' \n  Praesent et dui sed augue sagittis interdum a ac quam',
 '\n Sed eget nisi ac augue maximus pretium nec in metus',
 '']

In [None]:
# NLTK
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
import nltk
# NOTE: called with no arguments, this opens NLTK's interactive downloader,
# which waits for typed commands (see the "Downloader>" prompt in the output)
# and therefore stalls an unattended "Run All".
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] bcp47............... BCP-47 Language Tags
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)


True

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
sent1 = "i am going to visit delhi!"
word_tokenize(sent1)

['i', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [None]:
sent5 = "I have a Ph.D in A.I"

In [None]:
word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [None]:
sent6 = "we're here to help! mail us at pavan@gmail.com"

In [None]:
word_tokenize(sent6)

['we',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'pavan',
 '@',
 'gmail.com']

In [None]:
sent7 = "A 5km ride cost $10.50"

In [None]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

In [None]:
# Spacy
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [None]:
for token in doc1:
  print(token)

I
have
a
Ph
.
D
in
A.I


In [None]:
for token in doc2:
  print(token)

we
're
here
to
help
!
mail
us
at
pavan@gmail.com


In [None]:
for token in doc3:
  print(token)

A
5
km
ride
cost
$
10.50


In [None]:
for token in doc4:
  print(token)

i
am
going
to
visit
delhi
!


**Stemming**

In [None]:
# bring back to root form eg. play,played,playing -------->play

from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer()
def stem_words(text):
  """Stem every whitespace-separated token of ``text`` with the Porter stemmer.

  Returns the stemmed tokens joined by single spaces.
  """
  stemmed = []
  for token in text.split():
    stemmed.append(ps.stem(token))
  return " ".join(stemmed)

In [None]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

**Lemmatization**

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time.He has bad habit of swimming after playing long hours in the sun"
punctuations = "?:!.,;"

# Build a filtered list rather than removing items from the list that is
# being iterated: list.remove() inside the loop shifts the remaining items
# and the token right after each removal is skipped. Testing against a set of
# single characters also avoids substring matches against `punctuations`.
punctuation_tokens = set(punctuations)
sentence_words = [word for word in nltk.word_tokenize(sentence) if word not in punctuation_tokens]

# Two left-aligned columns, 20 characters wide each.
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
  # pos='v' (part of speech = verb): every token is lemmatized as a verb.
  print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word,pos='v')))

