In [1]:
import pandas as pd

In [2]:
# Load the IMDB reviews CSV (path is relative to the notebook's working directory).
df = pd.read_csv("imdb-dataset.csv")
# NOTE: only the last expression of a cell is rendered, so this preview is discarded.
df.head(20)
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [3]:
# change all texts to lowercase
# Lower-case every review; the 'review' column is overwritten with the result.
df['review'] = df['review'].str.lower()
# NOTE: not displayed - only the last expression of a cell is rendered.
df.head(20)
# Show one sample review (index label 3) to inspect the result.
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [4]:
# Remove all HTML tags
import re

def remove_html_tags(text):
    """Strip HTML-style tags from ``text``.

    Every span that starts at a ``<`` and ends at the nearest following ``>``
    on the same line is deleted; nothing is inserted in its place, so the text
    on either side of a tag ends up adjacent.
    """
    return re.sub('<.*?>', '', text)

In [5]:
# Run the tag stripper over every review, overwriting the column.
df['review'] = df['review'].apply(remove_html_tags)
#df.head(10)
# Re-display the sample review (index label 3) to check that the <br /> tags are gone.
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [6]:
# remove all URLs
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is anything beginning with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. Matches are removed without a
    replacement, so surrounding whitespace is left untouched.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)

In [7]:
# Strip URLs from every review (column overwritten), then preview the first rows.
df['review'] = df['review'].apply(remove_url)
df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [8]:
# Punctuation handling
import string, time

# Characters to strip: the ASCII punctuation set from the standard library.
exclude = string.punctuation
print(exclude)

# Translation table that deletes every character in `exclude`. Built once here
# rather than inside remove_punc(), which is applied to every review.
# NOTE: if `exclude` is reassigned later, rebuild this table as well.
_PUNC_TABLE = str.maketrans('', '', exclude)


def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Characters are deleted, not replaced, so "it's" becomes "its".
    """
    return text.translate(_PUNC_TABLE)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
#print(df['review'][3])
# Delete every character listed in `exclude` from each review (column overwritten).
df['review'] = df['review'].apply(remove_punc)
#df['review'][3]

In [10]:
# Social media chat words
# Maps an upper-case abbreviation to its expansion. Each key is listed exactly
# once: a repeated key in a dict literal silently overwrites the earlier entry.
chat_words = {
    'FYI': 'For Your Information',
    'ASAP': 'As Soon As Possible',
    'BRB': 'Be Right Back',
    'BTW': 'By The Way',
    'OMG': 'Oh My God',
    'IMO': 'In My Opinion',
    'LOL': 'Laugh Out Loud',
    'TTYL': 'Talk To You Later',
    'GTG': 'Got To Go',
    'TTYT': 'Talk To You Tomorrow',
    'IDK': "I Don't Know",
    'TMI': 'Too Much Information',
    'IMHO': 'In My Humble Opinion',
    'ICYMI': 'In Case You Missed It',
    'AFAIK': 'As Far As I Know',
    'FAQ': 'Frequently Asked Questions',
    'TGIF': "Thank God It's Friday",
    'FYA': 'For Your Action',
}

In [11]:
def chat_conversion(text):
    """Expand known chat abbreviations in ``text``.

    The text is split on whitespace. A token whose upper-cased form is a key of
    ``chat_words`` is replaced by the mapped phrase; any other token is kept
    unchanged. The tokens are then re-joined with single spaces.
    """
    expanded = [chat_words.get(token.upper(), token) for token in text.split()]
    return " ".join(expanded)

In [12]:
# Example
chat_conversion('Do this work ASAP')

'Do this work As Soon As Possible'

In [13]:
# Expand chat abbreviations in every review; matching upper-cases each token, so the lower-cased text still matches.
df['review'] = df['review'].apply(chat_conversion)

In [14]:
# Incorrect text handling
from textblob import TextBlob

def correct_words(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain string."""
    return TextBlob(text).correct().string

# example
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the same manner'
correct_words(incorrect_text)

'certain conditions during several generations are modified in the same manner'

In [15]:
# correct misspelled words
#df['review'] = df['review'].apply(correct_words)

In [16]:
from nltk.corpus import stopwords
import nltk

#nltk.download('stopwords')
len(stopwords.words('english'))

179

In [17]:
# Cache for the stop-word set so the corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text):
    """Blank out English stop words in ``text``.

    The text is split on whitespace and each token that exactly matches an
    NLTK English stop word (case-sensitive) is replaced by an empty string.
    Tokens are re-joined with single spaces, so a removed word leaves two
    consecutive spaces behind.
    """
    stop_set = _english_stopwords()
    # Set membership is O(1); the '' placeholder keeps the original spacing.
    return " ".join('' if word in stop_set else word for word in text.split())

# example
remove_stopwords("probably my all-time favourite movie, a story of selflessness, sacrifice and dedication")

'probably  all-time favourite movie,  story  selflessness, sacrifice  dedication'

In [18]:
# remove stop words
#df['review'] = df['review'].apply(remove_stopwords)

**Emoji removal**

In [19]:
import re
# Compiled once; every range below is limited to symbol/emoji blocks so that
# ordinary letters (CJK, Hangul, accented Latin, ...) are never matched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator letters)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U000024C2"             # circled latin capital letter M
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Only characters in the symbol blocks listed in ``_EMOJI_PATTERN`` are
    removed; text in other scripts is left intact. Nothing is inserted in
    place of a removed character.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [20]:
remove_emoji('I love you😘')

'I love you'

In [21]:
remove_emoji('LMAO😀. This is a joke😒')

'LMAO. This is a joke'

<h1>Emoji Handling</h1>

In [22]:
pip install emoji  #interpret emoji meanings

Note: you may need to restart the kernel to use updated packages.


In [23]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


<h1>TOKENIZATION</h1>

**1. Using split method**

In [25]:
# word tokenization
sentence1 = "I am going to be a billionaire"
# split() with no argument breaks the string on whitespace.
sentence1.split()

['I', 'am', 'going', 'to', 'be', 'a', 'billionaire']

In [27]:
# sentence tokenization
sentence2 = "I am going to be a billionaire. Change peoples lives. See you at the top"
# Splitting on '.' alone keeps the space that follows each period, so every
# piece after the first starts with a blank.
sentence2.split('.')

['I am going to be a billionaire',
 ' Change peoples lives',
 ' See you at the top']

**2. Using Regular Expression**

In [29]:
import re
sent3 = 'I am going to the moon!'
# Raw string: in a normal string literal "\w" is an invalid escape sequence and
# triggers a warning on recent Python versions. The pattern itself is unchanged:
# runs of word characters and apostrophes.
tokens = re.findall(r"[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'the', 'moon']

In [30]:
# Sample paragraph; note that its sentences are separated by newlines, not spaces.
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
# NOTE(review): the pattern requires a literal space after '.', '!' or '?'.
# Here the '?' is followed by a newline and the final '.' by nothing, so no
# split happens and the result is a one-element list. A pattern such as
# r'[.!?]\s+' would split on any whitespace after the terminator.
sentences = re.compile('[.!?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry?\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

**3. Using NLTK**

In [32]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True