In [30]:
import pandas as pd
df = pd.read_csv('IMDB Dataset.csv')
df.shape

(50000, 2)

In [31]:
df = df.head(1000)
df.shape

(1000, 2)

In [32]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Lower case

In [33]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [34]:
df['review'] = df['review'].str.lower()
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
995,nothing is sacred. just ask ernie fosselius. t...,positive
996,i hated it. i hate self-aware pretentious inan...,negative
997,i usually try to be professional and construct...,negative
998,if you like me is going to see this in a film ...,negative


# Remove HTML Tags

In [35]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [36]:
text = "<html><body><p> Movie 1</p><p> Actor </p><p> Click here to see <a href='http://google.com>"

In [37]:
remove_html_tags(text)

' Movie 1 Actor  Click here to see '

In [38]:
df['review'] = df['review'].apply(remove_html_tags)

In [39]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

# Remove URL

In [40]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [41]:
text1 = 'Check out Youtube https://www.youtube.com'
text2 = 'Check out Linkedin https://www.linkedin.com'
text3 = 'Check out GitHub https://www.github.com'

In [42]:
remove_url(text1)

'Check out Youtube '

# Remove Punctuation

In [43]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
exclude = string.punctuation

In [45]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [46]:
text = 'string. With. Punctuation?'

In [47]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1*50000)

string With Punctuation
0.0


In [48]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [49]:
start = time.time()
remove_punc1(text)
time2 = time.time() - start
print(time2*50000)

49.948692321777344


In [50]:
time1/time2

0.0

In [51]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [52]:
remove_punc1(df['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

# Chat Conversion

In [53]:
chat_words = {
    'AFAIK' : 'As Far As I Know',
    'AFK' : 'Away From Keyboard',
    'ASAP' : 'As Soon As Possible'
}

In [54]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [55]:
chat_conversion('Do this work ASAP')

'Do this work As Soon As Possible'

# Incorrect Text Handling

In [56]:
from textblob import TextBlob

In [57]:
incorrect_text = 'cerrtain conditionas duringg sevral geneerations aree moodified in the samme maner'
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'certain conditions during several generations are modified in the same manner'

# Stop Words

In [58]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [60]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)

    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [61]:
remove_stopwords('Stop words are words that are very common, appear in almost every document, and have no discrimination value')

'Stop words  words    common, appear  almost every document,    discrimination value'

In [62]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [64]:
df['review'].apply(remove_stopwords)

0      one    reviewers  mentioned   watching  1 oz e...
1       wonderful little production.  filming techniq...
2       thought    wonderful way  spend time    hot s...
3      basically there's  family   little boy (jake) ...
4      petter mattei's "love   time  money"   visuall...
                             ...                        
995    nothing  sacred.  ask ernie fosselius.  days, ...
996     hated it.  hate self-aware pretentious inanit...
997     usually try   professional  constructive   cr...
998      like   going  see    film history class  som...
999      like  zoology textbook, given   depiction  a...
Name: review, Length: 1000, dtype: object

# Remove Emoji Handle

In [65]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F" # emoticons
        u"\U0001F300-\U0001F5FF" # symbols & pictographs
        u"\U0001F680-\U0001F6FF" # transport & map symbols
        u"\U0001F1E0-\U0001F1FF" # flags (ios)
        u"\U00002702-\U000027B0" 
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )

    return emoji_pattern.sub(r'', text)

In [66]:
remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [67]:
remove_emoji("Lmao 😂😂😁")

'Lmao '

In [69]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


In [76]:
print(emoji.demojize('Loved the movie. It was 😍'))

Loved the movie. It was :smiling_face_with_heart-eyes:


# Tokenization

using Split function

In [77]:
# word tokenization
sent1 = 'I am going to Turkey'
sent1.split()

['I', 'am', 'going', 'to', 'Turkey']

In [78]:
# sentence tokenization
sent2 = 'I am going to Turkey. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to Turkey',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [79]:
# problems with split function
sent3 = 'I am going to Turkey!'
sent3.split()

['I', 'am', 'going', 'to', 'Turkey!']

In [80]:
sent4 = 'Where do think I should go? I have 3 day holiday'
sent4.split('.')

['Where do think I should go? I have 3 day holiday']

Regular Expression

In [81]:
import re
sent3 = 'I am going to Turkey!'
tokens = re.findall("[\w]+", sent3)
tokens

  tokens = re.findall("[\w]+", sent3)


['I', 'am', 'going', 'to', 'Turkey']

In [82]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
 Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
 when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing and typesetting industry?\n Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \n when an unknown printer took a galley of type and scrambled it to make a type specimen book."]

NLTK

In [83]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [95]:
sent1 = 'I am going to Turkey!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'Turkey', '!']

In [98]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
 Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
 when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \n when an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [96]:
sent5 = 'I have a M.Sc in CS'
sent6 = "We're here to help! mail us abc@gmail.com"
sent7 = 'A 5km ride cost $10.50'

word_tokenize(sent5)

['I', 'have', 'a', 'M.Sc', 'in', 'CS']

In [97]:
word_tokenize(sent6)

['We', "'re", 'here', 'to', 'help', '!', 'mail', 'us', 'abc', '@', 'gmail.com']

Spacy (good)

In [91]:
import spacy
from spacy.cli import download
download('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [99]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [100]:
doc4 = nlp(sent1)
doc4

I am going to Turkey!

In [101]:
for token in doc4:
    print(token)

I
am
going
to
Turkey
!


# Stemmer

In [102]:
from nltk.stem.porter import PorterStemmer

In [103]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [104]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [105]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
print(text)

probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie


In [106]:
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

# Lemmatization

In [107]:
import nltk
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
