In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
df = pd.read_csv('C:/Users/ADMIN/Desktop/python jupyter/PROJECTS/IMDB Movie Reviews/IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# LOWERCASING

In [4]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [5]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [6]:
df['review'] = df['review'].str.lower()

In [7]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


# REMOVING HTML TAGS

In [8]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [9]:
text = "<p>The <strong>quick</strong> brown <em>fox</em> jumps over the <a href='https://lazy.dog'>lazy dog</a>.</p>"

In [10]:
remove_html_tags(text)

'The quick brown fox jumps over the lazy dog.'

In [11]:
df['review'] = df['review'].apply(remove_html_tags)

In [12]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

# REMOVING URLS

In [13]:
text1 = "For the latest news updates, visit https://www.bbc.com."
text2 = "Check out the new features of the latest iPhone on https://www.apple.com."
text3 = "You can find delicious recipes at https://www.allrecipes.com."
text4 = "To learn more about space exploration, go to https://www.nasa.gov."

In [14]:
def remove_urls(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

In [15]:
remove_urls(text1)

'For the latest news updates, visit '

In [16]:
remove_urls(text2)

'Check out the new features of the latest iPhone on '

# REMOVING PUNCTUATIONS

In [17]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
exclude = string.punctuation

In [19]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [20]:
text = 'String.  with. puncatuation?'

In [21]:
remove_punc(text)

'String  with puncatuation'

In [22]:
# this is comparatively faster
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [23]:
remove_punc1(text)

'String  with puncatuation'

In [24]:
df1 = pd.read_csv('C:/Users/ADMIN/Desktop/python jupyter/PROJECTS/hate speech detection/tw.csv')

In [25]:
df1.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [26]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [27]:
df1['tweet'].apply(remove_punc1)

0         user when a father is dysfunctional and is so...
1        user user thanks for lyft credit i cant use ca...
2                                      bihday your majesty
3        model   i love u take with u all the time in u...
4                     factsguide society now    motivation
                               ...                        
31957    ate user isz that youuuðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    user sikh temple vandalised in in calgary wso ...
31961                      thank you user for you follow  
Name: tweet, Length: 31962, dtype: object

# CHATWORD TREATMENT

In [28]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [29]:
lines = chat_words_str.strip().split('\n')
chat_words = {line.split('=')[0]: line.split('=')[1] for line in lines}
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [30]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [31]:
chat_conversion('IMHO is the best')

'In My Honest/Humble Opinion is the best'

In [32]:
chat_conversion('FYI is the capital of delhi')

'For Your Information is the capital of delhi'

# SPELLING CORRECTION

In [33]:
from textblob import TextBlob

In [34]:
incorrect = "I red the book yesterdy and it was amazzing!"
tb = TextBlob(incorrect)
tb.correct().string

'I red the book yesterday and it was amazing!'

# REMOVING STOPWORDS

In [35]:
from nltk.corpus import stopwords

In [36]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [37]:
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [38]:
remove_stopwords('The quick brown fox jumps over the lazy dog, demonstrating agility and laziness in stark contrast')

'The quick brown fox jumps   lazy dog, demonstrating agility  laziness  stark contrast'

In [39]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [40]:
# df['review'].apply(remove_stopwords)

# HANDLING EMOJIS

In [41]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [42]:
remove_emoji("Hello! 😀 How are you? 🌟")

'Hello!  How are you? '

In [44]:
import emoji
print(emoji.demojize('Python is 🌟'))

Python is :glowing_star:


# Tokeniztion

## 1.split function

In [45]:
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [47]:
sent2 = 'I am going to delhi. I will stay there for 3 days'
sent2.split('.')

['I am going to delhi', ' I will stay there for 3 days']

In [48]:
sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

## 2. regex

In [49]:
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+",sent3)
tokens

  tokens = re.findall("[\w']+",sent3)


['I', 'am', 'going', 'to', 'delhi']

## 3. NLTK

In [50]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [52]:
sent3 = 'I am going to visit delhi!'
word_tokenize(sent3)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [53]:
text = '''After a long day at work, I decided to unwind with some music; however, the loud construction nearby made it nearly impossible to relax. Frustrated, I took refuge in a nearby café, where the aroma of freshly brewed coffee and the soft jazz playing in the background finally calmed my senses. 
As I sipped my latte, I reflected on the day's challenges and looked forward to a peaceful evening ahead!'''

In [54]:
sent_tokenize(text)

['After a long day at work, I decided to unwind with some music; however, the loud construction nearby made it nearly impossible to relax.',
 'Frustrated, I took refuge in a nearby café, where the aroma of freshly brewed coffee and the soft jazz playing in the background finally calmed my senses.',
 "As I sipped my latte, I reflected on the day's challenges and looked forward to a peaceful evening ahead!"]

In [56]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help ! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost us $10.50'

In [58]:
print(word_tokenize(sent5))
print(word_tokenize(sent6))
print(word_tokenize(sent7))

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']
['We', "'re", 'here', 'to', 'help', '!', 'mail', 'us', 'at', 'nks', '@', 'gmail.com']
['A', '5km', 'ride', 'cost', 'us', '$', '10.50']


## 4. spacy

In [55]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [59]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [63]:
for token in doc3:
    print(token)

A
5
km
ride
cost
us
$
10.50


# STEMMING : reducing the word to its root word

In [64]:
from nltk.stem.porter import PorterStemmer

In [65]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [66]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [67]:
text = '''After a long day at work, I decided to unwind with some music; however, the loud construction nearby made it nearly impossible to relax. Frustrated, I took refuge in a nearby café, where the aroma of freshly brewed coffee and the soft jazz playing in the background finally calmed my senses. 
As I sipped my latte, I reflected on the day's challenges and looked forward to a peaceful evening ahead!'''

In [68]:
stem_words(text)

"after a long day at work, i decid to unwind with some music; however, the loud construct nearbi made it nearli imposs to relax. frustrated, i took refug in a nearbi café, where the aroma of freshli brew coffe and the soft jazz play in the background final calm my senses. as i sip my latte, i reflect on the day' challeng and look forward to a peac even ahead!"

# lemmatization : it ensures that the root word belongs to the language

In [71]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

sentence = 'After a long day at work, I decided to unwind with some music; however, the loud construction nearby made it nearly impossible to relax.'
punctuations = "?:!.,;"
words = word_tokenize(sentence)
for word in words:
    if word in punctuations:
        words.remove(word)
words

['After',
 'a',
 'long',
 'day',
 'at',
 'work',
 'I',
 'decided',
 'to',
 'unwind',
 'with',
 'some',
 'music',
 'however',
 'the',
 'loud',
 'construction',
 'nearby',
 'made',
 'it',
 'nearly',
 'impossible',
 'to',
 'relax']

In [73]:
print("{0:20}{1:20}".format("Word","Lemma"))
for word in words:
    print("{0:20}{1:20}".format(word,wnl.lemmatize(word,pos='v')))

Word                Lemma               
After               After               
a                   a                   
long                long                
day                 day                 
at                  at                  
work                work                
I                   I                   
decided             decide              
to                  to                  
unwind              unwind              
with                with                
some                some                
music               music               
however             however             
the                 the                 
loud                loud                
construction        construction        
nearby              nearby              
made                make                
it                  it                  
nearly              nearly              
impossible          impossible          
to                  to                  
relax           