# Text Preprocessing 


In [1]:
import pandas as pd

### Lower Casing 

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [18]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [4]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [5]:
df['review']=df['review'].str.lower()

## Removing Unimportant Things 

i.) Remove HTML Tags

In [22]:
text = 'a wonderful little production. <br /><br />the...'

In [26]:
import re 
def remove_htm_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    sitting between two tags is kept. ``.`` does not match a newline here,
    so a ``<`` whose closing ``>`` is on a later line is left untouched.
    """
    return re.sub('<.*?>', '', text)


In [25]:
remove_htm_tags(text)

'a wonderful little production. the...'

In [27]:
df['review'].apply(remove_htm_tags)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

## Removing URLs 

In [28]:
def remove_url(text):
    """Return ``text`` with URLs deleted.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. Whitespace around the URL
    is left as it was.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)


In [29]:
text1 = "checkout the link : https://www.linkedin.com/in/ankkkyyyy/"
text2 = "my github link :  https://github.com/Ankkkyyyy"
text3 = "my all links :  https://ankkkyyyy.github.io/mylinks/ "

In [30]:
remove_url(text1)

'checkout the link : '

In [31]:
remove_url(text2)

'my github link :  '

In [32]:
remove_url(text3)

'my all links :   '

## Remove Punctuations

In [34]:
import  string,time
string.punctuation 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [35]:
exclude = string.punctuation

In [36]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Baseline implementation: it walks the module-level ``exclude`` string and
    issues one ``str.replace`` per character that occurs in the text.
    """
    cleaned = text
    for symbol in exclude:
        # Skipping absent symbols does not change the result; replace() would
        # return the same string anyway.
        if symbol in cleaned:
            cleaned = cleaned.replace(symbol, '')
    return cleaned

In [43]:
textP = "Hello. Howz uh ?"

In [39]:
print(remove_punctuation(textP))

Hello Howz uh 


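The user-defined function we wrote above is slow, because it makes a separate
pass over the text for every punctuation character. So we will use a faster approach: `str.translate`.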

In [41]:
def remove_punc1(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Builds a translation table whose third argument marks the characters of
    the module-level ``exclude`` string for deletion, then applies it with
    ``str.translate``.
    """
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

In [44]:
# Exercise the translate-based remove_punc1 here; remove_punctuation was
# already demonstrated on the same input in an earlier cell.
print(remove_punc1(textP))

Hello Howz uh 


### Removing Stop Words 

In [57]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# The corpus file is named 'english' (lowercase). A capitalised name only
# resolves on case-insensitive filesystems, so use the exact file id.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [66]:
def remove_stopwords(text, stop_words=None):
    """Blank out stop words in ``text`` and return the re-joined string.

    The text is split on whitespace. Each token that is a stop word is
    replaced by an empty string and the tokens are joined back with single
    spaces, so a removed word leaves an extra space behind, as in
    ``'uh i am good'`` -> ``'uh   good'``. Matching is exact and
    case-sensitive.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : iterable of str, optional
        Words to blank out. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        ``text`` with stop words replaced by empty slots.
    """
    if stop_words is None:
        # Load the list once per call. Loading it inside the loop re-reads
        # the corpus for every single word of the input.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a list scan.
    stop_set = frozenset(stop_words)

    kept = ['' if word in stop_set else word for word in text.split()]
    return " ".join(kept)


In [67]:
remove_stopwords("Hey howz uh i am good & proficient & handsome")

'Hey howz uh   good & proficient & handsome'

### Applying to the dataset 

In [68]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [77]:
test2=df.head()

In [80]:
test2['review'].apply(remove_stopwords)

0    one    reviewers  mentioned   watching  1 oz e...
1     wonderful little production. <br /><br />the ...
2     thought    wonderful way  spend time    hot s...
3    basically there's  family   little boy (jake) ...
4    petter mattei's "love   time  money"   visuall...
Name: review, dtype: object

## Handling Emojis 

1.) Removing Emoji 

In [107]:
import re

# Compiled once when the cell runs instead of on every call.
# Each range is an explicit emoji/symbol block. A single catch-all span such
# as U+24C2..U+1F251 must not be used: it also covers CJK, Hangul, kana and
# other ordinary text, which would then be deleted.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F1E6-\U0001F1FF"  # regional indicators A..Z (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Replace each run of emoji characters in ``text`` with one space.

    A run of adjacent emoji (including the variation selector that follows
    many of them) collapses to a single space. Whitespace already in the text
    is kept. Text in non-Latin scripts is left untouched.

    The character class covers the common emoji blocks. It is not an
    exhaustive implementation of the Unicode emoji specification.
    """
    return _EMOJI_PATTERN.sub(' ', text)


In [108]:
remove_emoji('i love you  ❤️❤️❤️')

'i love you   '

In [109]:
remove_emoji('Ankit 🔥 is Genius ⚡')

'Ankit   is Genius  '

2.) Replacing Emoji

In [119]:
import emoji 
print(emoji.demojize('Ankit🔥'))

Ankit:fire:
