In [640]:
import pandas as pd
import re
import string
import time
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer
import json
import requests

In [476]:
df = pd.read_csv('imdbDataset.csv')

In [477]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
df['review'] = df['review'].str.lower()

In [26]:
df.review[3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## HTML REMOVAL

In [642]:
def removeHtml(text:str) -> str:
    text = re.sub('<.*?/>', '', text)
    return text

In [46]:
removeHtml(df.review[3])

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [49]:
df['review'] = df['review'].apply(removeHtml)

In [50]:
df.review

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [155]:
text = "for notebook click www://localhost:8888/notebooks/textPreprocessing.ipynb. we do not condemn anyone from this place. Therefore we do not account for them. "



## URL REMOVAL

In [643]:
def removeUrl(text):
    pattern = re.compile(r'https\S+|www\S+|http\S+')
    text =  pattern.sub('', text)
    return text

In [159]:
removeUrl(text)

'for notebook click  we do not condemn anyone from this place. Therefore we do not account for them. '

## PUNCTUATION REMOVAL

In [209]:
punctuation = """|@#$%^&*!(){}],|:><';["""
def removePunctuation(text):
    for char in punctuation:
        text = text.replace(char, '')
    return text
start = time.time()
removePunctuation('hi, there whas up! home[[];')
print(time.time() - start)

0.00016188621520996094


In [644]:
def remove_punc(text):
    return text.translate(str.maketrans('','', string.punctuation))

start = time.time()
print(remove_punc('hi, their whas up! home[[];'))
print(time.time() - start)

hi their whas up home
0.0009667873382568359


In [252]:
shorthand = {
    "asap": "as soon as possible",
    "btw": "by the way",
    "brb": "be right back",
    "idk": "I don't know",
    "ttyl": "talk to you later",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "lmao": "laughing my ass off",
    "bff": "best friends forever",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ttfn": "ta-ta for now",
    "rofl": "rolling on the floor laughing",
    "tbh": "to be honest",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "ppl": "people",
    "gm": "good morning",
    "gn": "good night",
    "gtg": "got to go",
    "yolo": "you only live once",
    "fomo": "fear of missing out",
    "nvm": "never mind",
    "omw": "on my way",
    "l8r": "later",
    "wyd": "what you doing",
    "b4": "before",
    "cu": "see you",
    "cya": "see you",
    "gr8": "great",
    "plz": "please",
    "thx": "thanks",
    "yw": "you're welcome",
    "g2g": "got to go",
    "xoxo": "hugs and kisses",
    "wbu": "what about you",
    "2moro": "tomorrow",
    "k": "okay",
    "glhf": "good luck, have fun",
    "bbl": "be back later",
    "btwn": "between",
    "tmi": "too much information",
    "ikr": "I know right",
    "np": "no problem",
    "m8": "mate",
    "cmon": "come on",
    "n1": "nice one",
    "pplz": "people",
    "pic": "picture",
    "pls": "please"
}


In [637]:
text = '''Hey, I just got home! It was such a busy day, and I couldn't stop working. I'm so tired, but tbh, it was a productive day. LOL. I had a meeting with my team, and we talked about our project progress. SMH at how much we still have to do, though! Anyway, I had to rush to meet some friends, but I had to leave early because I had to gtg. Btw, have you heard about that new game? I heard it's gr8! I think I’ll try it out tomorrow. FYI, I’m planning to catch up on some sleep, but I might go out later. TTYL! Hope you have a good night, and don't forget to text me if you need anything. BFF, remember?'''



## SHORTHAND REMOVAL

In [645]:
def chat_conversion(text):
    li = []
    text = remove_punc(text).lower()
    for word in text.split():
        if word in shorthand:
            li.append(shorthand[word])
        else:
            li.append(word)
    return li

In [639]:
chat_conversion(text)

['hey',
 'i',
 'just',
 'got',
 'home',
 'it',
 'was',
 'such',
 'a',
 'busy',
 'day',
 'and',
 'i',
 'couldnt',
 'stop',
 'working',
 'im',
 'so',
 'tired',
 'but',
 'to be honest',
 'it',
 'was',
 'a',
 'productive',
 'day',
 'laugh out loud',
 'i',
 'had',
 'a',
 'meeting',
 'with',
 'my',
 'team',
 'and',
 'we',
 'talked',
 'about',
 'our',
 'project',
 'progress',
 'shaking my head',
 'at',
 'how',
 'much',
 'we',
 'still',
 'have',
 'to',
 'do',
 'though',
 'anyway',
 'i',
 'had',
 'to',
 'rush',
 'to',
 'meet',
 'some',
 'friends',
 'but',
 'i',
 'had',
 'to',
 'leave',
 'early',
 'because',
 'i',
 'had',
 'to',
 'got to go',
 'by the way',
 'have',
 'you',
 'heard',
 'about',
 'that',
 'new',
 'game',
 'i',
 'heard',
 'its',
 'great',
 'i',
 'think',
 'i’ll',
 'try',
 'it',
 'out',
 'tomorrow',
 'for your information',
 'i’m',
 'planning',
 'to',
 'catch',
 'up',
 'on',
 'some',
 'sleep',
 'but',
 'i',
 'might',
 'go',
 'out',
 'later',
 'talk to you later',
 'hope',
 'you',
 '

In [259]:
incorrect_text = "I coudn't beleive how much fun we had at the park yesturday!"



In [268]:
textblob = TextBlob(incorrect_text)

textblob.correct().string

"I couldn't believe how much fun we had at the park yesterday!"

## STOPWORDS REMOVAL

In [646]:
def stopwords_removal(text):
    li = []
    stops = stopwords.words('english')
    for word in text.split():
        if word in stops:
            pass
        else:
            li.append(word)
    return ' '.join(li)
        

In [306]:
stopwords_removal(df['review'][3])

"basically there's family little boy (jake) thinks there's zombie closet & parents fighting time.this movie slower soap opera... suddenly, jake decides become rambo kill zombie.ok, first going make film must decide thriller drama! drama movie watchable. parents divorcing & arguing like real life. jake closet totally ruins film! expected see boogeyman similar movie, instead watched drama meaningless thriller spots.3 10 well playing parents & descent dialogs. shots jake: ignore them."

## EMOJI REMOVAL

In [647]:
sentence = "I’m feeling happy 😀😊, excited 🎉, and ready for an adventure 🌍. The weather is sunny 🌞, so I’m going to the beach 🏖️ with my friends 👯‍♀️. We’ll have some pizza 🍕, ice cream 🍦, and drinks 🍹. Afterward, I plan to relax 🛋️ and watch a movie 🎬."


import re

def remove_emoji(text):
    emoji_pattern = re.compile('''[u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF"u"\U0001F1E0-\U0001F1FF"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251]+''', flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [385]:
remove_emoji(sentence)

'I’m feeling happy , excited , and ready for an adventre . The weather is snny , so I’m going to the beach  with my friends \u200d. We’ll have some pizza , ice cream , and drinks . Afterward, I plan to relax  and watch a movie .'

In [388]:
import emoji

In [390]:
print(emoji.demojize("I’m feeling happy 😊"))

I’m feeling happy :smiling_face_with_smiling_eyes:


## Tokenization

In [419]:
sent = 'I am going to delhi!'


In [420]:
sent.split()

['I', 'am', 'going', 'to', 'delhi!']

In [421]:
sent.split('.')

['I am going to delhi!']

In [422]:
re.findall(r"[\w]+", sent)

['I', 'am', 'going', 'to', 'delhi']

In [423]:
word_tokenize(sent)

['I', 'am', 'going', 'to', 'delhi', '!']

In [424]:
nlp = spacy.load('en_core_web_sm')

In [425]:
doc = nlp(sent)

In [426]:
list(doc)

[I, am, going, to, delhi, !]

## Stemming

In [429]:
sent = 'walk taught tell walked washed taught naught fought'

In [648]:
ps = PorterStemmer()

words = [ps.stem(word) for word in sent.split()]

words

['walk', 'taught', 'tell', 'walk', 'wash', 'taught', 'naught', 'fought']

In [433]:
lemmatizer = WordNetLemmatizer()


words = [lemmatizer.lemmatize(word, pos='v')for word in sent.split()]

words

['walk', 'teach', 'tell', 'walk', 'wash', 'teach', 'naught', 'fight']

In [720]:
d = {'title':[],'description':[], 'genre':[]}

In [703]:
genre_url = f"https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"

genre = requests.get(genre_url).json()['genres']

genre_data = {}
for genre_dic in genre:
    genre_data[genre_dic['id']] = genre_dic['name']

In [708]:
genre_data

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [721]:
for index in range(1, 496):
    movie_url = f"https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={index}"    
    movies = requests.get(movie_url).json()['results']
    for movie in movies:
        d['title'].append(movie['original_title'])
        d['description'].append(movie['overview'])
        genres = movie['genre_ids']
        names = [genre_data.get(genre) for genre in genres]
        d['genre'].append(names)

In [744]:
df = pd.DataFrame(d)

df

Unnamed: 0,title,description,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[Drama, Crime]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]"
4,12 Angry Men,The defense and the prosecution have rested an...,[Drama]
...,...,...,...
9886,Alone in the Dark,Edward Carnby is a private investigator specia...,"[Action, Fantasy, Horror]"
9887,Disaster Movie,"Over the course of one evening, an unsuspectin...",[Comedy]
9888,House of the Dead,"Set on an island off the coast, a techno rave ...","[Horror, Action, Thriller]"
9889,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...","[Action, Adventure, Fantasy, Science Fiction, ..."


In [746]:

df.genre = df['genre'].apply(str).replace(r"['[\]]", '', regex=True)

In [747]:
df

Unnamed: 0,title,description,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama
...,...,...,...
9886,Alone in the Dark,Edward Carnby is a private investigator specia...,"Action, Fantasy, Horror"
9887,Disaster Movie,"Over the course of one evening, an unsuspectin...",Comedy
9888,House of the Dead,"Set on an island off the coast, a techno rave ...","Horror, Action, Thriller"
9889,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...","Action, Adventure, Fantasy, Science Fiction, T..."


In [748]:
df.description = df.description.str.lower()

corpus = ''

for i in df.description:
    corpus = corpus + i 

sentences = sent_tokenize(corpus)

In [758]:
corpus = []

In [759]:
for sentence in sentences:
    htmlRemoved = removeHtml(sentence)
    urlRemoved = removeUrl(htmlRemoved)
    puncRemoved = remove_punc(urlRemoved)
    corrected = TextBlob(puncRemoved)
    stopword_removed = stopwords_removal(corrected)
    emoji_removed = remove_emoji(stopword_removed)
    tokens = word_tokenize(emoji_removed)
    stems = [lemmatizer.lemmatize(token) for token in tokens]
    corpus.append(' '.join(stems))
    

In [760]:
corpus

['imprisoned 1940s doble mrder wife lover pstanding banker andy dfresne begin new life shawshank prison pt acconting skill work amoral warden',
 'long stretch prison dfresne come admired inmate inclding older prisoner named red integrity nqenchable sense hopespanning year 1945 1955 chronicle fictional italianamerican corleone crime family',
 'organized crime family patriarch vito corleone barely srvives attempt life yongest son michael step take care woldbe killer lanching campaign bloody revengein contining saga corleone crime family yong vito corleone grows sicily 1910s new york',
 '1950s michael corleone attempt expand family bsiness la vega hollywood cbathe tre story bsinessman oskar schindler saved thosand jewish life nazi worked slave factory world war iithe defense prosection rested jry filing jry room decide yong spanishamerican gilty innocent mrdering father',
 'begin open sht case soon becomes minidrama jrors prejdices preconception trial accsed othera yong girl chihiro becom