# 5000 Movies Dataset Preprocessing

In [83]:
import pandas as pd 
import numpy as np

In [84]:
df=pd.read_csv("tmdb_5000_movies.csv ")

In [85]:
df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
1815,25000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",,8681,"[{""id"": 90, ""name"": ""paris""}, {""id"": 1930, ""na...",en,Taken,"While vacationing with a friend in Paris, an A...",80.879032,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""FR"", ""name"": ""France""}, {""iso...",2008-02-18,226830568,93.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,They took his daughter. He'll take their lives.,Taken,7.2,4369


In [86]:
df=df[['title','overview']]

In [87]:
df.head()

Unnamed: 0,title,overview
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


In [88]:
df.shape

(4803, 2)

In [89]:
df.overview[3]

"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy."

In [90]:
df.isnull().sum()

title       0
overview    3
dtype: int64

In [91]:
df.duplicated().sum()

0

## Removing null Values

In [92]:
df.dropna(inplace=True)

In [93]:
df.isnull().sum()

title       0
overview    0
dtype: int64

## Preprocessing On Overview Column

In [94]:
df.head()

Unnamed: 0,title,overview
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


## Removing Chat Words

In [95]:
chatwords = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "IDC": "I don’t care",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

print(chatwords)


{'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible', 'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace', 'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon', 'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back', 'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before', 'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You', 'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed', 'FWIW': "For What It's Worth", 'FYI': 'For Your Information', 'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night', 'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius', 'IC': 'I See', 'ICQ': 'I Seek you (also a chat program)', 'ILU': 'ILU: I Love You', 'IMHO': 'In My Honest/Humble Opinion', 'IMO': 'In My Opinion', 'IOW': 'In Other Words', 'IRL': 'In Real Life', 'KISS': 'Keep It Simple, Stupid', 'LDR': 'Long Distance Relationship', 'LMAO': 'Laughing my a** off', 'LOL': 'L

In [96]:
def chat_conversation(text):
    new_text=[]
    for word in text.split():
        if word.upper() in chatwords:
            new_text.append(chatwords[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)        

In [97]:
chat_conversation('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [98]:
df['overview']=df['overview'].apply(chat_conversation)

In [99]:
df['overview'][3]

"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy."

## Removing Stopwords

In [100]:
import nltk
from nltk.corpus import stopwords


In [101]:
stopwords=stopwords.words('english')

In [102]:
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [103]:
lst=[]
def remove_stopwords(text):
    filtered=[word for word in text.split() if word not in stopwords]
    return " ".join(filtered)

In [104]:
df['overview']=df['overview'].apply(remove_stopwords)

In [105]:
df.overview[3]

"Following death District Attorney Harvey Dent, Batman assumes responsibility Dent's crimes protect late attorney's reputation subsequently hunted Gotham City Police Department. Eight years later, Batman encounters mysterious Selina Kyle villainous Bane, new terrorist leader overwhelms Gotham's finest. The Dark Knight resurfaces protect city branded enemy."

## Spelling Correction

In [106]:
from textblob import TextBlob


In [107]:
def spelling_correction(text):
    blb=TextBlob(text)
    return blb.correct().string

In [108]:
spelling_correction("He is really certane amount of time")

'He is really certain amount of time'

In [None]:
df['overview']=df['overview'].apply(spelling_correction)

In [None]:
df.overview[3]

## Remove Punctuations

In [468]:
import string
from string import punctuation

In [469]:
def remove_punctuation(text):
    for char in text:
        if char in punctuation:
            text=text.replace(char,"")
    return text      

In [470]:
df['overview']=df['overview'].apply(remove_punctuation)

In [471]:
df.overview[3]

'following death district attorney harvey dent batman assumes responsibility dents crimes protect late attorneys reputation subsequently hunted gotham city police department eight years later batman encounters mysterious selina kyle villainous bane new terrorist leader overwhelms gothams finest dark knight resurfaces protect city branded enemy'

## Convert the Text to Lower

In [31]:
df['overview']=df['overview'].str.lower()

## Tokenization

In [472]:
from nltk.tokenize import word_tokenize

In [473]:
def tokenization(text):
    word_token=word_tokenize(text)
    return word_token

In [474]:
df['overview']=df['overview'].apply(tokenization)

In [475]:
df.overview[3]

['following',
 'death',
 'district',
 'attorney',
 'harvey',
 'dent',
 'batman',
 'assumes',
 'responsibility',
 'dents',
 'crimes',
 'protect',
 'late',
 'attorneys',
 'reputation',
 'subsequently',
 'hunted',
 'gotham',
 'city',
 'police',
 'department',
 'eight',
 'years',
 'later',
 'batman',
 'encounters',
 'mysterious',
 'selina',
 'kyle',
 'villainous',
 'bane',
 'new',
 'terrorist',
 'leader',
 'overwhelms',
 'gothams',
 'finest',
 'dark',
 'knight',
 'resurfaces',
 'protect',
 'city',
 'branded',
 'enemy']

## Lemmatization

In [476]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [477]:
def lemmatize_text(text):
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in text]
    return lemmatized_tokens

In [480]:
df['overview'] = df['overview'].apply(lemmatize_text)

In [481]:
df['overview'][3]

['following',
 'death',
 'district',
 'attorney',
 'harvey',
 'dent',
 'batman',
 'assumes',
 'responsibility',
 'dent',
 'crime',
 'protect',
 'late',
 'attorney',
 'reputation',
 'subsequently',
 'hunted',
 'gotham',
 'city',
 'police',
 'department',
 'eight',
 'year',
 'later',
 'batman',
 'encounter',
 'mysterious',
 'selina',
 'kyle',
 'villainous',
 'bane',
 'new',
 'terrorist',
 'leader',
 'overwhelms',
 'gothams',
 'finest',
 'dark',
 'knight',
 'resurfaces',
 'protect',
 'city',
 'branded',
 'enemy']