In [124]:
import re
import string
import random
import requests as req
import pandas as pd
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on (a no-op when they are already up to date).
nltk.download('stopwords')  # stop word lists read through nltk.corpus.stopwords
nltk.download('punkt_tab') # download the punkt tokenization model that will be used by word_tokenize

[nltk_data] Downloading package stopwords to /home/Leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/Leo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [57]:
# fetch movie titles, overviews and genre ids, page by page, from the TMDB "top rated" endpoint

movies_df = pd.DataFrame()  # placeholder only; nothing in this cell fills it

# NOTE(review): the API key is hardcoded in the URL below. Consider reading it from an
# environment variable before this notebook is shared or committed.
url = "https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page="
total_pages = 100
request_timeout = 10  # seconds; without a timeout a stalled connection would block the cell forever
movies_data = []
failed_pages = []  # (page, reason) pairs, reported below instead of being dropped silently

for p in range(1, total_pages+1):
    final_url = url + str(p)
    try:
        res = req.get(final_url, timeout=request_timeout)
    except req.RequestException as err:
        # Record only the exception type: its message can contain the full URL (and the key).
        failed_pages.append((p, type(err).__name__))
        continue
    if res.status_code == 200:
        data = res.json()
        movies_data.extend(data.get('results', []))
    else:
        failed_pages.append((p, f"HTTP {res.status_code}"))

if failed_pages:
    print(f"warning: {len(failed_pages)} of {total_pages} pages could not be fetched: {failed_pages}")
print('complete!')

complete!


In [58]:
# fetch the genre id -> genre name mapping from the TMDB genre list endpoint

genres_dict = {}

# NOTE(review): the API key is hardcoded in the URL below. Consider reading it from an
# environment variable before this notebook is shared or committed.
url = "https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"

try:
    # timeout (seconds) so a stalled connection cannot hang the cell
    res = req.get(url, timeout=10)
except req.RequestException as err:
    # Report only the exception type: its message can contain the full URL (and the key).
    print(f"warning: genre request failed ({type(err).__name__}); genres_dict is left empty")
else:
    if res.status_code == 200:
        data = res.json()
        genres_dict = {genre['id']: genre['name'] for genre in data['genres']}
        print(genres_dict)
    else:
        # Make the failure visible; an empty genres_dict breaks the later id -> name lookup.
        print(f"warning: genre request returned HTTP {res.status_code}; genres_dict is left empty")

print('complete!')

{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}
complete!


In [72]:
# turn the downloaded json records into a pandas DataFrame, keep only the
# columns used below, and show the first 5 rows

wanted_columns = ['title', 'genre_ids', 'overview']
movies_df = pd.DataFrame(movies_data).loc[:, wanted_columns]
movies_df.head()

Unnamed: 0,title,genre_ids,overview
0,The Shawshank Redemption,"[18, 80]",Imprisoned in the 1940s for the double murder ...
1,The Godfather,"[18, 80]","Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,"[18, 80]",In the continuing saga of the Corleone crime f...
3,Schindler's List,"[18, 36, 10752]",The true story of how businessman Oskar Schind...
4,12 Angry Men,[18],The defense and the prosecution have rested an...


In [73]:
# build a 'genres' column: every id in 'genre_ids' is looked up in genres_dict and the
# resulting names are joined with ', '; the id column is dropped afterwards

movies_df['genres'] = movies_df['genre_ids'].map(
    lambda ids: ', '.join(map(genres_dict.__getitem__, ids))
)
movies_df = movies_df.drop('genre_ids', axis='columns')
movies_df.head()

Unnamed: 0,title,overview,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama


In [77]:
# text preprocessing starts here
# text preprocessing step 1: lower-case every text column

for text_column in ('title', 'overview', 'genres'):
    movies_df[text_column] = movies_df[text_column].map(str.lower)

movies_df.head()

Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime"
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war"
4,12 angry men,the defense and the prosecution have rested an...,drama


In [83]:
# text preprocessing step 2: remove html tags
# the data needs some html tags first, so that the removal step can be tested

def add_random_html_tag_to_beginning(text, tag_probability=0.5):
    """Prepend one randomly chosen opening html tag to ``text``, or leave it alone.

    A single draw from ``random.random()`` is compared against ``tag_probability``;
    when the draw is smaller, a tag is picked with ``random.choice`` and placed in
    front of ``text``. No closing tag is added. Otherwise ``text`` is returned as is.
    """
    candidate_tags = ['<b>', '<i>', '<u>', '<em>', '<strong>']
    should_tag = random.random() < tag_probability
    return random.choice(candidate_tags) + text if should_tag else text

# Each run prepends at most one more tag per overview, so re-running this cell stacks tags.
tagged_overviews = movies_df['overview'].map(add_random_html_tag_to_beginning)
movies_df['overview'] = tagged_overviews
movies_df.head(10)

Unnamed: 0,title,overview,genres
0,the shawshank redemption,<strong><em>imprisoned in the 1940s for the do...,"drama, crime"
1,the godfather,"<strong><em>spanning the years 1945 to 1955, a...","drama, crime"
2,the godfather part ii,<u>in the continuing saga of the corleone crim...,"drama, crime"
3,schindler's list,<em>the true story of how businessman oskar sc...,"drama, history, war"
4,12 angry men,<strong><i>the defense and the prosecution hav...,drama
5,spirited away,"<u><em>a young girl, chihiro, becomes trapped ...","animation, family, fantasy"
6,dilwale dulhania le jayenge,"<strong>raj is a rich, carefree, happy-go-luck...","comedy, drama, romance"
7,the dark knight,<b>batman raises the stakes in his war on crim...,"drama, action, crime, thriller"
8,the green mile,<strong><strong><em>a supernatural tale set on...,"fantasy, drama, crime"
9,parasite,"<i><u>all unemployed, ki-taek's family takes p...","comedy, thriller, drama"


In [86]:
# text preprocessing step 2 (continued): strip html tags, keeping only the text content

def _strip_html(markup):
    """Parse ``markup`` with BeautifulSoup's "html.parser" and return its text."""
    return BeautifulSoup(markup, "html.parser").get_text()

movies_df["overview"] = movies_df["overview"].map(_strip_html)
movies_df.head(10)

  movies_df["overview"] = movies_df["overview"].apply(lambda overview: BeautifulSoup(overview, "html.parser").get_text())


Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime"
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war"
4,12 angry men,the defense and the prosecution have rested an...,drama
5,spirited away,"a young girl, chihiro, becomes trapped in a st...","animation, family, fantasy"
6,dilwale dulhania le jayenge,"raj is a rich, carefree, happy-go-lucky second...","comedy, drama, romance"
7,the dark knight,batman raises the stakes in his war on crime. ...,"drama, action, crime, thriller"
8,the green mile,a supernatural tale set on death row in a sout...,"fantasy, drama, crime"
9,parasite,"all unemployed, ki-taek's family takes peculia...","comedy, thriller, drama"


In [89]:
# text preprocessing step 3: delete urls with a regular expression
# (anything starting with http://, https:// or www. up to the next whitespace)

pattern = r'http[s]?://\S+|www\.\S+'
url_regex = re.compile(pattern)  # compiled once, reused for every row
movies_df['overview'] = movies_df['overview'].map(lambda text: url_regex.sub('', text))
movies_df.head(10)

Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,"drama, crime"
1,the godfather,"spanning the years 1945 to 1955, a chronicle o...","drama, crime"
2,the godfather part ii,in the continuing saga of the corleone crime f...,"drama, crime"
3,schindler's list,the true story of how businessman oskar schind...,"drama, history, war"
4,12 angry men,the defense and the prosecution have rested an...,drama
5,spirited away,"a young girl, chihiro, becomes trapped in a st...","animation, family, fantasy"
6,dilwale dulhania le jayenge,"raj is a rich, carefree, happy-go-lucky second...","comedy, drama, romance"
7,the dark knight,batman raises the stakes in his war on crime. ...,"drama, action, crime, thriller"
8,the green mile,a supernatural tale set on death row in a sout...,"fantasy, drama, crime"
9,parasite,"all unemployed, ki-taek's family takes peculia...","comedy, thriller, drama"


In [93]:
# text preprocessing step 4: remove punctuation

def remove_punc(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted.

    Characters are deleted rather than replaced by a space, so the parts of a
    hyphenated or apostrophised word end up joined together.
    """
    # Mapping a code point to None makes str.translate drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


# run remove_punc over every text column
for text_column in ('title', 'overview', 'genres'):
    movies_df[text_column] = movies_df[text_column].map(remove_punc)

movies_df.head(10)

Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,drama crime
1,the godfather,spanning the years 1945 to 1955 a chronicle of...,drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime
3,schindlers list,the true story of how businessman oskar schind...,drama history war
4,12 angry men,the defense and the prosecution have rested an...,drama
5,spirited away,a young girl chihiro becomes trapped in a stra...,animation family fantasy
6,dilwale dulhania le jayenge,raj is a rich carefree happygolucky second gen...,comedy drama romance
7,the dark knight,batman raises the stakes in his war on crime w...,drama action crime thriller
8,the green mile,a supernatural tale set on death row in a sout...,fantasy drama crime
9,parasite,all unemployed kitaeks family takes peculiar i...,comedy thriller drama


In [94]:
# text preprocessing step 5: chat word treatment (expanding short-form words such as lmao, asap, hlo, etc.)

# do it yourself:
# make a dictionary that maps a few chat words to their full forms
# replace the chat words in overview with their full forms

In [None]:
# text preprocessing step 6: spelling correction (using textblob, a python library that offers spelling correction)
# this can take a long time, depending on the dataset size and the hardware

def correct_spellings(text):
    """Return ``text`` after TextBlob's ``correct()`` pass, converted back to ``str``."""
    return str(TextBlob(text).correct())

# NOTE(review): titles and genres are spell-corrected too; this may rewrite proper
# nouns in movie titles. Confirm that is intended.
for text_column in ('title', 'overview', 'genres'):
    movies_df[text_column] = movies_df[text_column].map(correct_spellings)

movies_df.head(10)

In [102]:
# text preprocessing step 7: stop word removal (using nltk, a python library for nlp text preprocessing)
# NOTE(review): punctuation was already stripped in step 4, so stop words that contain
# an apostrophe presumably no longer match their entries in this list. Verify.

stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``stop_words``.

    The remaining tokens are joined with single spaces, so any run of whitespace
    in the input is collapsed.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)

# filter the stop words out of every text column
for text_column in ('title', 'overview', 'genres'):
    movies_df[text_column] = movies_df[text_column].map(remove_stopwords)

movies_df.head(10)

[nltk_data] Downloading package stopwords to /home/Leo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,title,overview,genres
0,shawshank redemption,imprisoned 1940s double murder wife lover upst...,drama crime
1,godfather,spanning years 1945 1955 chronicle fictional i...,drama crime
2,godfather part ii,continuing saga corleone crime family young vi...,drama crime
3,schindlers list,true story businessman oskar schindler saved t...,drama history war
4,12 angry men,defense prosecution rested jury filing jury ro...,drama
5,spirited away,young girl chihiro becomes trapped strange new...,animation family fantasy
6,dilwale dulhania le jayenge,raj rich carefree happygolucky second generati...,comedy drama romance
7,dark knight,batman raises stakes war crime help lt jim gor...,drama action crime thriller
8,green mile,supernatural tale set death row southern priso...,fantasy drama crime
9,parasite,unemployed kitaeks family takes peculiar inter...,comedy thriller drama


In [103]:
# text preprocessing step 8: handling emojis (not implemented yet)
# TODO: replace each emoji with a word that describes it (e.g. happy, sad)

In [127]:
# text preprocessing step 9: tokenization

def tokenize(text):
    """Return the tokens that nltk's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

# add one token-list column per text column (new column name -> source column)
tokenized_columns = {
    'tokenized_title': 'title',
    'tokenized_overview': 'overview',
    'tokenized_genres': 'genres',
}
for new_column, source_column in tokenized_columns.items():
    movies_df[new_column] = movies_df[source_column].map(tokenize)

movies_df.head(10)

Unnamed: 0,title,overview,genres,tokenized_title,tokenized_overview,tokenized_genres
0,shawshank redemption,imprisoned 1940s double murder wife lover upst...,drama crime,"[shawshank, redemption]","[imprisoned, 1940s, double, murder, wife, love...","[drama, crime]"
1,godfather,spanning years 1945 1955 chronicle fictional i...,drama crime,[godfather],"[spanning, years, 1945, 1955, chronicle, ficti...","[drama, crime]"
2,godfather part ii,continuing saga corleone crime family young vi...,drama crime,"[godfather, part, ii]","[continuing, saga, corleone, crime, family, yo...","[drama, crime]"
3,schindlers list,true story businessman oskar schindler saved t...,drama history war,"[schindlers, list]","[true, story, businessman, oskar, schindler, s...","[drama, history, war]"
4,12 angry men,defense prosecution rested jury filing jury ro...,drama,"[12, angry, men]","[defense, prosecution, rested, jury, filing, j...",[drama]
5,spirited away,young girl chihiro becomes trapped strange new...,animation family fantasy,"[spirited, away]","[young, girl, chihiro, becomes, trapped, stran...","[animation, family, fantasy]"
6,dilwale dulhania le jayenge,raj rich carefree happygolucky second generati...,comedy drama romance,"[dilwale, dulhania, le, jayenge]","[raj, rich, carefree, happygolucky, second, ge...","[comedy, drama, romance]"
7,dark knight,batman raises stakes war crime help lt jim gor...,drama action crime thriller,"[dark, knight]","[batman, raises, stakes, war, crime, help, lt,...","[drama, action, crime, thriller]"
8,green mile,supernatural tale set death row southern priso...,fantasy drama crime,"[green, mile]","[supernatural, tale, set, death, row, southern...","[fantasy, drama, crime]"
9,parasite,unemployed kitaeks family takes peculiar inter...,comedy thriller drama,[parasite],"[unemployed, kitaeks, family, takes, peculiar,...","[comedy, thriller, drama]"
