### Data Acquisition

In [1]:
import requests
import pandas as pd

def load_json_from_api(api_url):
    response = requests.get(api_url)
    if response.status_code == 200:
        json_data = response.json()
        return json_data
    else:
        print(f'Error: {response.status_code}')
        return None

api_url = 'https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page='

dfs = []

for page in range(1, 501):
    current_api_url = api_url + str(page)
    json_data = load_json_from_api(current_api_url)
    
    if json_data is not None:
        df = pd.DataFrame(json_data)
        df = df.reset_index()
        dfs.append(df)
        
combined_df = pd.concat(dfs)

filename = 'tmdb.json'
combined_df.to_json(filename, orient='records')
print(f'Combined DataFrame saved to {filename}')

Combined DataFrame saved to tmdb.json


In [9]:
tmdb = pd.read_json('tmdb.json')
tmdb

Unnamed: 0,index,page,results,total_pages,total_results
0,0,1,"{'adult': False, 'backdrop_path': '/tmU7GeKVyb...",562,11232
1,1,1,"{'adult': False, 'backdrop_path': '/kXfqcdQKsT...",562,11232
2,2,1,"{'adult': False, 'backdrop_path': '/kGzFbGhp99...",562,11232
3,3,1,"{'adult': False, 'backdrop_path': '/vI3aUGTuRR...",562,11232
4,4,1,"{'adult': False, 'backdrop_path': '/zb6fM1CX41...",562,11232
...,...,...,...,...,...
9995,15,500,"{'adult': False, 'backdrop_path': '/qkBWIZRaeM...",562,11232
9996,16,500,"{'adult': False, 'backdrop_path': '/t2hqoksHqb...",562,11232
9997,17,500,"{'adult': False, 'backdrop_path': '/mm1JTSYpaF...",562,11232
9998,18,500,"{'adult': False, 'backdrop_path': '/yvB9wgUNyA...",562,11232


### json Data Preprocess

In [10]:
import json
import pandas as pd

with open('data.json') as file:
    json_data = json.load(file)
    

data = []

for item in json_data:
    results = item.get('results')
    if results:
        title = results.get('title')
        overview = results.get('overview')
        genre_ids = results.get('genre_ids')
        data.append({
            'title' : title,
            'overview' : overview,
            'genre_ids' : genre_ids
        })

df = pd.DataFrame(data)
df

Unnamed: 0,title,overview,genre_ids
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[18, 80]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","[35, 18, 10749]"
4,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
...,...,...,...
9415,Ma,Sue Ann is a loner who keeps to herself in her...,"[27, 53]"
9416,House at the End of the Street,A mother and daughter move to a new town and f...,"[27, 53]"
9417,Battleship,"When mankind beams a radio signal into space, ...","[53, 28, 12, 878]"
9418,Dennis the Menace,Mr. Wilson's ever-present annoyance comes in t...,"[10751, 35]"


https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US
    
{"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}

In [11]:
genre_data = {
    'genres': [
        {'id': 28, 'name': 'Action'},
        {'id': 12, 'name': 'Adventure'},
        {'id': 16, 'name': 'Animation'},
        {'id': 35, 'name': 'Comedy'},
        {'id': 80, 'name': 'Crime'},
        {'id': 99, 'name': 'Documentary'},
        {'id': 18, 'name': 'Drama'},
        {'id': 10751, 'name': 'Family'},
        {'id': 14, 'name': 'Fantasy'},
        {'id': 36, 'name': 'History'},
        {'id': 27, 'name': 'Horror'},
        {'id': 10402, 'name': 'Music'},
        {'id': 9648, 'name': 'Mystery'},
        {'id': 10749, 'name': 'Romance'},
        {'id': 878, 'name': 'Science Fiction'},
        {'id': 10770, 'name': 'TV Movie'},
        {'id': 53, 'name': 'Thriller'},
        {'id': 10752, 'name': 'War'},
        {'id': 37, 'name': 'Western'}
    ]
}

genre_mapping = {genre['id']: genre['name'] for genre in genre_data['genres']}
df['genre'] = df['genre_ids'].apply(lambda ids: [genre_mapping[id] for id in ids] if isinstance(ids, list) else [])

df.drop('genre_ids', axis=1, inplace=True)
df

Unnamed: 0,title,overview,genre
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]"
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[Drama, Crime]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[Drama, Crime]"
3,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","[Comedy, Drama, Romance]"
4,Schindler's List,The true story of how businessman Oskar Schind...,"[Drama, History, War]"
...,...,...,...
9415,Ma,Sue Ann is a loner who keeps to herself in her...,"[Horror, Thriller]"
9416,House at the End of the Street,A mother and daughter move to a new town and f...,"[Horror, Thriller]"
9417,Battleship,"When mankind beams a radio signal into space, ...","[Thriller, Action, Adventure, Science Fiction]"
9418,Dennis the Menace,Mr. Wilson's ever-present annoyance comes in t...,"[Family, Comedy]"


In [12]:
df['genre'] = df['genre'].apply(lambda genres: ', '.join(genres) if isinstance(genres, list) else '')

In [13]:
df = df.rename(columns={'title': 'name', 'overview': 'description'})
df

Unnamed: 0,name,description,genre
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy, Drama, Romance"
4,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
...,...,...,...
9415,Ma,Sue Ann is a loner who keeps to herself in her...,"Horror, Thriller"
9416,House at the End of the Street,A mother and daughter move to a new town and f...,"Horror, Thriller"
9417,Battleship,"When mankind beams a radio signal into space, ...","Thriller, Action, Adventure, Science Fiction"
9418,Dennis the Menace,Mr. Wilson's ever-present annoyance comes in t...,"Family, Comedy"


### Lowercasing

In [14]:
# Method 1
# df['name'] = df['name'].str.lower()
df['description'] = df['description'].str.lower()
# df['genre'] = df['genre'].str.lower()

In [15]:
# Method 2
# columns_to_lower = ['name','description','genre']
# for column in columns_to_lower:
#     df[column] = df[column].str.lower()

In [16]:
df

Unnamed: 0,name,description,genre
0,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
1,The Shawshank Redemption,framed in the 1940s for the double murder of h...,"Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Dilwale Dulhania Le Jayenge,"raj is a rich, carefree, happy-go-lucky second...","Comedy, Drama, Romance"
4,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"
...,...,...,...
9415,Ma,sue ann is a loner who keeps to herself in her...,"Horror, Thriller"
9416,House at the End of the Street,a mother and daughter move to a new town and f...,"Horror, Thriller"
9417,Battleship,"when mankind beams a radio signal into space, ...","Thriller, Action, Adventure, Science Fiction"
9418,Dennis the Menace,mr. wilson's ever-present annoyance comes in t...,"Family, Comedy"


#### check html tags

In [17]:
import re

def has_html_tags(text):
    pattern = '<.*?>'
    matches = re.findall(pattern, text)
    
has_html = df['description'].apply(has_html_tags)
has_html

0       None
1       None
2       None
3       None
4       None
        ... 
9415    None
9416    None
9417    None
9418    None
9419    None
Name: description, Length: 9420, dtype: object

#### check urls

In [18]:
def has_urls(text):
    pattern = r'(https?://\s+)'
    matches = re.findall(pattern, text)
    
has_url = df['description'].apply(has_urls)
has_url

0       None
1       None
2       None
3       None
4       None
        ... 
9415    None
9416    None
9417    None
9418    None
9419    None
Name: description, Length: 9420, dtype: object

#### Remove punctuation

In [19]:
import string

def has_punctuations(text):
    for char in text:
        if char in string.punctuation:
            return True
        return False

has_punctuation = df['description'].apply(has_punctuations)
has_punctuation

0       False
1       False
2       False
3       False
4       False
        ...  
9415    False
9416    False
9417    False
9418    False
9419    False
Name: description, Length: 9420, dtype: object

In [20]:
def remove_punctuation(text):
    for char in string.punctuation:
        text = text.replace(char, '')
    return text

df['description'] = df['description'].apply(remove_punctuation)
has_punctuation = df['description'].apply(lambda x: bool(re.search('['+string.punctuation+']', x)))
has_punctuation

0       False
1       False
2       False
3       False
4       False
        ...  
9415    False
9416    False
9417    False
9418    False
9419    False
Name: description, Length: 9420, dtype: bool

In [21]:
df['description']

0       spanning the years 1945 to 1955 a chronicle of...
1       framed in the 1940s for the double murder of h...
2       in the continuing saga of the corleone crime f...
3       raj is a rich carefree happygolucky second gen...
4       the true story of how businessman oskar schind...
                              ...                        
9415    sue ann is a loner who keeps to herself in her...
9416    a mother and daughter move to a new town and f...
9417    when mankind beams a radio signal into space a...
9418    mr wilsons everpresent annoyance comes in the ...
9419    a struggling songwriter named dave seville fin...
Name: description, Length: 9420, dtype: object

#### Spell Correction

In [22]:
import pandas as pd
from textblob import TextBlob
import time

start_time = time.time()
df['description'] = pd.DataFrame(df['description'])

def check_spelling_mistakes(text):
    blob = TextBlob(text)
    return str(blob.correct())

df['description'] = df['description'].apply(check_spelling_mistakes)
df

end_time = time.time()

total_time = end_time - start_time
print(f'Total run time: {total_time} seconds')
# around 38  minitues Apple M2 2023

Total run time: 2335.492346763611 seconds


In [23]:
df

Unnamed: 0,name,description,genre
0,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"Drama, Crime"
1,The Shawshank Redemption,framed in the 1940s for the double murder of h...,"Drama, Crime"
2,The Godfather Part II,in the continuing sage of the corleone crime f...,"Drama, Crime"
3,Dilwale Dulhania Le Jayenge,ran is a rich carefree happygolucky second gen...,"Comedy, Drama, Romance"
4,Schindler's List,the true story of how businessman oscar schind...,"Drama, History, War"
...,...,...,...
9415,Ma,sue ann is a longer who keeps to herself in he...,"Horror, Thriller"
9416,House at the End of the Street,a mother and daughter move to a new town and f...,"Horror, Thriller"
9417,Battleship,when mankind beams a radio signal into space a...,"Thriller, Action, Adventure, Science Fiction"
9418,Dennis the Menace,mr wilson everpresent annoyance comes in the f...,"Family, Comedy"


#### Remove stopwords

In [24]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def check_stop_words(text):
    words = text.split()
    stop_word_found = [word for word in words if word.lower() in stop_words]
    return bool(stop_word_found)

df['Stop_Words_Present'] = df['description'].apply(check_stop_words)

In [25]:
df.drop('Stop_Words_Present', axis=1, inplace=True)

In [26]:
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
df['description'] = df['description'].apply(remove_stop_words)

In [27]:
df

Unnamed: 0,name,description,genre
0,The Godfather,spanning years 1945 1955 chronicle sectional i...,"Drama, Crime"
1,The Shawshank Redemption,framed 1940s double murder wife lover standing...,"Drama, Crime"
2,The Godfather Part II,continuing sage corleone crime family young ve...,"Drama, Crime"
3,Dilwale Dulhania Le Jayenge,ran rich carefree happygolucky second generati...,"Comedy, Drama, Romance"
4,Schindler's List,true story businessman oscar schindler saved t...,"Drama, History, War"
...,...,...,...
9415,Ma,sue ann longer keeps quiet ohio town one day a...,"Horror, Thriller"
9416,House at the End of the Street,mother daughter move new town find living next...,"Horror, Thriller"
9417,Battleship,mankind beams radio signal space reply comes ‘...,"Thriller, Action, Adventure, Science Fiction"
9418,Dennis the Menace,mr wilson everpresent annoyance comes form one...,"Family, Comedy"


#### Remove Emoji

In [28]:
import re
import pandas as pd

def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['description'] = df['description'].apply(remove_emojis)

#### Tokenizing

In [30]:
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

df['Tokens'] = df['description'].apply(tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marufbillah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
df['Tokens']

0       [spanning, years, 1945, 1955, chronicle, secti...
1       [framed, 1940s, double, murder, wife, lover, s...
2       [continuing, sage, corleone, crime, family, yo...
3       [ran, rich, carefree, happygolucky, second, ge...
4       [true, story, businessman, oscar, schindler, s...
                              ...                        
9415    [sue, ann, longer, keeps, quiet, ohio, town, o...
9416    [mother, daughter, move, new, town, find, livi...
9417    [mankind, beams, radio, signal, space, reply, ...
9418    [mr, wilson, everpresent, annoyance, comes, fo...
9419    [struggling, songwriter, named, dave, seville,...
Name: Tokens, Length: 9420, dtype: object

#### Stemming

In [32]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()

def porter_stem(text):
    porter_stammed = [porter.stem(word) for word in text]
    return porter_stammed

df['Porter_Stemmed'] = df['description'].apply(porter_stem)

In [33]:
df['Porter_Stemmed']

0       [s, p, a, n, n, i, n, g,  , y, e, a, r, s,  , ...
1       [f, r, a, m, e, d,  , 1, 9, 4, 0, s,  , d, o, ...
2       [c, o, n, t, i, n, u, i, n, g,  , s, a, g, e, ...
3       [r, a, n,  , r, i, c, h,  , c, a, r, e, f, r, ...
4       [t, r, u, e,  , s, t, o, r, y,  , b, u, s, i, ...
                              ...                        
9415    [s, u, e,  , a, n, n,  , l, o, n, g, e, r,  , ...
9416    [m, o, t, h, e, r,  , d, a, u, g, h, t, e, r, ...
9417    [m, a, n, k, i, n, d,  , b, e, a, m, s,  , r, ...
9418    [m, r,  , w, i, l, s, o, n,  , e, v, e, r, p, ...
9419    [s, t, r, u, g, g, l, i, n, g,  , s, o, n, g, ...
Name: Porter_Stemmed, Length: 9420, dtype: object

#### Lemmatization

In [34]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

df['Lemmatized'] = df['description'].apply(lemmatize)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/marufbillah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/marufbillah/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [35]:
df['Lemmatized']

0       spanning year 1945 1955 chronicle sectional it...
1       framed 1940s double murder wife lover standing...
2       continuing sage corleone crime family young ve...
3       ran rich carefree happygolucky second generati...
4       true story businessman oscar schindler saved t...
                              ...                        
9415    sue ann longer keep quiet ohio town one day as...
9416    mother daughter move new town find living next...
9417    mankind beam radio signal space reply come ‘ p...
9418    mr wilson everpresent annoyance come form one ...
9419    struggling songwriter named dave seville find ...
Name: Lemmatized, Length: 9420, dtype: object

In [36]:
df

Unnamed: 0,name,description,genre,Tokens,Porter_Stemmed,Lemmatized
0,The Godfather,spanning years 1945 1955 chronicle sectional i...,"Drama, Crime","[spanning, years, 1945, 1955, chronicle, secti...","[s, p, a, n, n, i, n, g, , y, e, a, r, s, , ...",spanning year 1945 1955 chronicle sectional it...
1,The Shawshank Redemption,framed 1940s double murder wife lover standing...,"Drama, Crime","[framed, 1940s, double, murder, wife, lover, s...","[f, r, a, m, e, d, , 1, 9, 4, 0, s, , d, o, ...",framed 1940s double murder wife lover standing...
2,The Godfather Part II,continuing sage corleone crime family young ve...,"Drama, Crime","[continuing, sage, corleone, crime, family, yo...","[c, o, n, t, i, n, u, i, n, g, , s, a, g, e, ...",continuing sage corleone crime family young ve...
3,Dilwale Dulhania Le Jayenge,ran rich carefree happygolucky second generati...,"Comedy, Drama, Romance","[ran, rich, carefree, happygolucky, second, ge...","[r, a, n, , r, i, c, h, , c, a, r, e, f, r, ...",ran rich carefree happygolucky second generati...
4,Schindler's List,true story businessman oscar schindler saved t...,"Drama, History, War","[true, story, businessman, oscar, schindler, s...","[t, r, u, e, , s, t, o, r, y, , b, u, s, i, ...",true story businessman oscar schindler saved t...
...,...,...,...,...,...,...
9415,Ma,sue ann longer keeps quiet ohio town one day a...,"Horror, Thriller","[sue, ann, longer, keeps, quiet, ohio, town, o...","[s, u, e, , a, n, n, , l, o, n, g, e, r, , ...",sue ann longer keep quiet ohio town one day as...
9416,House at the End of the Street,mother daughter move new town find living next...,"Horror, Thriller","[mother, daughter, move, new, town, find, livi...","[m, o, t, h, e, r, , d, a, u, g, h, t, e, r, ...",mother daughter move new town find living next...
9417,Battleship,mankind beams radio signal space reply comes ‘...,"Thriller, Action, Adventure, Science Fiction","[mankind, beams, radio, signal, space, reply, ...","[m, a, n, k, i, n, d, , b, e, a, m, s, , r, ...",mankind beam radio signal space reply come ‘ p...
9418,Dennis the Menace,mr wilson everpresent annoyance comes form one...,"Family, Comedy","[mr, wilson, everpresent, annoyance, comes, fo...","[m, r, , w, i, l, s, o, n, , e, v, e, r, p, ...",mr wilson everpresent annoyance come form one ...
