In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('imdb_movies.csv')
print("total rows: ", len(df))
df.head()

total rows:  10178


Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [3]:
df.dropna(subset=['names','overview'], inplace=True)
df.shape

(10178, 12)

In [4]:
df.head(1)

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616668.0,AU


In [5]:
df['text'] = df['names']+' '+df['overview']
df['text']

0        Creed III After dominating the boxing world, A...
1        Avatar: The Way of Water Set more than a decad...
2        The Super Mario Bros. Movie While working unde...
3        Mummies Through a series of unfortunate events...
4        Supercell Good-hearted teenager William always...
                               ...                        
10173    20th Century Women In 1979 Santa Barbara, Cali...
10174    Delta Force 2: The Colombian Connection When D...
10175    The Russia House Barley Scott Blair, a Lisbon-...
10176    Darkman II: The Return of Durant Darkman and D...
10177    The Swan Princess: A Royal Wedding Princess Od...
Name: text, Length: 10178, dtype: object

In [6]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
vectorizer = TfidfVectorizer(max_features=10000,
                            stop_words='english',
                            lowercase=True,
                            tokenizer= word_tokenize,)
X = vectorizer.fit_transform(df['text'])
X



<10178x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 254479 stored elements in Compressed Sparse Row format>

In [8]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(X, X)
sim.shape

(10178, 10178)

In [9]:
from joblib import dump, load
import os
os.makedirs('models', exist_ok=True)
dump(vectorizer, 'models/vectorizer.joblib')
dump(sim, 'models/similarity.joblib')
df.to_parquet('models/clean_movies.parquet')

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
def get_id_from_movie(movie_name, df):
    try:return df[df['names'].str.lower()==movie_name.lower()].index.tolist()[0]
    except: return -1

def get_random_movie_from_keyword(keyword, df):
    try: return df[df['overview'].str.lower().str.contains(keyword.lower())].sample(1).index.tolist()[0]
    except: return -1

In [None]:
def get_recommendation(query='', by='name', count=10):
    df = pd.read_parquet('models/clean_movies.parquet')
    sim = load('models/similarity.joblib')
    match by:
        case 'name':
            movie_id = get_id_from_movie(query, df)
            if movie_id == -1:
                return 'Movie not found'
            else:
                sim_scores = list(enumerate(sim[movie_id]))
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[count+1::-1]
                movie_indices = [i[0] for i in sim_scores]
                return df['names'].iloc[movie_indices].tolist()
        case 'word':
            movie_ids = get_random_movie_from_keyword(query, df)
            if movie_ids == -1:
                return 'Movie not found'
            else:
                sim_scores = list(enumerate(sim[movie_ids]))
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[count+1::-1]
                movie_indices = [i[0] for i in sim_scores]
                return df['names'].iloc[movie_indices].tolist()

In [None]:
get_recommendation('batman', count=10)

['The Dark Knight',
 'Batman: The Long Halloween, Part Two',
 'Batman: Bad Blood',
 'Batman Begins',
 'The Batman',
 'The Batman vs. Dracula',
 'Batman: Mystery of the Batwoman',
 'Batman: Gotham by Gaslight',
 "Batman Unmasked: The Psychology of 'The Dark Knight'",
 'Batman Beyond: Return of the Joker',
 'Batman Returns',
 'Batman']

In [None]:
get_recommendation('Kung Fu', by='word', count=10)

['Ip Man: Kung Fu Master',
 'Monk Comes Down the Mountain',
 'House Party',
 'Kung Fu Dunk',
 'Jackie Chan Kung Fu Master',
 'Kung Fu Panda',
 'Kung Fu Hustle',
 "Grosso guaio all'Esquilino - La leggenda del Kung Fu",
 'Kung Fu Panda 2',
 'Bulletproof Monk',
 'The Karate Kid',
 'The Karate Kid']

In [None]:
import requests
import json
def movie_data_from_tmdb(movie_name):
    url = f"https://api.themoviedb.org/3/search/movie?query={movie_name}&include_adult=false&language=en-US&page=1"
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJlMjBjMGRlMTU4ZTllYmE1ZjViMDQ1YWFkMmVjYTA3NSIsIm5iZiI6MTcyNDY1Mjk1MC45MzczNDUsInN1YiI6IjVlZTlkYzNlMTY4NWRhMDAzNjI5ODc1ZCIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.VQS6z9TVtiem10Ev-1qhecdTEkl0BxpatxEBHoq7KEw"
    }
    response = requests.get(url, headers=headers)
    # take the first result
    movie_id = response.json()['results'][0]['id']
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"
    response = requests.get(url, headers=headers)
    # convert 
    movie_data = response.json()
    poster = f"https://image.tmdb.org/t/p/w500{movie_data['poster_path']}"
    genres = [i['name'] for i in movie_data['genres']]
    link = movie_data['homepage']
    imdb_id = movie_data['imdb_id']
    overview = movie_data['overview']
    return {
        'movie': movie_name,
        'poster': poster,
        'genres': genres,
        'link': link,
        'imdb_id': imdb_id,
        'overview': overview
    }

movie_data_from_tmdb('Avengers')


{'movie': 'Avengers',
 'poster': 'https://image.tmdb.org/t/p/w500/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg',
 'genres': ['Adventure', 'Action', 'Science Fiction'],
 'link': 'https://www.marvel.com/movies/avengers-infinity-war',
 'imdb_id': 'tt4154756',
 'overview': 'As the Avengers and their allies have continued to protect the world from threats too large for any one hero to handle, a new danger has emerged from the cosmic shadows: Thanos. A despot of intergalactic infamy, his goal is to collect all six Infinity Stones, artifacts of unimaginable power, and use them to inflict his twisted will on all of reality. Everything the Avengers have fought for has led up to this moment - the fate of Earth and existence itself has never been more uncertain.'}