In [3]:
import pandas as pd
import numpy as np
import sqlite3
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from joblib import dump
import json

In [4]:
url_movies = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv'
url_credits = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv'

movies = pd.read_csv(url_movies)
credits = pd.read_csv(url_credits)

conn = sqlite3.connect('movie_database.db')

movies.to_sql('movies_table', conn, index=False, if_exists='replace')
credits.to_sql('credits_table', conn, index=False, if_exists='replace')

query = '''
    SELECT m.*, c.*
    FROM movies_table m
    JOIN credits_table c ON m.title = c.title
'''

df = pd.read_sql(query, conn)

conn.close()


In [5]:
df['cast'] = df['cast'].apply(lambda x: json.loads(x) if isinstance(x, str) and pd.notnull(x) else x)
df['first_three_actors'] = df['cast'].apply(lambda cast_list: [actor['name'] for actor in cast_list[:3]] if cast_list else [])


In [6]:
# conver json strings to lists of dictionaries
df['crew'] = df['crew'].apply(lambda x: json.loads(x) if isinstance(x, str) and pd.notnull(x) else x)
# extract names of directors
df['director'] = df['crew'].apply(lambda crew_list: next((member['name'] for member in crew_list if member['job'] == 'Director'), None))

In [7]:
df['overview_words'] = df['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])

In [8]:
df['keywords'] = df['keywords'].apply(ast.literal_eval)
df['keywords'] = df['keywords'].apply(lambda x: [keyword['name'] for keyword in x] if x else [])


In [9]:
df['genres'] = df['genres'].apply(ast.literal_eval)
df['genres'] = df['genres'].apply(lambda x: [genre['name'] for genre in x] if x else [])

In [10]:
df['genres'] = df['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
df['first_three_actors'] = df['first_three_actors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

In [11]:
df['tags'] = df['overview'] + ' ' + df['genres'] + ' ' + df['first_three_actors'] + ' ' + df['director'] + ' ' + df['keywords']
df['tags'] = df['tags'].str.replace(',', ' ')

In [25]:
df = df.drop_duplicates(subset='title', keep='first')


In [35]:
duplicates = df[df.duplicated(subset='tags', keep=False)]


In [13]:
df.drop(columns = ["genres", "keywords", "cast", "crew", "overview"], inplace = True)


In [19]:
df = df.dropna(subset=['tags'])

In [44]:
df.columns

Index(['budget', 'homepage', 'id', 'original_language', 'original_title',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'vote_average', 'vote_count', 'movie_id',
       'first_three_actors', 'director', 'overview_words', 'tags'],
      dtype='object')

In [41]:
df.drop("title", axis=1, inplace=True)

In [45]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["tags"])

model = NearestNeighbors(n_neighbors = 6, algorithm = "brute", metric = "cosine")
model.fit(tfidf_matrix)

def get_movie_recommendations(movie_title):
    movie_index = df[df["original_title"] == movie_title].index[0]
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
    similar_movies = [(df["original_title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]



In [46]:
get_movie_recommendations("Avatar")

[('City of Ghosts', 0.6979426555590698),
 ('Alien³', 0.730287739186475),
 ('Out of Africa', 0.7478753055618057),
 ('Mission to Mars', 0.7493196634595706),
 ('Into the Grizzly Maze', 0.7543915756012284)]

In [47]:
dump(model, open("./models/knn_neighbors.sav", "wb"))