# Import Libraries

In [11]:
import pandas as pd
import sqlite3 as sq
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from pickle import dump

# Import Data

In [12]:
tmdb_5000_movies = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv')
tmdb_5000_credits = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv')

# Export to the directory as .csv
tmdb_5000_movies.to_csv('/workspaces/alfonsoMG_KNN/data/raw/tmdb_5000_movies.csv', index= False)
tmdb_5000_credits.to_csv('/workspaces/alfonsoMG_KNN/data/raw/tmdb_5000_credits.csv', index= False)

# Create Database

In order to construct our dataset, we possess two distinct .csv files. These files will be amalgamated through a join operation within an SQL database, combining their relevant information. Following the successful execution of this process, the consolidated dataset will be extracted and formatted as a dataframe using the pandas library. This dataframe structure is crucial as it provides a tabular representation of the data, enabling us to effectively manipulate and utilize it in the implementation of our K-Nearest Neighbors (KNN) machine learning model for movie recommendations.

In [13]:
# Establish a connection to the SQLite database
conn = sq.connect("../data/raw/movies_database.db")

# Write tmdb_5000_movies DataFrame to the "movies_table" in the database
tmdb_5000_movies.to_sql("movies_table", conn, if_exists="replace", index=False)

# Write tmdb_5000_credits DataFrame to the "credits_table" in the database
tmdb_5000_credits.to_sql("credits_table", conn, if_exists="replace", index=False)

# SQL query to perform INNER JOIN on the "title" column
query = """
    SELECT *
    FROM movies_table
    INNER JOIN credits_table 
    USING (title)
"""

# Execute the query and read the result into a DataFrame
df = pd.read_sql_query(query, conn)

# Display the resulting DataFrame
df


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,9367,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4805,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,72766,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4806,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,231617,"[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4807,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,126186,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


We will proceed to filter the dataset based on the columns that are pertinent and potentially influential for our purpose, which is to construct a movie recommender system.

In [14]:
# Specify the columns to keep in the DataFrame
columns = [
    "movie_id",
    "title",
    "overview",
    "genres",
    "keywords",
    "cast",
    "crew"
]

# Select only the specified columns
df = df[columns]

# Print information for a specific row (index 1) for the selected columns
print(f"Movie ID: {df['movie_id'][1]}")
print(f"Title: {df['title'][1]}")
print(f"Overview: {df['overview'][1]}")
print(f"Genres: {df['genres'][1]}")
print(f"Keywords: {df['keywords'][1]}")
print(f"Cast: {df['cast'][1]}")
print(f"Crew: {df['crew'][1]}")


Movie ID: 285
Title: Pirates of the Caribbean: At World's End
Overview: Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.
Genres: [{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]
Keywords: [{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic island"}, {"id": 1319, "name": "east india trading company"}, {"id": 2038, "name": "love of one's life"}, {"id": 2052, "name": "traitor"}, {"id": 2580, "name": "shipwreck"}, {"id": 2660, "name": "strong woman"}, {"id": 3799, "name": "ship"}, {"id": 5740, "name": "alliance"}, {"id": 5941, "name": "calypso"}, {"id": 6155, "name": "afterlife"}, {"id": 6211, "name": "fighter"}, {"id": 12988, "name": "pirate"}, {"id": 157186, "name": "swashbuckler"}, {"id": 179430, "name": "aftercreditsstinger"}]
Cast: [{"cast_id": 4, "character": "Captain Jac

In [15]:
# Define a function to convert json "name" into object
def get_names_from_json(json_list_str, key='name', max_count=None):
    json_list = json.loads(json_list_str)
    names = [item[key] for item in json_list if isinstance(item, dict) and key in item]
    return ', '.join(names[:max_count]) if names else None

# Define a funtion to get the second "name" of the json and convert into object
def get_second_name_from_json(json_list_str, key='name'):
    json_list = json.loads(json_list_str)
    names = [item[key] for item in json_list if isinstance(item, dict) and key in item]
    return names[1] if len(names) >= 2 else None

# Apply the functions to the corresponding columns
df['genres'] = df['genres'].apply(get_names_from_json)
df['keywords'] = df['keywords'].apply(get_names_from_json)
df['cast'] = df['cast'].apply(get_names_from_json, max_count=3)
df['crew'] = df['crew'].apply(get_second_name_from_json)
df["overview"] = df["overview"].tolist()

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genres'] = df['genres'].apply(get_names_from_json)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keywords'] = df['keywords'].apply(get_names_from_json)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cast'] = df['cast'].apply(get_names_from_json, max_count=3)
A value is trying to be set

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","Sam Worthington, Zoe Saldana, Sigourney Weaver",Rick Carter
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Johnny Depp, Orlando Bloom, Keira Knightley",Gore Verbinski
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...","Daniel Craig, Christoph Waltz, Léa Seydoux",Sam Mendes
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...","Christian Bale, Michael Caine, Gary Oldman",Charles Roven
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...","Taylor Kitsch, Lynn Collins, Samantha Morton",Andrew Stanton
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"Action, Crime, Thriller","united states–mexico barrier, legs, arms, pape...","Carlos Gallardo, Jaime de Hoyos, Peter Marquardt",Robert Rodriguez
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"Comedy, Romance",,"Edward Burns, Kerry Bishé, Marsha Dietlein",Edward Burns
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","Comedy, Drama, Romance, TV Movie","date, love at first sight, narration, investig...","Eric Mabius, Kristin Booth, Crystal Lowe",Harvey Kahn
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,,,"Daniel Henney, Eliza Coupe, Bill Paxton",Daniel Hsia


In [16]:
# Concatenate values from multiple columns to create the "tags" column
df["tags"] = df.apply(
    lambda row: ', '.join(map(str, [row["genres"], row["keywords"], row["cast"], row["crew"], row["overview"]])),
    axis=1
)

# Convert the "tags" column to a string
df["tags"] = df["tags"].astype(str)

# Replace commas with spaces in the "tags" column
df["tags"] = df["tags"].str.replace(',', " ")

# Display the modified "tags" column for the first few entries
df["tags"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = df["tags"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = df["tags"].str.replace(',', " ")


0    Action  Adventure  Fantasy  Science Fiction  c...
1    Adventure  Fantasy  Action  ocean  drug abuse ...
2    Action  Adventure  Crime  spy  based on novel ...
3    Action  Crime  Drama  Thriller  dc comics  cri...
4    Action  Adventure  Science Fiction  based on n...
Name: tags, dtype: object

In [17]:
# Drop specified columns from the DataFrame
total_data = df.drop(columns=["overview", "genres", "keywords", "cast", "crew"])

# Display the first few rows of the modified DataFrame
total_data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Action Adventure Fantasy Science Fiction c...
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drug abuse ...
2,206647,Spectre,Action Adventure Crime spy based on novel ...
3,49026,The Dark Knight Rises,Action Crime Drama Thriller dc comics cri...
4,49529,John Carter,Action Adventure Science Fiction based on n...


In [18]:
# Export the final dataframe to .csv to directory
total_data.to_csv("/workspaces/alfonsoMG_KNN/data/processed/tag_data.csv", index = False)

# Model Training

After the careful selection of features and the cleansing of our dataset, the subsequent phase entails a crucial process known as vectorization. Vectorization involves transforming the textual or categorical data into a numerical format that is suitable for machine learning algorithms. In the context of a movie recommender system, this typically involves converting movie-related information, such as genres, ratings, or any other relevant features, into a numerical representation.

The vectorized data essentially creates a structured numerical input that can be utilized to train our machine learning model. This transformation is essential because most machine learning algorithms, including K-Nearest Neighbors (KNN), require numerical input for training.

Commonly, techniques like one-hot encoding or embedding are employed for vectorization, depending on the nature of the data. Once the dataset is successfully vectorized, it is ready to be used as input for the training of our KNN model, enabling it to learn patterns and relationships within the data for the subsequent movie recommendations.

In [19]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(total_data["tags"])

# K-nearest neighbors model
model = NearestNeighbors(n_neighbors=6, algorithm="brute", metric="cosine")
model.fit(tfidf_matrix)

# Function to get movie recommendations
def get_movie_recommendations(movie_title):
    movie_index = total_data[total_data["title"] == movie_title].index[0]
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
    similar_movies = [(total_data["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]

# Example input movie title
input_movie = "El Mariachi"
recommendations = get_movie_recommendations(input_movie)

# Print movie recommendations
print("Film recommendations for '{}':".format(input_movie))
for movie, distance in recommendations:
    print("- Film: {}, Distance: {}".format(movie, distance))

# Save the trained KNN model to a file
dump(model, open(f"/workspaces/alfonsoMG_KNN/models/knn_model.pk", "wb"))

Film recommendations for 'El Mariachi':
- Film: Desperado, Distance: 0.789186566205919
- Film: Once Upon a Time in Mexico, Distance: 0.8298443514274817
- Film: A Civil Action, Distance: 0.855926168969773
- Film: Traffic, Distance: 0.8776641829265892
- Film: The Man, Distance: 0.8902887811754823


## Conclusion

After experimenting with various movie inputs, it is evident that the movie recommender system effectively provides tailored recommendations, consistently suggesting similar films. In this regard, the recommender system demonstrates its functionality. However, a notable limitation is that the movies inputted must be pre-registered in the database. While this constraint might be intuitive, it's essential to be aware that the recommender system's effectiveness is contingent on the availability of relevant movie data in the underlying dataset. Despite this limitation, the system serves as a valuable tool for users seeking personalized movie suggestions based on their preferences.