# In [1]:
# Import necessary libraries
import numpy as np  # NumPy is used for numerical operations
import pandas as pd  # Pandas is used for data manipulation and analysis
import difflib  # This library is used to find close matches in strings
from sklearn.feature_extraction.text import TfidfVectorizer  # This will convert text to numerical format
from sklearn.metrics.pairwise import cosine_similarity  # This will measure the similarity between two numerical data points

# Load the movie metadata from "movies.csv" (resolved against the current working directory).
dataset = pd.read_csv("movies.csv")

# NOTE: a bare expression such as `dataset.head()` is only rendered when it is
# the last line of a notebook cell; in a script its value is silently thrown
# away. Each inspection result is therefore printed explicitly.

# Preview the first five rows to understand the structure of the data.
print(dataset.head())

# Count the missing values in each column.
print(dataset.isnull().sum())

# Report the dataset dimensions as (rows, columns).
print(dataset.shape)

# Text columns that describe each movie and drive the recommendations.
selected_feature = ["genres", "keywords", "tagline", "cast", "crew", "original_language", "overview"]

# Replace missing values with a blank placeholder so the string join below
# never encounters NaN (which would turn the whole combined row into NaN).
dataset[selected_feature] = dataset[selected_feature].fillna(" ")

# Combine every selected feature into one space-separated text per movie.
# The text is built from `selected_feature` itself, so a column that appears in
# the list (for example "overview") can never be left out of the combined text
# by accident.
selected_columns = dataset[selected_feature[0]].str.cat(
    dataset[selected_feature[1:]], sep=" "
)

# Create the TF-IDF vectorizer that converts each movie's combined text into numbers.
tfidf = TfidfVectorizer()

# Learn the vocabulary from the combined text and encode every movie as one row
# of the resulting feature matrix.
feature_vector = tfidf.fit_transform(selected_columns)

# Print the feature vectors for inspection (optional).
print(feature_vector)

# Compute the cosine similarity between every pair of movies; entry [a, b]
# scores how alike the movies in rows a and b are.
similarity = cosine_similarity(feature_vector)
print(similarity)

# Print the matrix dimensions explicitly: a bare `similarity.shape` expression
# is discarded when this code runs as a script rather than as a notebook cell.
print(similarity.shape)

# Ask the user which movie the recommendations should be based on.
movie_name = input("Enter your favorite movie name: ")

# Collect every known title. Rows without a title are skipped because difflib
# can only compare strings.
list_of_all_movies = dataset["title"].dropna().tolist()

# Fuzzy-match the user's input against the known titles (best matches first).
find_close_max = difflib.get_close_matches(movie_name, list_of_all_movies)

# Stop with a readable message instead of an IndexError when no title
# resembles the input.
if not find_close_max:
    raise SystemExit(f"No movie title similar to {movie_name!r} was found.")

# Show every candidate that was found.
for movie in find_close_max:
    print(movie)

# Treat the best-ranked candidate as the movie the user meant, and show it.
close_match = find_close_max[0]
print(close_match)

# Locate the row position of the chosen title. The similarity matrix is built
# row-by-row from `dataset`, so the position (rather than a CSV column that
# happens to be called "index") is what addresses the matching matrix row.
index_of_movie = int(np.flatnonzero((dataset["title"] == close_match).to_numpy())[0])

# Pair every movie's row position with its similarity to the chosen movie.
similarity_score = list(enumerate(similarity[index_of_movie]))

# Rank the (position, score) pairs from most to least similar.
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

# Print the suggested movies for the user.
print("Movies suggested for you: \n")

# Only the 20 best-scoring entries are shown, so cut the ranked list up front
# instead of performing a DataFrame lookup for every movie in the dataset.
top_matches = sorted_similar_movies[:20]

# Number the suggestions starting at 1; the similarity score is not displayed.
for i, movie in enumerate(top_matches, start=1):
    index = movie[0]  # Row position of the suggested movie
    # The similarity matrix follows the dataset's row order, so the title is
    # fetched by position rather than by index label.
    title_from_index = dataset["title"].iloc[index]
    print(i, ".", title_from_index)

# Second part of the movie recommendation system
# Ask the user for another movie to base recommendations on.
movie_name1 = input('Enter your favorite movie name: ')

# Collect every known title. Rows without a title are skipped because difflib
# can only compare strings.
list_of_all_titles1 = dataset['title'].dropna().tolist()

# Fuzzy-match the user's input against the known titles (best matches first).
find_close_match1 = difflib.get_close_matches(movie_name1, list_of_all_titles1)

# Stop with a readable message instead of an IndexError when no title
# resembles the input.
if not find_close_match1:
    raise SystemExit(f"No movie title similar to {movie_name1!r} was found.")

# Treat the best-ranked candidate as the movie the user meant.
close_match1 = find_close_match1[0]

# Locate the row position of the chosen title. The similarity matrix is built
# row-by-row from `dataset`, so the position (rather than a CSV column that
# happens to be called "index") is what addresses the matching matrix row.
index_of_the_movie1 = int(np.flatnonzero((dataset['title'] == close_match1).to_numpy())[0])

# Pair every movie's row position with its similarity to the chosen movie.
similarity_score1 = list(enumerate(similarity[index_of_the_movie1]))

# Rank the (position, score) pairs from most to least similar.
sorted_similar_movies1 = sorted(similarity_score1, key=lambda x: x[1], reverse=True)

# Print the suggested movies for the user based on the second input.
print('Movies suggested for you: \n')

# The printed ranks run from 1 to 29, so only the 29 best-scoring entries are
# needed; slicing avoids a DataFrame lookup for every movie in the dataset.
top_matches1 = sorted_similar_movies1[:29]

# Number the suggestions starting at 1; the similarity score is not displayed.
for i, movie in enumerate(top_matches1, start=1):
    index1 = movie[0]  # Row position of the suggested movie
    # The similarity matrix follows the dataset's row order, so the title is
    # fetched by position rather than by index label.
    title_from_index1 = dataset['title'].iloc[index1]
    print(i, '.', title_from_index1)
