In [7]:
# Cell 1: Load Libraries and Data

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# IMPORTANT: point DATA_DIR at the folder that holds the ml-100k files.
# Use the 'r' prefix for raw strings to handle the backslashes correctly.
# Both file paths are derived from DATA_DIR so there is only one place to edit.
DATA_DIR = r'C:\Users\5745o\OneDrive\Desktop\Project\movie-recommendation-system\data\ml-100k'
DATA_PATH_RATING = DATA_DIR + r'\u.data'
DATA_PATH_MOVIE = DATA_DIR + r'\u.item'

try:
    # Load ratings data (u.data): read with a tab separator; column names are
    # supplied here via `names`.
    ratings = pd.read_csv(
        DATA_PATH_RATING,
        sep='\t',
        names=['user_id', 'movie_id', 'rating', 'timestamp']
    )

    # Load movie metadata (u.item): read with a '|' separator as latin-1 text;
    # only the first two columns (id and title) are kept.
    movies = pd.read_csv(
        DATA_PATH_MOVIE,
        sep='|',
        encoding='latin-1',
        usecols=[0, 1],
        names=['movie_id', 'title']
    )
except FileNotFoundError:
    # Either file may be the missing one, so report both attempted paths.
    print("CRITICAL ERROR: File not found. The copied path is incorrect or the file name is wrong.")
    print(f"Attempted path for ratings: {DATA_PATH_RATING}")
    print(f"Attempted path for movies: {DATA_PATH_MOVIE}")
    # Re-raise so the notebook stops here: continuing would make later cells
    # fail with a NameError or silently reuse stale `ratings`/`movies` objects
    # left in the kernel from an earlier run.
    raise
except Exception as e:
    print(f"An unexpected error occurred during file reading: {e}")
    raise
else:
    # Only reached when both files were read without error.
    print("SUCCESS! Data loaded.")
    print(f"Total Ratings Loaded: {len(ratings)}")
    display(ratings.head())

SUCCESS! Data loaded.
Total Ratings Loaded: 100000


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
# Cell 2: Merging Data (Completes Week 2)

# Attach each rating's movie title by joining the two frames on movie_id.
merged_data = ratings.merge(movies, on='movie_id')

# Working frame for the following cells: same rows, without the timestamp column.
df_cleaned = merged_data.drop(columns='timestamp')

print("Data merged successfully.")
display(df_cleaned.head())

Data merged successfully.


Unnamed: 0,user_id,movie_id,rating,title
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [9]:
# Cell 3: Feature Engineering - Creating the User-Item Matrix (Completes Week 3)

# Reshape the long ratings table into a wide grid:
# one row per user_id, one column per movie title, cell value = rating.
# When a (user, title) pair occurs more than once, the ratings are averaged.
rating_pivot = pd.pivot_table(
    df_cleaned,
    values='rating',
    index='user_id',
    columns='title',
    aggfunc='mean',
)

# Cells with no rating come out of the pivot as NaN; store them as 0 instead.
user_item_matrix = rating_pivot.fillna(0)

print("User-Item Matrix created.")
print(f"Matrix Shape (Users x Movies): {user_item_matrix.shape}")

User-Item Matrix created.
Matrix Shape (Users x Movies): (943, 1664)


In [11]:
# Cell 4: Model Building - Item-Item Collaborative Filtering (Completes Week 4)

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Sparse copy of the user-item matrix (rows = users, columns = movies).
item_matrix_sparse = csr_matrix(user_item_matrix)

# Transposing puts movies on the rows, so the pairwise cosine similarity
# below compares movies with each other rather than users.
item_similarity = cosine_similarity(item_matrix_sparse.transpose())

# Label both axes of the similarity array with the movie titles so that
# scores can be looked up by name.
movie_titles = user_item_matrix.columns
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=movie_titles,
    columns=movie_titles,
)

print("Model built: Item Similarity Matrix calculated.")

Model built: Item Similarity Matrix calculated.


In [12]:
# Cell 5: Recommendation Function and Test (Completes Week 5)

def get_movie_recommendations(movie_title, similarity_df, n_recommendations=5):
    """
    Display the top N movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : label
        Column of ``similarity_df`` to look up.
    similarity_df : pandas.DataFrame
        Similarity scores. The column named ``movie_title`` is read and its
        index labels are treated as the candidate movies.
    n_recommendations : int, default 5
        Maximum number of rows to show.

    Returns
    -------
    None
        The result is printed and displayed, not returned. If
        ``movie_title`` is not a column of ``similarity_df`` an error
        message is printed instead of raising.
    """
    try:
        movie_similarities = similarity_df[movie_title]
    except KeyError:
        print(f"Error: Movie '{movie_title}' not found. Try 'Star Wars (1977)' or 'Fargo (1996)'.")
        return

    # Remove the query movie by label rather than by skipping the first
    # sorted row: when another movie ties with it at the top score, the
    # query movie is not guaranteed to sort first, and a positional skip
    # would drop a real candidate and recommend the movie to itself.
    other_movies = movie_similarities.drop(labels=movie_title, errors='ignore')
    top_similar_movies = other_movies.sort_values(ascending=False).head(max(n_recommendations, 0))

    print(f"\n--- Top {n_recommendations} Recommendations for '{movie_title}' ---")
    display(top_similar_movies.to_frame(name='Similarity Score'))

# --- Final Test: look up the movies most similar to 'Star Wars (1977)' ---
get_movie_recommendations(
    movie_title='Star Wars (1977)',
    similarity_df=item_similarity_df,
    n_recommendations=5,
)


--- Top 5 Recommendations for 'Star Wars (1977)' ---


Unnamed: 0_level_0,Similarity Score
title,Unnamed: 1_level_1
Return of the Jedi (1983),0.884476
Raiders of the Lost Ark (1981),0.764885
"Empire Strikes Back, The (1980)",0.749819
Toy Story (1995),0.734572
"Godfather, The (1972)",0.697332


In [13]:
# Cell 6: Evaluation - Sparsity and Metrics Discussion

# Sparsity of the User-Item Matrix:
# the percentage of cells that hold 0 out of all user x movie cells.
n_users, n_movies = user_item_matrix.shape
ratings_count = n_users * n_movies

# Column-wise counts of zero cells, then summed across all columns.
zero_ratings = user_item_matrix.eq(0).sum().sum()

sparsity = zero_ratings / ratings_count * 100

print(f"Total possible ratings (Users x Movies): {ratings_count}")
print(f"Number of zero ratings (Unrated movies): {zero_ratings}")
print(f"Data Sparsity: {sparsity:.2f}%")
print("\nConclusion: The high sparsity shows why predicting ratings is difficult and collaboration is needed.")

Total possible ratings (Users x Movies): 1569152
Number of zero ratings (Unrated movies): 1469459
Data Sparsity: 93.65%

Conclusion: The high sparsity shows why predicting ratings is difficult and collaboration is needed.


In [14]:
# Cell 7: Final Recommendation Output

# Final result: repeat the 'Star Wars (1977)' query so its recommendation table
# appears as the last output.
get_movie_recommendations(
    'Star Wars (1977)',
    similarity_df=item_similarity_df,
    n_recommendations=5,
)


--- Top 5 Recommendations for 'Star Wars (1977)' ---


Unnamed: 0_level_0,Similarity Score
title,Unnamed: 1_level_1
Return of the Jedi (1983),0.884476
Raiders of the Lost Ark (1981),0.764885
"Empire Strikes Back, The (1980)",0.749819
Toy Story (1995),0.734572
"Godfather, The (1972)",0.697332
