In [1]:
import pandas as pd
import numpy as np


In [2]:
import pandas as pd
import numpy as np

# Load ratings data
ratings = pd.read_csv("u.data", sep="\t", names=["userId", "movieId", "rating", "timestamp"])

# Load movie data
movies = pd.read_csv("u.item", sep="|", encoding="latin-1", usecols=[0, 1], names=["movieId", "title"])

# Load genres (optional)
genres = pd.read_csv("u.genre", sep="|", names=["genre", "genreId"], encoding="latin-1")

# Display the first few rows
print("Ratings Data:")
display(ratings.head())

print("Movies Data:")
display(movies.head())

print("Genres Data:")
display(genres.head())


Ratings Data:


Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Movies Data:


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Genres Data:


Unnamed: 0,genre,genreId
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [3]:
# Merge ratings with movies on movieId
movie_data = pd.merge(ratings, movies, on="movieId")

# Show the merged dataset
movie_data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
# Copy movie dataset (remove duplicates)
movie_features = movies.copy()

# Create a Count Vectorizer for movie titles
cv = CountVectorizer(stop_words="english")
title_matrix = cv.fit_transform(movie_features['title'])

# Calculate cosine similarity
similarity = cosine_similarity(title_matrix)


In [6]:
def recommend_movie(movie_name):
    # Check if movie exists
    if movie_name not in movie_features['title'].values:
        print("Movie not found in the dataset!")
        return

    # Get index of the movie
    index = movie_features[movie_features['title'] == movie_name].index[0]
    
    # Get similarity scores
    distances = list(enumerate(similarity[index]))
    
    # Sort movies by similarity (top 5 excluding the movie itself)
    movies_list = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]
    
    print(f"Top 5 recommendations for '{movie_name}':\n")
    for i in movies_list:
        print(movie_features.iloc[i[0]].title)


In [7]:
recommend_movie("Toy Story (1995)")


Top 5 recommendations for 'Toy Story (1995)':

Now and Then (1995)
Pyromaniac's Love Story, A (1995)
Show, The (1995)
To Have, or Not (1995)
GoldenEye (1995)


In [8]:
# Reload movie data with genre columns
genre_cols = [
    "movieId", "title", "unknown", "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]

movies_full = pd.read_csv("u.item", sep="|", encoding="latin-1", names=genre_cols, usecols=range(21))
movies_full.head()


Unnamed: 0,movieId,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [9]:
# Combine genres into a single string
def get_genres(row):
    genres_list = []
    for col in genre_cols[2:]:  # Skip movieId and title
        if row[col] == 1:
            genres_list.append(col)
    return " ".join(genres_list)

movies_full["genres"] = movies_full.apply(get_genres, axis=1)
movies_full[["title", "genres"]].head()


Unnamed: 0,title,genres
0,Toy Story (1995),Crime Documentary Drama
1,GoldenEye (1995),Children's Comedy
2,Four Rooms (1995),
3,Get Shorty (1995),Children's Drama Horror
4,Copycat (1995),Fantasy Horror


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a Count Vectorizer for genres
cv = CountVectorizer(stop_words="english")
vector = cv.fit_transform(movies_full["genres"])

# Compute cosine similarity
similarity = cosine_similarity(vector)

# Recommendation function
def recommend_movie(movie_name):
    if movie_name not in movies_full['title'].values:
        print("Movie not found in the dataset!")
        return

    index = movies_full[movies_full['title'] == movie_name].index[0]
    distances = list(enumerate(similarity[index]))
    movies_list = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]

    print(f"Top 5 recommendations for '{movie_name}':\n")
    for i in movies_list:
        print(movies_full.iloc[i[0]].title)


In [11]:
recommend_movie("Toy Story (1995)")


Top 5 recommendations for 'Toy Story (1995)':

Aladdin and the King of Thieves (1996)
Goofy Movie, A (1995)
Santa Clause, The (1994)
Home Alone (1990)
Aristocats, The (1970)


In [14]:
# Create pivot table (rows: users, columns: movies)
user_movie_matrix = movie_data.pivot_table(index="userId", columns="title", values="rating")
user_movie_matrix.head()


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill NaN with 0 for similarity calculation
user_movie_matrix_filled = user_movie_matrix.fillna(0)

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix_filled)

# Convert to DataFrame
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)
user_similarity_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.168937,0.048388,0.064561,0.37967,0.429682,0.443097,0.320079,0.078385,0.377733,...,0.372213,0.11986,0.26986,0.193343,0.197949,0.118722,0.315064,0.149086,0.181612,0.399432
2,0.168937,1.0,0.113393,0.179694,0.073623,0.242106,0.108604,0.104257,0.16247,0.161273,...,0.147095,0.310661,0.363328,0.410725,0.322713,0.231096,0.228793,0.162911,0.175273,0.106732
3,0.048388,0.113393,1.0,0.349781,0.021592,0.074018,0.067423,0.084419,0.062039,0.066217,...,0.033885,0.043453,0.16714,0.071288,0.126278,0.026758,0.164539,0.102899,0.136757,0.02699
4,0.064561,0.179694,0.349781,1.0,0.031804,0.068431,0.091507,0.18806,0.101284,0.060859,...,0.054615,0.036784,0.133619,0.196561,0.146058,0.030202,0.196858,0.152041,0.171538,0.058752
5,0.37967,0.073623,0.021592,0.031804,1.0,0.238636,0.374733,0.24893,0.056847,0.201427,...,0.340183,0.08058,0.095284,0.081053,0.148607,0.071612,0.239955,0.139595,0.153799,0.313941


In [16]:
def recommend_for_user(user_id, top_n=5):
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]  # Exclude self
    
    recommended_movies = {}

    for similar_user in similar_users:
        user_ratings = user_movie_matrix.loc[similar_user].dropna()
        for movie, rating in user_ratings.items():
            if movie not in user_movie_matrix.loc[user_id] or pd.isna(user_movie_matrix.loc[user_id, movie]):
                recommended_movies[movie] = recommended_movies.get(movie, 0) + rating
    
    # Sort movies by aggregated score
    recommended_movies = sorted(recommended_movies.items(), key=lambda x: x[1], reverse=True)[:top_n]

    print(f"Top {top_n} recommendations for User {user_id}:\n")
    for movie, score in recommended_movies:
        print(f"{movie} (Score: {score:.2f})")


In [17]:
recommend_for_user(1)


Top 5 recommendations for User 1:

English Patient, The (1996) (Score: 1759.00)
Scream (1996) (Score: 1645.00)
Air Force One (1997) (Score: 1565.00)
Liar Liar (1997) (Score: 1531.00)
Titanic (1997) (Score: 1486.00)


In [18]:
def hybrid_recommend(user_id, movie_name, top_n=5, alpha=0.5):
    """
    Hybrid Recommendation System
    alpha: weight (0 to 1) - 0.5 means equal weight for both methods
    """
    # ---- Content-Based Part ----
    if movie_name not in movies_full['title'].values:
        print("Movie not found in dataset!")
        return

    index = movies_full[movies_full['title'] == movie_name].index[0]
    distances = list(enumerate(similarity[index]))
    content_scores = {movies_full.iloc[i[0]].title: i[1] for i in distances}

    # ---- Collaborative Part ----
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]
    collab_scores = {}

    for similar_user in similar_users:
        user_ratings = user_movie_matrix.loc[similar_user].dropna()
        for movie, rating in user_ratings.items():
            if movie not in user_movie_matrix.loc[user_id] or pd.isna(user_movie_matrix.loc[user_id, movie]):
                collab_scores[movie] = collab_scores.get(movie, 0) + rating

    # ---- Combine Scores ----
    hybrid_scores = {}

    for movie in set(content_scores.keys()).union(collab_scores.keys()):
        content_score = content_scores.get(movie, 0)
        collab_score = collab_scores.get(movie, 0)
        hybrid_scores[movie] = alpha * content_score + (1 - alpha) * collab_score

    # Sort by score
    hybrid_scores = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    print(f"Top {top_n} Hybrid Recommendations for User {user_id} based on '{movie_name}':\n")
    for movie, score in hybrid_scores:
        print(f"{movie} (Score: {score:.2f})")


In [19]:
hybrid_recommend(1, "Toy Story (1995)")


Top 5 Hybrid Recommendations for User 1 based on 'Toy Story (1995)':

English Patient, The (1996) (Score: 879.50)
Scream (1996) (Score: 822.50)
Air Force One (1997) (Score: 782.50)
Liar Liar (1997) (Score: 765.79)
Titanic (1997) (Score: 743.00)
