In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Locations of the MovieLens 100k ratings file and movie metadata file.
file_path = r'D:\RecommandationSystem\ml-100k\u.data'
movies_path = r'D:\RecommandationSystem\ml-100k\u.item'

# The ratings file is tab-separated; column names are supplied here.
columns = ['user id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    file_path,
    sep='\t',
    names=columns,
)
df.head()

Unnamed: 0,user id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movies_path = r'D:\RecommandationSystem\ml-100k\u.item'

# Field layout used for u.item: five metadata fields followed by 19 genre columns.
movie_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
movie_columns = movie_columns + [f'genre_{i}' for i in range(19)]

# Only the id and the title are kept.
movies_df = pd.read_csv(
    movies_path,
    sep='|',
    names=movie_columns,
    usecols=['item_id', 'title'],
    encoding='latin-1',
)

movies_df.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Load Movie Metadata
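Read the `u.item` file containing movie information (ID and title) from the MovieLens dataset, keeping only the `item_id` and `title` columns for further use. The same cell also pivots the ratings table into `user_item_matrix` (users as rows, movies as columns, ratings as values).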


In [4]:
# Field layout used for u.item: five metadata fields followed by 19 genre columns.
movie_columns = (
    ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
    + [f'genre_{i}' for i in range(19)]
)

movies_df = pd.read_csv(
    movies_path,
    sep='|',
    names=movie_columns,
    encoding='latin-1',
    usecols=['item_id', 'title'],
)

# One row per user, one column per item, ratings as cell values.
user_item_matrix = df.pivot_table(values='rating', index='user id', columns='item_id')

movies_df.head(5)

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# Inspect the pivoted user x item rating matrix built in the previous cell.
user_item_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Missing ratings become 0 so the matrix has no NaN when similarities are computed.
item_matrix_filled = user_item_matrix.fillna(value=0)

###  Compute Item-Item Similarity
Calculate pairwise cosine similarity between items based on user ratings:  
- Use `cosine_similarity` on the transposed item matrix (`item_matrix_filled.T`).  
- Create a DataFrame `item_similarity_df` with item IDs as rows and columns.  
- Display the top-left 5x5 block of the similarity matrix for inspection.


In [7]:

from sklearn.metrics.pairwise import cosine_similarity


# Transposing puts items on the rows, so the pairwise similarity is item-to-item.
item_similarity = cosine_similarity(item_matrix_filled.transpose())

# Label both axes with the item ids so similarities can be looked up by id.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=item_matrix_filled.columns,
    columns=item_matrix_filled.columns,
)

item_similarity_df.iloc[:5, :5]

item_id,1,2,3,4,5
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714
2,0.402382,1.0,0.273069,0.502571,0.318836
3,0.330245,0.273069,1.0,0.324866,0.212957
4,0.454938,0.502571,0.324866,1.0,0.334239
5,0.286714,0.318836,0.212957,0.334239,1.0


###  Find Similar Movies
Define a function `get_similar_movies_named()` to retrieve the top `n` movies similar to a given movie based on item-item cosine similarity:  
- Check if the `movie_id` exists in the similarity matrix.  
- Drop the movie itself from the similarity scores.  
- Sort and select the top `n` most similar movie IDs.  
- Retrieve and return their titles from `movies_df` in the same order.  


In [23]:
def get_similar_movies_named(movie_id, n=5):
    """Return the titles of the ``n`` items most similar to ``movie_id``.

    Similarities are read from the ``movie_id`` column of ``item_similarity_df``
    and titles are looked up in ``movies_df``.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered from
        most to least similar. If ``movie_id`` is not a column of
        ``item_similarity_df``, a message is printed and ``[]`` is returned.
    """
    if movie_id not in item_similarity_df.columns:
        print("The desired movie is not in the dataset.")
        return []

    # Leave the movie itself out, then keep the n best-scoring neighbours.
    neighbour_scores = item_similarity_df[movie_id].drop(movie_id)
    top_ids = neighbour_scores.sort_values(ascending=False).index[:n]

    title_rows = movies_df.loc[movies_df['item_id'].isin(top_ids), ['item_id', 'title']]

    # Selecting by the ranked ids puts the titles back in similarity order.
    return title_rows.set_index('item_id').loc[top_ids]

In [24]:
# Titles of the five items most similar to item 50 by rating-based cosine similarity.
get_similar_movies_named(50, n=5)

Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
181,Return of the Jedi (1983)
174,Raiders of the Lost Ark (1981)
172,"Empire Strikes Back, The (1980)"
1,Toy Story (1995)
127,"Godfather, The (1972)"


###  Recommend Movies for a User
Define `recommend_movies_for_user()` to generate movie recommendations for a specific user:  
- Check if the user exists in the dataset.  
- For each unrated movie, predict a rating using weighted averages of similar movies the user has rated.  
- Return the top `n_recommendations` movie titles sorted by predicted rating.


In [None]:
def recommend_movies_for_user(user_id, n_recommendations=5):
    """Recommend unrated movies for ``user_id`` using item-item similarity.

    For every item the user has not rated, a rating is predicted as the
    similarity-weighted average of the user's existing ratings, with weights
    taken from ``item_similarity_df``.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        n_recommendations: Maximum number of titles to return.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered by
        descending predicted rating. If ``user_id`` is not in
        ``user_item_matrix``, a message is printed and ``[]`` is returned.
    """
    if user_id not in user_item_matrix.index:
        print("The desired user is not in the dataset.")
        return []

    user_ratings = user_item_matrix.loc[user_id]
    user_rated_movies = user_ratings[user_ratings.notna()].index

    predicted_ratings = {}

    for movie_id in user_item_matrix.columns:
        if movie_id in user_rated_movies:
            continue

        similar_movies = item_similarity_df[movie_id].drop(movie_id)
        similar_movies = similar_movies[similar_movies.index.isin(user_rated_movies)]

        if similar_movies.empty:
            continue

        weights = similar_movies.values
        weight_sum = np.sum(weights)

        # With no positive total weight the weighted average is undefined
        # (0/0 -> NaN), and a NaN score would corrupt the ranking below.
        if not weight_sum > 0:
            continue

        ratings = user_ratings[similar_movies.index].values
        predicted_ratings[movie_id] = np.dot(weights, ratings) / weight_sum

    recommended_movie_ids = sorted(
        predicted_ratings, key=predicted_ratings.get, reverse=True
    )[:n_recommendations]

    recommended_titles = movies_df[movies_df['item_id'].isin(recommended_movie_ids)][['item_id', 'title']]
    # Selecting by the ranked ids restores the predicted-rating order.
    recommended_titles = recommended_titles.set_index('item_id').loc[recommended_movie_ids]

    return recommended_titles

In [11]:
# Top five rating-based recommendations for user 100.
recommend_movies_for_user(100, n_recommendations=5)

Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
1619,All Things Fair (1996)
1556,Condition Red (1995)
1674,Mamma Roma (1962)
1661,"New Age, The (1994)"
1616,Desert Winds (1995)


###  Load Movies with Genres
Read the full `u.item` file from the MovieLens dataset:  
- Define columns for movie metadata and genre indicators.  
- Load the data into `movie_df_full` and set `item_id` as the index.  
- Extract only genre columns into `movie_genres` for further analysis.  
- Display the first few rows of the genre data.


In [12]:
# Names given to the 19 genre columns of u.item, in file order.
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

movie_columns = [
    'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'
] + genre_columns

# Load every column this time and index the table by item id.
movie_df_full = pd.read_csv(
    movies_path,
    sep='|',
    names=movie_columns,
    encoding='latin-1'
).set_index('item_id')

# Genre-only view of the metadata, one row per item id.
movie_genres = movie_df_full[genre_columns]

movie_genres.head()

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


### Compute Genre-Based Similarity
Calculate cosine similarity between movies based on their genre vectors:  
- Use `cosine_similarity` on `movie_genres` to get pairwise similarities.  
- Store results in `genre_similarity_df` with movie IDs as index and columns.  
- Display the top-left 5x5 block of the similarity matrix.


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Rows of movie_genres are items, so no transpose is needed here.
genre_similarity = cosine_similarity(movie_genres)

# Label both axes with the item ids from the genre table.
genre_similarity_df = pd.DataFrame(
    data=genre_similarity,
    index=movie_genres.index,
    columns=movie_genres.index,
)

genre_similarity_df.iloc[:5, :5]

item_id,1,2,3,4,5
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.0,0.0,0.333333,0.0
2,0.0,1.0,0.57735,0.333333,0.333333
3,0.0,0.57735,1.0,0.0,0.57735
4,0.333333,0.333333,0.0,1.0,0.333333
5,0.0,0.333333,0.57735,0.333333,1.0


###  Combine Similarities for Hybrid Recommendation
Create a hybrid similarity matrix by weighting item-item similarity and genre-based similarity:  
- Use `alpha` to balance between rating-based and genre-based similarities.  
- Compute weighted sum of the two similarity matrices.  
- Store the result in `hybrid_similarity_df`.  
- Display the first 5x5 block of the hybrid similarity matrix.


In [14]:
# Share of the hybrid score taken from rating-based similarity; the remainder
# (1 - alpha) comes from genre similarity.
alpha = 0.7

# Put the genre similarities on exactly the item ids (and order) of the
# rating-based matrix before combining. Re-labelling the raw `.values` of the
# sum would silently assume both matrices share the same ids in the same order.
genre_similarity_aligned = genre_similarity_df.reindex(
    index=item_similarity_df.index,
    columns=item_similarity_df.columns,
)

hybrid_similarity = alpha * item_similarity_df + (1 - alpha) * genre_similarity_aligned

# Already a DataFrame labelled by item id on both axes.
hybrid_similarity_df = hybrid_similarity

hybrid_similarity_df.iloc[:5, :5]

item_id,1,2,3,4,5
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.281668,0.231171,0.418457,0.200699
2,0.281668,1.0,0.364354,0.4518,0.323185
3,0.231171,0.364354,1.0,0.227406,0.322275
4,0.418457,0.4518,0.227406,1.0,0.333968
5,0.200699,0.323185,0.322275,0.333968,1.0


###  Hybrid Movie Recommendation for a User
Generate movie recommendations by combining user ratings and hybrid similarity matrix:  
- Verify if the user exists in the dataset.  
- Identify movies already rated by the user.  
- For each unrated movie, compute predicted rating using weighted average of similar movies rated by the user (based on `hybrid_similarity_df`).  
- Return top `n_recommendations` movie titles sorted by predicted rating.


In [None]:
def recommend_movies_for_user_hybrid(user_id, n_recommendations=5):
    """Recommend unrated movies for ``user_id`` using the hybrid similarity.

    For every item the user has not rated, a rating is predicted as the
    similarity-weighted average of the user's existing ratings, with weights
    taken from ``hybrid_similarity_df``.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        n_recommendations: Maximum number of titles to return.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered by
        descending predicted rating. If ``user_id`` is not in
        ``user_item_matrix``, a message is printed and ``[]`` is returned.
    """
    if user_id not in user_item_matrix.index:
        print("The desired user is not in the datasets.")
        return []

    user_ratings = user_item_matrix.loc[user_id]
    user_rated_movies = user_ratings[user_ratings.notna()].index

    predicted_ratings = {}

    for movie_id in user_item_matrix.columns:
        if movie_id in user_rated_movies:
            continue

        similar_movies = hybrid_similarity_df[movie_id].drop(movie_id)
        similar_movies = similar_movies[similar_movies.index.isin(user_rated_movies)]

        if similar_movies.empty:
            continue

        weights = similar_movies.values
        weight_sum = np.sum(weights)

        # With no positive total weight the weighted average is undefined
        # (0/0 -> NaN), and a NaN score would corrupt the ranking below.
        if not weight_sum > 0:
            continue

        ratings = user_ratings[similar_movies.index].values
        predicted_ratings[movie_id] = np.dot(weights, ratings) / weight_sum

    recommended_movie_ids = sorted(
        predicted_ratings, key=predicted_ratings.get, reverse=True
    )[:n_recommendations]

    recommended_titles = movies_df[movies_df['item_id'].isin(recommended_movie_ids)][['item_id', 'title']]
    # Selecting by the ranked ids restores the predicted-rating order.
    recommended_titles = recommended_titles.set_index('item_id').loc[recommended_movie_ids]

    return recommended_titles

In [16]:
# Top five recommendations for user 100 using the hybrid (ratings + genre) similarity.
recommend_movies_for_user_hybrid(100, n_recommendations=5)

Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
1582,T-Men (1947)
1562,"Eye of Vichy, The (Oeil de Vichy, L') (1993)"
1587,Terror in a Texas Town (1958)
1476,Raw Deal (1948)
1064,Crossfire (1947)


###  Hybrid Genre-Based Movie Recommendation with User Filtering
Recommend movies for a user with optional genre filtering:  
- Check if the user exists in the dataset.  
- Retrieve movies the user has already rated.  
- If genres are specified, filter candidate movies by those genres.  
- For each candidate movie not rated by the user, predict rating based on weighted average of similar movies rated by the user, using the hybrid similarity matrix.  
- Return top `n_recommendations` movie titles sorted by predicted rating.


In [None]:
def recommend_movies_for_user_hybrid_genre(user_id, n_recommendations=5, genres=None):
    """Recommend unrated movies for ``user_id``, optionally restricted by genre.

    Ratings are predicted as the similarity-weighted average of the user's
    existing ratings, with weights taken from ``hybrid_similarity_df``.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        n_recommendations: Maximum number of titles to return.
        genres: Optional iterable of genre names. Names are matched against the
            columns of ``movie_genres`` ignoring case and surrounding
            whitespace; names that match no column are ignored. A movie is a
            candidate when it is flagged 1 in at least one matched genre.
            When falsy, every column of ``user_item_matrix`` is a candidate.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered by
        descending predicted rating (empty when nothing qualifies). If
        ``user_id`` is not in ``user_item_matrix``, a message is printed and
        ``[]`` is returned.
    """
    if user_id not in user_item_matrix.index:
        print("The desired user is not in the datasets.")
        return []

    user_ratings = user_item_matrix.loc[user_id]
    user_rated_movies = user_ratings[user_ratings.notna()].index

    if genres:
        # The genre columns are capitalised ("Comedy"), so comparing lower-cased
        # input against them directly never matched. Resolve each requested
        # name to its real column through a case-insensitive lookup instead.
        column_by_key = {str(col).strip().lower(): col for col in movie_genres.columns}
        selected = []
        for genre in genres:
            column = column_by_key.get(str(genre).strip().lower())
            if column is not None and column not in selected:
                selected.append(column)

        if selected:
            genre_filter = movie_genres[selected].eq(1).any(axis=1)
            candidate_movies = genre_filter[genre_filter].index
        else:
            # No requested genre exists: nothing can match.
            candidate_movies = movie_genres.index[:0]
    else:
        candidate_movies = user_item_matrix.columns

    predicted_ratings = {}

    for movie_id in candidate_movies:
        if movie_id in user_rated_movies:
            continue

        # Genre candidates come from movie_genres; skip ids that have no
        # similarity column rather than failing on the lookup.
        if movie_id not in hybrid_similarity_df.columns:
            continue

        similar_movies = hybrid_similarity_df[movie_id].drop(movie_id)
        similar_movies = similar_movies[similar_movies.index.isin(user_rated_movies)]

        if similar_movies.empty:
            continue

        weights = similar_movies.values
        weight_sum = np.sum(weights)

        # With no positive total weight the weighted average is undefined
        # (0/0 -> NaN), and a NaN score would corrupt the ranking below.
        if not weight_sum > 0:
            continue

        ratings = user_ratings[similar_movies.index].values
        predicted_ratings[movie_id] = np.dot(weights, ratings) / weight_sum

    recommended_movie_ids = sorted(
        predicted_ratings, key=predicted_ratings.get, reverse=True
    )[:n_recommendations]

    recommended_titles = movies_df[movies_df['item_id'].isin(recommended_movie_ids)][['item_id', 'title']]
    # Selecting by the ranked ids restores the predicted-rating order.
    recommended_titles = recommended_titles.set_index('item_id').loc[recommended_movie_ids]

    return recommended_titles

In [18]:
# No genres are passed, so every item column of user_item_matrix is a candidate.
recommend_movies_for_user_hybrid_genre(100, n_recommendations=5)

Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
1582,T-Men (1947)
1562,"Eye of Vichy, The (Oeil de Vichy, L') (1993)"
1587,Terror in a Texas Town (1958)
1476,Raw Deal (1948)
1064,Crossfire (1947)


In [19]:
# Column sums of the Comedy and Romance columns of movie_genres.
print(movie_genres.loc[:, ['Comedy', 'Romance']].sum())

Comedy     505
Romance    247
dtype: int64


###  Check Genre Filters and User Rated Movies
- Clean and process the input genre list.  
- Print the number of movies available in the selected genres.  
- Filter movies by the selected genres and print the count of candidate movies.  
- For the given user ID, check if the user exists in the dataset and print how many movies the user has rated.  


In [25]:
def check_genre_and_candidates(user_id, genres):
    """Print diagnostic counts for a genre filter and for a user's ratings.

    Prints, in order: the per-genre column sums of ``movie_genres`` for the
    requested genres, the number of movies flagged 1 in at least one of them,
    and how many movies ``user_id`` has rated (or a notice when the user is
    not in ``user_item_matrix``).

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        genres: Iterable of genre names; surrounding whitespace is stripped.
            Names that are not columns of ``movie_genres`` are ignored.

    Returns:
        None. All results are printed.
    """
    genres = [g.strip() for g in genres]

    # Selecting an unknown label with .loc raises KeyError, whereas the
    # candidate filter has always ignored unknown names; restrict both steps
    # to the genres that exist so they agree.
    known_genres = [g for g in genres if g in movie_genres.columns]

    print("Number of films in selected genres:")
    print(movie_genres.loc[:, known_genres].sum())

    if known_genres:
        genre_filter = movie_genres[known_genres].eq(1).any(axis=1)
        candidate_movies = genre_filter[genre_filter].index
    else:
        # No requested genre exists: nothing can match.
        candidate_movies = movie_genres.index[:0]
    print(f"Number of candidate films after genre filter: {len(candidate_movies)}")

    if user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]
        user_rated_movies = user_ratings[user_ratings.notna()].index
        print(f"User {user_id} to {len(user_rated_movies)} The movie has been rated.")
    else:
        print(f"USer {user_id} It is not in the dataset.")

# Print the Comedy/Romance filter counts and user 100's rating count.
check_genre_and_candidates(100, ['Comedy', 'Romance'])

Number of films in selected genres:
Comedy     505
Romance    247
dtype: int64
Number of candidate films after genre filter: 655
User 100 to 59 The movie has been rated.


In [26]:
# Row of user 100; count() tallies the non-missing entries, i.e. the rated items.
user_ratings = user_item_matrix.loc[100]
print(f"Number of videos rated by the user named 100: {user_ratings.count()}")

Number of videos rated by the user named 100: 59


###  Filter Movies by Specific Genres
Define a function `has_genre` to check if a movie belongs to either 'Comedy' or 'Romance'.  
Apply this filter to the `movie_genres` DataFrame to select movies matching these genres.  
Print the count of movies that meet the genre criteria.


In [27]:
def has_genre(row):
    """Return True when ``row`` holds a 1 under 'Comedy' or 'Romance'.

    A genre is only inspected if it is a column of ``movie_genres``.
    """
    return any(
        genre in movie_genres.columns and row[genre] == 1
        for genre in ('Comedy', 'Romance')
    )

# Boolean flag per movie, then the ids of the movies where the flag is True.
genre_filter = movie_genres.apply(has_genre, axis=1)
candidate_movies = genre_filter.index[genre_filter.to_numpy()]
print(f"Number of nominated films with selected genres: {len(candidate_movies)}")

Number of nominated films with selected genres: 655


###  Filter Unseen Candidate Movies for User
- Identify movies that the user has already rated (`user_rated_movies`).  
- Find candidate movies that the user has **not** seen/rated yet (`unseen_candidates`).  
- Print the count of such unseen candidate movies for the given user.


In [None]:
# Ids that user_ratings holds a value for (missing entries dropped).
user_rated_movies = user_ratings.dropna().index

# Candidates the user has no rating for, kept in candidate order.
unseen_candidates = [
    movie_id for movie_id in candidate_movies
    if movie_id not in user_rated_movies
]
print(f"Number of candidate movies that user 100 has not seen: {len(unseen_candidates)}")

Number of candidate movies that user 100 has not seen: 635


###  Filter Movies by Selected Genres
- Define a list of target genres (`Comedy`, `Romance`).  
- Create a function `has_genre` to check if a movie belongs to any of the selected genres.  
- Apply the function to the `movie_genres` DataFrame to filter movies of the chosen genres.  
- Display the count and sample titles of movies matching the genre filter.


In [None]:
genres = ['Comedy', 'Romance']

def has_genre(row):
    """Return True when ``row`` holds a 1 under any genre listed in ``genres``.

    Genres that are not columns of ``movie_genres`` are skipped.
    """
    known = (genre for genre in genres if genre in movie_genres.columns)
    return any(row[genre] == 1 for genre in known)

# Boolean flag per movie, then the ids of the movies where the flag is True.
genre_filter = movie_genres.apply(has_genre, axis=1)
candidate_movies = genre_filter.index[genre_filter.to_numpy()]

print("How many movies do we have in the chosen genre?", len(candidate_movies))

# First ten matching titles, in movies_df order.
print(
    movies_df.loc[movies_df['item_id'].isin(candidate_movies), ['item_id', 'title']].head(10)
)

How many movies do we have in the chosen genre? 655
    item_id                               title
0         1                    Toy Story (1995)
3         4                   Get Shorty (1995)
7         8                         Babe (1995)
12       13             Mighty Aphrodite (1995)
13       14                  Postino, Il (1994)
15       16  French Twist (Gazon maudit) (1995)
16       17          From Dusk Till Dawn (1996)
19       20           Angels and Insects (1995)
20       21       Muppet Treasure Island (1996)
24       25                Birdcage, The (1996)


###  Dynamic Hybrid Similarity Calculation
Calculate a personalized hybrid similarity matrix for a user by adjusting the weight `alpha` based on the number of movies the user has rated:  
- If user not in dataset, raise an error.  
- Compute `alpha` as the minimum of `max_alpha` and `(number of rated movies) / 50`.  
- Combine item-based and genre-based similarity matrices weighted by `alpha`.  
- Return the resulting hybrid similarity matrix for personalized recommendations.


In [None]:
def get_dynamic_hybrid_similarity(user_id, max_alpha=0.8, rating_scale=50):
    """Blend rating-based and genre-based similarity with a per-user weight.

    The weight on ``item_similarity_df`` is
    ``alpha = min(max_alpha, num_rated / rating_scale)``, where ``num_rated``
    is the number of non-missing ratings in the user's row of
    ``user_item_matrix``; ``genre_similarity_df`` receives ``1 - alpha``.
    The user id, rating count and rounded alpha are printed.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        max_alpha: Upper bound for the rating-based weight.
        rating_scale: Divisor applied to the user's rating count; a larger
            value makes alpha grow more slowly. Defaults to 50, the value
            that was previously hard-coded.

    Returns:
        The result of ``alpha * item_similarity_df + (1 - alpha) * genre_similarity_df``.

    Raises:
        ValueError: If ``user_id`` is not in ``user_item_matrix`` or
            ``rating_scale`` is not positive.
    """
    if user_id not in user_item_matrix.index:
        raise ValueError("The requested user does not exist in the dataset.")

    if rating_scale <= 0:
        raise ValueError("rating_scale must be positive.")

    num_rated = user_item_matrix.loc[user_id].notna().sum()

    alpha = min(max_alpha, num_rated / rating_scale)

    print(f"Number of user points{user_id}: {num_rated} → α = {round(alpha, 3)}")

    hybrid_similarity = alpha * item_similarity_df + (1 - alpha) * genre_similarity_df

    return hybrid_similarity

###  Dynamic Hybrid Recommendation with Personalized Alpha
Recommend movies for a user by dynamically adjusting the weight (`alpha`) between item-based and genre-based similarities:  
- Verify user existence in the dataset.  
- Retrieve user-rated movies.  
- Compute a personalized hybrid similarity matrix using `get_dynamic_hybrid_similarity`.  
- Optionally filter candidate movies by genres.  
- Predict ratings for unrated candidate movies based on weighted average of similar movies rated by the user.  
- Return top `n_recommendations` movie titles sorted by predicted rating.  
- Handle cases with no recommendations gracefully.


In [None]:
def recommend_movies_for_user_dynamic_alpha(user_id, n_recommendations=5, genres=None):
    """Recommend unrated movies using a per-user hybrid similarity matrix.

    The similarity matrix comes from ``get_dynamic_hybrid_similarity(user_id)``.
    Each candidate the user has not rated gets a predicted rating equal to the
    similarity-weighted average of the user's existing ratings; candidates
    whose weights sum to 0 are skipped.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        n_recommendations: Maximum number of titles to return.
        genres: Optional iterable of genre names (whitespace is stripped). A
            movie is a candidate when it is flagged 1 in at least one listed
            genre that is a column of ``movie_genres``. When falsy, every
            column of ``user_item_matrix`` is a candidate.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered by
        descending predicted rating. ``[]`` is returned, after printing a
        message, when the user is unknown or nothing could be scored.
    """
    if user_id not in user_item_matrix.index:
        print("The desired user is not in the dataset.")
        return []

    user_ratings = user_item_matrix.loc[user_id]
    user_rated_movies = user_ratings[user_ratings.notna()].index

    # Local matrix for this user; it shadows the notebook-level name on purpose.
    hybrid_similarity_df = get_dynamic_hybrid_similarity(user_id)

    if genres:
        wanted = [name.strip() for name in genres]

        def in_wanted_genre(row):
            return any(
                name in movie_genres.columns and row[name] == 1
                for name in wanted
            )

        mask = movie_genres.apply(in_wanted_genre, axis=1)
        candidate_movies = mask[mask].index.tolist()
    else:
        candidate_movies = user_item_matrix.columns.tolist()

    predicted_ratings = {}

    for movie_id in candidate_movies:
        if movie_id in user_rated_movies:
            continue

        neighbours = hybrid_similarity_df[movie_id].drop(movie_id)
        neighbours = neighbours[neighbours.index.isin(user_rated_movies)]
        weights = neighbours.values

        # Nothing to average over, or the average would divide by zero.
        if neighbours.empty or np.sum(weights) == 0:
            continue

        ratings = user_ratings[neighbours.index].values
        predicted_ratings[movie_id] = np.dot(weights, ratings) / np.sum(weights)

    if not predicted_ratings:
        print("There is no movies to suggest.")
        return []

    ranked_ids = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)
    recommended_movie_ids = ranked_ids[:n_recommendations]

    title_rows = movies_df.loc[movies_df['item_id'].isin(recommended_movie_ids), ['item_id', 'title']]

    # Selecting by the ranked ids restores the predicted-rating order.
    return title_rows.set_index('item_id').loc[recommended_movie_ids]

In [None]:
# Comedy/Romance recommendations for user 100 using the per-user alpha.
recommend_movies_for_user_dynamic_alpha(100, n_recommendations=5, genres=['Comedy', 'Romance'])

Number of user points100: 59 → α = 0.8


Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
608,Spellbound (1945)
489,Notorious (1946)
485,My Fair Lady (1964)
602,"American in Paris, An (1951)"
498,"African Queen, The (1951)"


###  Dimensionality Reduction with SVD
- Handle missing values in the user-item rating matrix by imputing each movie column with its mean rating.  
- Apply Truncated Singular Value Decomposition (SVD) to reduce dimensionality to 20 components.  
- Extract user feature matrix (`user_features`) and item feature matrix (`item_features`) for further analysis or recommendation tasks.


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer

# Fill the missing ratings using the imputer's 'mean' strategy before factorising.
imputer = SimpleImputer(strategy='mean')
user_item_filled = imputer.fit_transform(user_item_matrix)

# 20-component truncated SVD with a fixed seed.
svd = TruncatedSVD(n_components=20, random_state=42)

# Transformed input rows (one per user) and the transposed component matrix.
user_features = svd.fit_transform(user_item_filled)
item_features = svd.components_.transpose()

### Movie Recommendations Using SVD Embeddings
- Get the latent feature vector for the specified user from SVD user features. Note that `user_index` is the row position in `user_item_matrix`, not the user ID.  
- Predict scores for all movies by dot product with item feature vectors.  
- Exclude movies already rated by the user.  
- Optionally filter recommendations by specified genres.  
- Sort and return top `n_recommendations` movie titles based on predicted scores.


In [None]:
def recommend_svd_movies(user_index, n_recommendations=5, genres=None):
    """Recommend unrated movies for a user from the SVD feature matrices.

    Scores are the dot product of the user's row of ``user_features`` with
    every row of ``item_features``; movies the user already rated are removed
    and the rest are ranked by descending score.

    Args:
        user_index: Row position of the user in ``user_features`` and
            ``user_item_matrix`` (a position, not a user id).
        n_recommendations: Maximum number of titles to return.
        genres: Optional iterable of genre names (whitespace is stripped).
            Only movies flagged 1 in at least one listed genre are kept.
            Names that are not columns of ``movie_genres``, and movies that
            have no row in ``movie_genres``, are ignored instead of raising
            ``KeyError``.

    Returns:
        A DataFrame indexed by ``item_id`` with a ``title`` column, ordered by
        descending score.
    """
    user_vec = user_features[user_index]
    preds = np.dot(item_features, user_vec)

    user_id = user_item_matrix.index[user_index]
    user_row = user_item_matrix.loc[user_id]
    seen_items = user_row[user_row.notna()].index

    # Pair each item id with its score, leaving out what the user already rated.
    recommendations = [
        (movie_id, score)
        for movie_id, score in zip(user_item_matrix.columns, preds)
        if movie_id not in seen_items
    ]
    recommendations.sort(key=lambda pair: pair[1], reverse=True)

    if genres:
        wanted = [g.strip() for g in genres]
        known = [g for g in wanted if g in movie_genres.columns]

        if known:
            flagged = movie_genres[known].eq(1).any(axis=1)
            allowed_ids = flagged[flagged].index
        else:
            # No requested genre exists: nothing can match.
            allowed_ids = movie_genres.index[:0]

        recommendations = [
            (movie_id, score)
            for movie_id, score in recommendations
            if movie_id in allowed_ids
        ]

    top_ids = [movie_id for movie_id, _ in recommendations[:n_recommendations]]

    recommended_titles = movies_df[movies_df['item_id'].isin(top_ids)][['item_id', 'title']]
    # Selecting by the ranked ids restores the score order.
    recommended_titles = recommended_titles.set_index('item_id').loc[top_ids]

    return recommended_titles

In [None]:
# user_index is a row position in user_item_matrix (not a user id); here position 10.
recommend_svd_movies(user_index=10, n_recommendations=5, genres=['Animation'])

Unnamed: 0_level_0,title
item_id,Unnamed: 1_level_1
408,"Close Shave, A (1995)"
169,"Wrong Trousers, The (1993)"
114,Wallace & Gromit: The Best of Aardman Animatio...
1367,Faust (1994)
189,"Grand Day Out, A (1992)"
