In [None]:
import numpy as np
import pandas as pd

In [None]:

# Read the two input tables from the working directory: movie metadata and ratings.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv"))

In [None]:
# Preview the first five rows of the movie table.
movies.head(n=5)

In [None]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

In [None]:
# (rows, columns) of the movie table.
print(np.shape(movies))

In [None]:
# (rows, columns) of the ratings table.
print(np.shape(ratings))

In [None]:
# Count distinct users and distinct rated movies in a single pass.
distinct_counts = ratings[["userId", "movieId"]].nunique()
print("Number of users:", distinct_counts["userId"])
print("Number of movies rated:", distinct_counts["movieId"])

In [None]:
# The "genres" column holds "|"-separated labels; split, flatten, and de-duplicate.
genre_lists = movies["genres"].str.split("|")
all_genres = genre_lists.explode().unique()

print(all_genres)
print("Number of genres :", len(all_genres))

In [None]:
# How many times each rating value was given, ordered by rating value.
ratings["rating"].value_counts(sort=False).sort_index()

In [None]:
# Number of ratings submitted by each user, then summary statistics of those counts.
ratings_per_user = ratings.groupby("userId")["rating"].agg("count")
ratings_per_user.describe()

In [None]:
# Number of ratings received by each movie, then summary statistics of those counts.
ratings_per_movie = ratings.groupby("movieId")["rating"].agg("count")
ratings_per_movie.describe()

In [None]:
# Unique genre labels across all movies (same computation as the earlier genre cell).
all_genres = pd.unique(movies["genres"].str.split("|").explode())
all_genres

In [None]:
# Add one 0/1 indicator column per genre to `movies`.
for genre in all_genres:
    # regex=False: genre labels are literal text. With the default regex matching,
    # a label such as "(no genres listed)" is parsed as a capture group, which
    # triggers a pandas warning and only matches by accident.
    movies[genre] = movies["genres"].str.contains(genre, regex=False).astype(int)
movies.head()

In [None]:
# Mean rating per movie, looked up for every row of `movies`
# (movies that have no ratings get NaN).
avg_movie_rating = ratings.groupby("movieId")["rating"].agg("mean")
movies["avg_rating"] = avg_movie_rating.reindex(movies["movieId"]).to_numpy()
movies.loc[:, ["title", "avg_rating"]].head()

In [None]:
# Pull the first parenthesised four-digit number out of the title as the year;
# float dtype so that titles without a match can hold NaN.
title_year = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
movies["year"] = title_year.astype(float)

In [None]:
# Movie-side feature table: year, average rating and the genre indicators, keyed by movieId.
movie_features = movies[["movieId", "year", "avg_rating", *all_genres]].copy()

# Fill gaps: median year for missing years, mean rating for missing average ratings.
year_fill = movie_features["year"].median()
avg_rating_fill = movie_features["avg_rating"].mean()
movie_features = (
    movie_features
    .fillna({"year": year_fill, "avg_rating": avg_rating_fill})
    .set_index("movieId")
)
movie_features.head()

In [None]:
# Column labels of `movies` after the feature columns were added.
movies.keys()

In [None]:
# Spot-check the extracted year against the title it came from.
movies.loc[:, ["title", "year"]].head()



In [None]:
# Recompute the per-movie mean rating and attach it to `movies`
# (repeats the earlier avg_rating cell; the assignment overwrites the same column).
per_movie_ratings = ratings.groupby("movieId")["rating"]
avg_movie_rating = per_movie_ratings.mean()
movies["avg_rating"] = movies["movieId"].map(avg_movie_rating)
movies.loc[:, ["title", "avg_rating"]].head()

In [None]:
# Rebuild the movie feature table (repeats the earlier movie_features cell).
feature_names = ["movieId", "year", "avg_rating"]
feature_names.extend(all_genres)
movie_features = movies[feature_names].copy()

# Impute missing values: median for 'year', mean for 'avg_rating'.
for column, fill_value in (
    ("year", movie_features["year"].median()),
    ("avg_rating", movie_features["avg_rating"].mean()),
):
    movie_features[column] = movie_features[column].fillna(fill_value)

movie_features = movie_features.set_index("movieId")
movie_features.head()

In [None]:
# Attach each movie's 0/1 genre indicators to every rating row.
# The join key is spelled out: without `on`, pandas joins on every column name the
# two frames happen to share, which silently changes if either frame gains a column.
ratings_with_genre = ratings.merge(
    movies[["movieId"] + list(all_genres)],
    on="movieId",
    how="left"
)
ratings_with_genre.head()

In [None]:
# Turn each 0/1 genre indicator into "rating if the movie has this genre, else 0".
# The frame is rebuilt from the raw indicators on every run so that re-executing this
# cell does not multiply by the rating a second time (the in-place loop was not idempotent).
genre_cols = list(all_genres)
ratings_with_genre = ratings.merge(
    movies[["movieId"] + genre_cols],
    on="movieId",
    how="left"
)
# Row-wise multiply all genre columns by that row's rating in one vectorised step.
ratings_with_genre[genre_cols] = ratings_with_genre[genre_cols].mul(
    ratings_with_genre["rating"], axis=0
)
ratings_with_genre.head()

In [None]:
# Per-user mean of the genre-weighted ratings. The mean runs over all of a user's
# rating rows, so rows for movies outside a genre contribute 0 to that genre's value.
user_genre_preference = ratings_with_genre.groupby("userId")[[*all_genres]].agg("mean")

In [None]:
# Preview the per-user genre profile.
user_genre_preference.head(n=5)

In [None]:
# Add each user's overall mean rating to the user profile (aligned on the userId index).
user_avg_rating = ratings.groupby("userId")["rating"].agg("mean")
user_genre_preference = user_genre_preference.assign(avg_rating=user_avg_rating)
user_genre_preference.head()

In [None]:
# Give every rating row the profile of the user who made it.
ratings_with_users = pd.merge(ratings, user_genre_preference, how="left", on="userId")
ratings_with_users.head()

In [None]:
# Add the movie-side features to every rating row. Columns present on both sides
# (the genre names and 'avg_rating') are disambiguated with '_user' / '_movie'.
ratings_full = pd.merge(
    ratings_with_users,
    movie_features,
    how="left",
    left_on="movieId",
    right_index=True,
    suffixes=('_user', '_movie'),
)
ratings_full.head()

In [None]:
# Target: the rating of each (user, movie) row.
y = ratings_full["rating"]

# Feature names as they appear in the source tables, before the merge added suffixes.
original_user_cols = list(user_genre_preference.columns)
original_movie_cols = list(movie_features.columns)

print("Columns in ratings_full:", ratings_full.columns.tolist())

# User features carry the '_user' suffix in ratings_full; select them and restore
# the original names.
user_feature_cols_in_full = [f"{name}_user" for name in original_user_cols]
X_user = ratings_full[user_feature_cols_in_full].set_axis(original_user_cols, axis=1)

# Movie features: 'year' exists only on the movie side and therefore has no suffix;
# every other movie feature carries '_movie'. Restore the original names afterwards.
movie_feature_cols_in_full = ["year"]
movie_feature_cols_in_full += [
    f"{name}_movie" for name in original_movie_cols if name != "year"
]
X_movie = ratings_full[movie_feature_cols_in_full].set_axis(original_movie_cols, axis=1)

print("NaNs in X_user before scaling:\n", X_user.isnull().sum())
print("NaNs in X_movie before scaling:\n", X_movie.isnull().sum())

print(X_user.shape, X_movie.shape, y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Standardise user and movie features with independent scalers.
# NOTE(review): both scalers are fit on all rows, before the train/test split further
# down, so test rows influence the scaling statistics.
scaler_user, scaler_movie = StandardScaler(), StandardScaler()

X_user_scaled = scaler_user.fit(X_user).transform(X_user)
X_movie_scaled = scaler_movie.fit(X_movie).transform(X_movie)

In [None]:
# Rescale the ratings to [0, 1]; the scaler expects a 2-D column, hence the reshape.
scaler_target = MinMaxScaler(feature_range=(0, 1))
y_column = y.to_numpy().reshape(-1, 1)
y_scaled = scaler_target.fit_transform(y_column)

In [None]:
# Sanity check: overall means of the standardised features and range of the scaled target.
user_mean, movie_mean = X_user_scaled.mean(), X_movie_scaled.mean()
print(user_mean, movie_mean)
target_low, target_high = y_scaled.min(), y_scaled.max()
print(target_low, target_high)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Report the shapes of the model inputs and the target.
# The middle label previously read "Movie fetaure shape" (typo, no colon).
print("User feature shape:", X_user.shape)
print("Movie feature shape:", X_movie.shape)
print("Target shape:", y.shape)

In [None]:
# Refit all three scalers (repeats the earlier scaling cells).
# NOTE(review): the scalers see every row, including rows that later land in the test split.
scaler_user = StandardScaler()
scaler_movie = StandardScaler()
scaler_target = MinMaxScaler(feature_range=(0, 1))

X_user_scaled = scaler_user.fit_transform(X_user)
X_movie_scaled = scaler_movie.fit_transform(X_movie)
y_scaled = scaler_target.fit_transform(y.to_numpy().reshape(-1, 1))

In [None]:
# 80/20 split applied jointly to the three arrays so their rows stay aligned.
split_parts = train_test_split(
    X_user_scaled, X_movie_scaled, y_scaled,
    train_size=0.8,
    random_state=1,
)
(
    X_user_train, X_user_test,
    X_movie_train, X_movie_test,
    y_train, y_test,
) = split_parts

In [None]:
# Print the shape of every piece of the split, train pieces first.
for label, part in (
    ("User train shape:", X_user_train),
    ("Movie train shape:", X_movie_train),
    ("y train shape:", y_train),
    ("User test shape:", X_user_test),
    ("Movie test shape:", X_movie_test),
    ("y test shape:", y_test),
):
    print(label, part.shape)

In [None]:
# Seed Python, NumPy and the backend RNG *before* any layer is created. The layers'
# initial weights are drawn when the towers are called on their inputs below, so a
# seed that is only set in the training cell comes too late to make them reproducible.
keras.utils.set_random_seed(1)

# Both towers project their input features into a space of this dimensionality.
embedding_size = 32

# User tower: user feature vector -> embedding.
user_nn = keras.Sequential([
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(embedding_size)
])

# Movie tower: same architecture, separate weights.
movie_nn = keras.Sequential([
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(embedding_size)
])

user_input = keras.Input(shape=(X_user_train.shape[1],))
movie_input = keras.Input(shape=(X_movie_train.shape[1],))

# L2-normalise each embedding; the dot product of two unit vectors is their cosine
# similarity, so the model output lies in [-1, 1].
user_vec = keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(user_nn(user_input))
movie_vec = keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(movie_nn(movie_input))
output = keras.layers.Dot(axes=1)([user_vec, movie_vec])

model = keras.Model([user_input, movie_input], output)
model.summary()

In [None]:
# Fix TensorFlow's global seed, then train with Adam on mean squared error
# against the [0, 1]-scaled ratings.
tf.random.set_seed(1)

adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss=keras.losses.MeanSquaredError())

# NOTE(review): the test split doubles as the validation set during training.
train_inputs = [X_user_train, X_movie_train]
holdout = ([X_user_test, X_movie_test], y_test)
history = model.fit(
    train_inputs,
    y_train,
    validation_data=holdout,
    epochs=30,
    batch_size=256,
)

In [None]:
# Loss (MSE on the scaled target) on the held-out rows.
model.evaluate(x=[X_user_test, X_movie_test], y=y_test)

In [None]:
# Column labels of the user feature frame.
X_user.keys()


In [None]:
# Boolean mask: True where a column label repeats an earlier one.
X_user.columns.duplicated(keep="first")

In [None]:
# Keep only the first occurrence of each column label.
# NOTE(review): scaler_user was fit on X_user before this step; if any column is
# actually dropped here, the scaler's expected feature count no longer matches.
first_occurrence = ~X_user.columns.duplicated()
X_user = X_user.loc[:, first_occurrence]

In [None]:
# Column labels after de-duplication.
X_user.keys()

In [None]:
# Hand-written genre preferences for a hypothetical new user (0 = no stated preference).
# NOTE(review): the training user features are per-user means of genre-weighted ratings
# taken over all of a user's ratings, so raw 0-5 values here may sit on a different
# scale than what the model was trained on — confirm the intended encoding.
new_user_dict = {
    'Adventure' : 0,
    'Animation' : 3,
    'Children' : 5,
    'Comedy' : 3,
    'Fantasy' : 0,
    'Romance' : 0,
    'Drama' : 5,
    'Action' : 0,
    'Crime': 0,
    'Thriller' : 0,
    'Horror' : 0,
    'Mystery' : 0,
    'Sci-Fi' : 0,
    'War' : 0,
    'Musical' : 0,
    'Documentary' : 0,
    'IMAX' : 0,
    'Western' : 0,
    'Film-Noir': 0,
    '(no genres listed)': 0,
}

# Average over the genres the user actually scored. A dedicated name is used on
# purpose: calling this list `ratings` would rebind the global ratings DataFrame and
# break every earlier cell on re-run.
stated_preferences = [v for v in new_user_dict.values() if v > 0]
new_user_dict['avg_rating'] = np.mean(stated_preferences) if stated_preferences else 0

# One-row frame with columns in the order scaler_user / the model expect.
new_user_features = pd.DataFrame(
    [new_user_dict],
    columns=X_user.columns
)
new_user_features

In [None]:
# (rows, columns) of the single-row new-user frame.
np.shape(new_user_features)

In [None]:
# Apply the already-fitted user scaler to the new user's row (no refitting).
new_user_scaled = scaler_user.transform(new_user_features)
np.shape(new_user_scaled)

In [None]:
# NOTE(review): X_movie_scaled is derived from X_movie, which is sliced from
# ratings_full and therefore has one row per *rating*, not per movie. `num_movies`
# is thus the number of rating rows; the same movie can appear many times.
num_movies = X_movie_scaled.shape[0]
# Tile the single scaled user row so it lines up with every row of X_movie_scaled.
user_matrix = np.repeat(new_user_scaled, num_movies, axis=0)
user_matrix.shape


In [None]:
# Same tiling as the previous cell, with the shapes printed side by side.
# NOTE(review): X_movie_scaled has one row per rating row of ratings_full, so
# `num_movies` counts ratings rather than distinct movies.
num_movies = X_movie_scaled.shape[0]
user_matrix = np.repeat(new_user_scaled, num_movies, axis=0)
# Both matrices must have the same number of rows to be fed to the model together.
print("User matrix shape:", user_matrix.shape)
print("Movie matrix shape:", X_movie_scaled.shape)

In [None]:
# Score each catalogue movie exactly once for the new user.
# Predicting against X_movie_scaled would score one row per historical rating: the same
# movie is scored repeatedly and the result does not line up with the movie list.
# movie_features has one row per movieId and the same columns scaler_movie was fit on.
movie_matrix = scaler_movie.transform(movie_features)
scores = model.predict([np.repeat(new_user_scaled, len(movie_matrix), axis=0), movie_matrix])
scores.shape

In [None]:
# End-to-end recommendation: score every movie in movie_features for the new user,
# map the scores back to the rating scale, attach titles, and rank.
scaled_unique_movie_features = scaler_movie.transform(movie_features)

# One copy of the user's scaled row per movie so the two inputs align row by row.
num_unique_movies = len(movie_features)
user_matrix_for_recommendation = np.repeat(new_user_scaled, num_unique_movies, axis=0)

scores = model.predict([user_matrix_for_recommendation, scaled_unique_movie_features])
predicted_ratings = scaler_target.inverse_transform(scores)

# Predictions are in movie_features row order, so they can be attached positionally.
movie_features_with_predictions = movie_features.assign(predicted_rating=predicted_ratings)

recommendations = pd.merge(
    movies,
    movie_features_with_predictions[["predicted_rating"]],
    how="left",
    left_on="movieId",
    right_index=True,
)

top_recommendations = recommendations.sort_values(
    "predicted_rating", ascending=False, ignore_index=True
)

print("Top 10 Movie Recommendations for the New User:")
print(top_recommendations[["title", "predicted_rating"]].head(10))

**Reasoning**:
First, convert the model's `scores` back to the original rating scale with `scaler_target.inverse_transform`. The result is the predicted rating for each scored movie.



In [None]:
# Undo the [0, 1] target scaling so the scores read as ratings again.
predicted_ratings = scaler_target.inverse_transform(scores)
print("Shape of predicted_ratings:", np.shape(predicted_ratings))
print("First 5 predicted ratings:", predicted_ratings[0:5])

**Reasoning**:
Next, copy the `movie_features` DataFrame, add `predicted_ratings` as a new column, and merge the result with the `movies` DataFrame so that each prediction is paired with its movie title. The merged table is the basis for the recommendations.



In [None]:
# Attach the predictions to the movie feature table (row order matches), then join
# them onto `movies` by movieId so that titles are available.
movie_features_with_predictions = movie_features.assign(predicted_rating=predicted_ratings)

prediction_column = movie_features_with_predictions[["predicted_rating"]]
recommendations = pd.merge(
    movies,
    prediction_column,
    how="left",
    left_on="movieId",
    right_index=True,
)

print("Shape of movie_features_with_predictions:", movie_features_with_predictions.shape)
print("Head of movie_features_with_predictions:")
print(movie_features_with_predictions.head())
print("\nShape of recommendations:", recommendations.shape)
print("Head of recommendations:")
print(recommendations.head())

In [None]:
# Score each row of movie_features for the new user, printing shapes along the way.
scaled_unique_movie_features = scaler_movie.transform(movie_features)

num_unique_movies = len(movie_features)
user_matrix_for_recommendation = np.repeat(new_user_scaled, num_unique_movies, axis=0)

print("Shape of user_matrix_for_recommendation:", np.shape(user_matrix_for_recommendation))
print("Shape of scaled_unique_movie_features:", np.shape(scaled_unique_movie_features))

model_inputs = [user_matrix_for_recommendation, scaled_unique_movie_features]
scores = model.predict(model_inputs)
print("Shape of scores:", np.shape(scores))

In [None]:
# Map the model scores from the [0, 1] target scale back to the rating scale.
predicted_ratings = scaler_target.inverse_transform(scores)
ratings_shape, ratings_preview = predicted_ratings.shape, predicted_ratings[:5]
print("Shape of predicted_ratings:", ratings_shape)
print("First 5 predicted ratings:", ratings_preview)

In [None]:
# Add the predicted rating as a column next to the movie features, then bring it
# onto the full `movies` table via movieId.
movie_features_with_predictions = movie_features.copy()
movie_features_with_predictions.insert(
    len(movie_features_with_predictions.columns), "predicted_rating", predicted_ratings
)

recommendations = movies.merge(
    movie_features_with_predictions.loc[:, ["predicted_rating"]],
    how="left",
    left_on="movieId",
    right_index=True,
)

print("Shape of movie_features_with_predictions:", movie_features_with_predictions.shape)
print("Head of movie_features_with_predictions:")
print(movie_features_with_predictions.head())
print("\nShape of recommendations:", recommendations.shape)
print("Head of recommendations:")
print(recommendations.head())

In [None]:
# Full recommendation pass (repeats the earlier pipeline cell): scale the movie
# features, score them against the new user, convert to ratings, and rank.
num_unique_movies = movie_features.shape[0]
scaled_unique_movie_features = scaler_movie.transform(movie_features)
user_matrix_for_recommendation = np.repeat(new_user_scaled, num_unique_movies, axis=0)

scores = model.predict([user_matrix_for_recommendation, scaled_unique_movie_features])
predicted_ratings = scaler_target.inverse_transform(scores)

# Predictions follow movie_features row order, so a positional assignment lines up.
movie_features_with_predictions = movie_features.copy()
movie_features_with_predictions["predicted_rating"] = predicted_ratings

prediction_by_movie = movie_features_with_predictions[["predicted_rating"]]
recommendations = movies.merge(
    prediction_by_movie, how="left", left_on="movieId", right_index=True
)

# Highest predicted rating first, with a fresh 0..n-1 index.
ranked = recommendations.sort_values(by="predicted_rating", ascending=False)
top_recommendations = ranked.reset_index(drop=True)

print("Top 10 Movie Recommendations for the New User:")
print(top_recommendations[["title", "predicted_rating"]].head(10))