In [1]:
!pip install tensorflow



In [2]:
import pandas as pd

data_path = #TODO
movies_file = '/movies.csv'
ratings_file = '/ratings.csv'

movies_data = pd.read_csv(data_path+movies_file)
ratings_data = pd.read_csv(data_path+ratings_file)

ratings_data.drop(columns=['timestamp'], inplace = True)

In [3]:
from sklearn.preprocessing import LabelEncoder
merged_data = pd.merge(ratings_data, movies_data, on='movieId', how='left')

# Convert columns from int to string
merged_data['userId'] = merged_data['userId'].astype(str)
merged_data['movieId'] = merged_data['movieId'].astype(str)

# Extract unique genres
unique_genres = set('|'.join(merged_data['genres']).split('|'))

# Create boolean columns for each genre
for genre in unique_genres:
    merged_data[genre] = merged_data['genres'].str.contains(genre).astype(int)

# Drop the original 'genres' column
merged_data.drop(columns=['genres','title'], inplace=True)

# Encode user and movie IDs
user_encoder = LabelEncoder()
merged_data['encoded_user_id'] = user_encoder.fit_transform(merged_data['userId'])

movie_encoder = LabelEncoder()
movie_encoder.fit(movies_data['movieId'])
merged_data['encoded_movie_id'] = movie_encoder.fit_transform(merged_data['movieId'])

merged_data.to_csv('feature_enhanced_data.csv', index=False)

  merged_data[genre] = merged_data['genres'].str.contains(genre).astype(int)


In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets, maintaining the class distribution
train_data, test_data = train_test_split(merged_data, test_size=0.2, stratify= merged_data['userId'], random_state=29)

In [5]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define the neural network model
def create_model(num_users, num_movies, num_features, embedding_size=50):
    user_input = Input(shape=(1,))
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
    user_flat = Flatten()(user_embedding)

    movie_input = Input(shape=(1,))
    movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size)(movie_input)
    movie_flat = Flatten()(movie_embedding)

    features_input = Input(shape=(num_features,))
    features_dense = Dense(64,
     activation='relu')(features_input)

    concatenated = Concatenate()([user_flat, movie_flat, features_dense])
    dense_1 = Dense(128, activation='relu')(concatenated)
    dense_2 = Dense(64, activation='relu')(dense_1)
    output = Dense(1)(dense_2)

    model = Model(inputs=[user_input, movie_input, features_input], outputs=output)
    return model

# Get the number of unique users, movies, and features
num_users = merged_data['userId'].nunique()
num_movies = merged_data['movieId'].nunique()

num_features = len(merged_data.columns) - 5  # Exclude userId, movieId, rating, and encoded columns

# Create and compile the model
model = create_model(num_users, num_movies, num_features)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(
    [train_data['encoded_user_id'], train_data['encoded_movie_id'], train_data.iloc[:, 3:-2].values],
    train_data['rating'],
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(patience=2)]
)

# Predict ratings for test data
predicted_ratings = model.predict(
    [test_data['encoded_user_id'], test_data['encoded_movie_id'], test_data.iloc[:, 3:-2].values]
).flatten()

predicted_ratings = np.clip(predicted_ratings, 0, 5)

# Calculate mean squared error
mse = mean_squared_error(test_data['rating'], predicted_ratings)
print("Mean Squared Error:", mse)
mae = mean_absolute_error(test_data['rating'], predicted_ratings)
print("Mean Absolute Error:", mae)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Mean Squared Error: 0.7606360202823306
Mean Absolute Error: 0.6751276128960478


In [6]:
# Make predictions for all movies for each user
user_movie_combinations = np.array(np.meshgrid(train_data['encoded_user_id'].unique(), train_data['encoded_movie_id'].unique())).T.reshape(-1,2)
user_ids = user_movie_combinations[:, 0]
movie_ids = user_movie_combinations[:, 1]

# Predict ratings
predicted_ratings = model.predict([user_ids, movie_ids, np.zeros((len(user_ids), num_features))]).flatten()

# Clip predicted ratings to be within the range of 0 to 5
predicted_ratings = np.clip(predicted_ratings, 0, 5)

# Decode user and movie IDs
original_user_ids = user_encoder.inverse_transform(user_ids)
original_movie_ids = movie_encoder.inverse_transform(movie_ids)

# Create a DataFrame to store the predictions
predictions_df = pd.DataFrame({
    'userId': original_user_ids,
    'movieId': original_movie_ids,
    'predicted_rating': predicted_ratings
})



In [7]:
# Group train_data by user ID and aggregate movie IDs into a list
seen_movies_df = train_data.groupby('userId')['movieId'].agg(list).reset_index()
seen_movies_df.columns = ['userId', 'seen_movies']

predictions_df = predictions_df.groupby('userId').apply(lambda group: group.sort_values(by='predicted_rating', ascending=False))

# Define a function to filter out rows with movie IDs in the seen movies list
def filter_seen_movies(group):
    user_id = list(set(group['userId']))[0]
    filtered_group = group[~group['movieId'].isin(seen_movies_df[seen_movies_df['userId'] == 'user_id']['seen_movies'])]
    return filtered_group.nlargest(5, 'predicted_rating')

# Group predictions_df by user ID, sort predictions within each group by predicted rating, and filter out seen movies
predictions_df = predictions_df.reset_index(drop=True).groupby('userId').apply(filter_seen_movies)

# Reset index after filtering
predictions_df.reset_index(drop=True, inplace=True)

movies_data['movieId'] = movies_data['movieId'].astype(str)
movie_recommendations_df = pd.merge(predictions_df, movies_data, on='movieId', how='inner')

existing_test_ratings = test_data[['userId','movieId','rating']]

movie_recommendations_df = pd.merge(movie_recommendations_df, existing_test_ratings, on=['userId', 'movieId'], how='left')

# Group predictions_df by user ID and aggregate titles and predicted ratings into lists
movie_recommendations_df = movie_recommendations_df.groupby('userId').agg({
    'title': lambda x: list(x),
    'predicted_rating': lambda x: list(x)
})

In [10]:
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Create a sparse matrix of user-item ratings
user_item_matrix = coo_matrix((train_data['rating'], (train_data['userId'].astype(int), train_data['movieId'].astype(int)))).tocsr()

# Perform Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=2)  # You can adjust the number of components
U, Sigma, Vt = svd.fit_transform(user_item_matrix), np.diag(svd.singular_values_), svd.components_

# Predict ratings for test data
predicted_ratings = np.dot(np.dot(U, Sigma), Vt)

predicted_ratings = np.clip(predicted_ratings, 0, 5)
max_train_movie_id = max(train_data['movieId'])
predicted_ratings_adjusted = []

for index, row in test_data.iterrows():
    if row['movieId'] < max_train_movie_id:
        predicted_rating = predicted_ratings[int(row['userId']), int(row['movieId'])]
    else:
        predicted_rating = 0
    predicted_ratings_adjusted.append(predicted_rating)

# Calculate mean squared error
mse = mean_squared_error(test_data['rating'], predicted_ratings_adjusted)

print("Mean Squared Error:", mse)

mae = mean_absolute_error(test_data['rating'], predicted_ratings_adjusted)
print("Mean Absolute Error:", mae)

Mean Squared Error: 3.6333589479005948
Mean Absolute Error: 1.557864395630317


In [49]:
import random
def get_rec_for_random_user():
    random_user_id = random.choice(list(movie_recommendations_df.index))
    return movie_recommendations_df[movie_recommendations_df.index == random_user_id]

In [51]:
get_rec_for_random_user()

Unnamed: 0_level_0,title,predicted_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
259,"[Heart and Souls (1993), Fantastic Fear of Eve...","[4.345799922943115, 4.423501014709473, 4.41960..."


In [52]:
movie_recommendations_df.to_csv('movie_recommendations.csv')

In [53]:
movie_recommendation_file = 'movie_recommendations.csv'
movie_recommendations_df = pd.read_csv(movie_recommendation_file)

In [56]:
result_df = movie_recommendations_df[movie_recommendations_df['userId'] == 1]
result_df

Unnamed: 0,userId,title,predicted_rating
0,1,"['Dune (2000)', 'Eye for an Eye (1996)', 'On t...","[4.622557640075684, 4.608473300933838, 4.55400..."


In [62]:
result_df['title'].iloc[0]

"['Dune (2000)', 'Eye for an Eye (1996)', 'On the Trail of the Bremen Town Musicians (1973)', 'Gladiator (1992)', 'Dogville (2003)']"

In [54]:
movie_recommendations_df

Unnamed: 0,userId,title,predicted_rating
0,1,"['Dune (2000)', 'Eye for an Eye (1996)', 'On t...","[4.622557640075684, 4.608473300933838, 4.55400..."
1,10,"['We Were Soldiers (2002)', 'What Lies Beneath...","[4.638012886047363, 4.344286918640137, 4.32411..."
2,100,['Children of the Corn IV: The Gathering (1996...,"[4.717223644256592, 4.6698994636535645, 4.6479..."
3,101,"['Dune (2000)', 'Funny Games U.S. (2007)', 'Au...","[4.2800750732421875, 4.454801082611084, 4.2149..."
4,102,"['Dune (2000)', ""Girl Who Kicked the Hornet's ...","[4.083487510681152, 4.062256813049316, 4.01960..."
...,...,...,...
605,95,"['Dune (2000)', ""Girl Who Kicked the Hornet's ...","[4.565107822418213, 4.534331798553467, 4.42782..."
606,96,"['Eye for an Eye (1996)', ""Girl Who Kicked the...","[4.2900590896606445, 4.277102470397949, 4.3178..."
607,97,['Flickering Lights (Blinkende lygter) (2000)'...,"[5.0, 5.0, 5.0, 5.0, 5.0]"
608,98,"[""A Dog's Purpose (2017)"", 'Willy/Milly (1986)...","[4.438066005706787, 4.376927375793457, 4.36175..."


In [None]:
predicted_ratings = model.predict(
    [test_data['encoded_user_id'], test_data['encoded_movie_id'], test_data.iloc[:, 3:-2].values]
).flatten()