In [14]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import GridSearchCV, train_test_split

In [15]:
# Load the development ratings used for training and validation.
# Each line is parsed as `user,item,rating` on a 1-5 scale; the first line of
# the file (presumably the CSV header) is skipped.
reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5), skip_lines=1)
data = Dataset.load_from_file('data_movie_lens_100k/ratings_all_development_set.csv', reader=reader)

# Hold out 20% of the ratings as a test set.
# A fixed seed keeps the split identical across "Restart & Run All" instead of
# reshuffling the ratings on every execution.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [16]:
# Candidate SVD hyperparameters; the search below tries every combination.
# NOTE(review): the recorded run selected the largest value in each list, so
# the optimum may lie beyond this grid - consider widening it.
param_grid = dict(
    n_factors=[20, 50, 100],
    n_epochs=[10, 20, 30],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)

# Exhaustive search over the grid, scored by RMSE only.
grid_search = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=3,       # 3-fold cross-validation over the dataset handed to fit()
    n_jobs=-1,  # run on all available CPUs
)
# The search is run on the full development data (`data`), not on `train_set`.
grid_search.fit(data)

# Keep the combination that achieved the best RMSE
best_params = grid_search.best_params['rmse']
print("Best Parameters:", best_params)

Best Parameters: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [17]:
# Final fit: rebuild SVD with the tuned hyperparameters and train it on every
# rating in `data` (no hold-out).
full_train_set = data.build_full_trainset()

# Pick out exactly the four tuned settings and forward them as keyword arguments.
tuned_kwargs = {
    name: best_params[name]
    for name in ('n_factors', 'n_epochs', 'lr_all', 'reg_all')
}
best_model = SVD(**tuned_kwargs)
best_model.fit(full_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14f3a77f0>

In [18]:
# Load the leaderboard dataset (without ratings)
leaderboard_df = pd.read_csv('data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Surprise keeps raw ids as *strings* when a dataset is loaded from a file,
# whereas pandas parses the id columns as numbers. Comparing numeric ids with
# the trainset's string keys never matches, which made every row fall back to
# the global mean. Convert the ids to stripped strings so they line up with
# the raw ids Surprise stored. Local series are used so `leaderboard_df`
# itself keeps its original dtypes.
raw_user_ids = leaderboard_df['user_id'].astype(str).str.strip()
raw_item_ids = leaderboard_df['item_id'].astype(str).str.strip()

known_users = best_model.trainset._raw2inner_id_users
known_items = best_model.trainset._raw2inner_id_items
global_mean = best_model.trainset.global_mean

# Generate predictions for the leaderboard set.
# Iterating the two id columns directly (rather than DataFrame.iterrows)
# avoids the per-row dtype upcast that can turn integer ids into floats.
predictions = [
    # Predict the rating when both user and item were seen in training;
    # otherwise fall back to the global mean rating.
    best_model.predict(user_id, item_id).est
    if user_id in known_users and item_id in known_items
    else global_mean
    for user_id, item_id in zip(raw_user_ids, raw_item_ids)
]

# Exactly one prediction per leaderboard row, in the original row order
assert len(predictions) == len(leaderboard_df)

# Save predictions to a plain text file (one value per line, no header/index)
predictions = pd.Series(predictions, dtype=float)  # Convert to a pandas Series for saving
predictions.to_csv('predicted_ratings_leaderboard.txt', index=False, header=False)

print("Predictions saved to predicted_ratings_leaderboard.txt")

Predictions saved to predicted_ratings_leaderboard.txt
