In [None]:
from surprise import Dataset, Reader, SVD, SVDpp, KNNBasic, KNNWithMeans, BaselineOnly
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd

In [11]:
# Paths to the three MovieLens 100k files used in this notebook
file_data = 'ml-100k/u.data'  # ratings
file_user = 'ml-100k/u.user'  # info about the people
file_film = 'ml-100k/u.item'  # info about the films

# Column headers are supplied explicitly to read_csv for each file
ratings_header = ['user_id', 'item_id', 'rating', 'timestamp']
user_header = ['user_id', 'age', 'gender', 'occupation', 'zip']
film_header = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western',
]

# Ratings are tab-separated; the user and film files are pipe-separated
ratings_df = pd.read_csv(file_data, sep='\t', names=ratings_header, engine='python')
user_info = pd.read_csv(file_user, sep='|', names=user_header, engine='python')
film_info = pd.read_csv(file_film, sep='|', names=film_header, engine='python', encoding='ISO-8859-1')

# Preview the first few rows of each frame
for frame in (ratings_df, user_info, film_info):
    print(frame.head())

# Reader describing the rating scale (1 to 5)
reader = Reader(rating_scale=(1, 5))

# The Surprise dataset and the 80/20 train/test split are built in a later cell,
# after the three frames have been merged.


   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
   user_id  age gender  occupation    zip
0        1   24      M  technician  85711
1        2   53      F       other  94043
2        3   23      M      writer  32067
3        4   24      M  technician  43537
4        5   33      F       other  15213
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?To

In [14]:
# Attach user demographics and film metadata to every rating, then discard the
# columns that are not used further on.
merged_df = (
    ratings_df
    .merge(user_info, on='user_id')
    .merge(film_info, left_on='item_id', right_on='movie_id')
    .drop(columns=['timestamp', 'video_release_date', 'IMDb_URL', 'zip', 'unknown'])
)

# Preview the fully merged dataset
print(merged_df.head())

# Reader describing the rating scale (1 to 5)
reader = Reader(rating_scale=(1, 5))

# Surprise only needs the (user, item, rating) triple
data = Dataset.load_from_df(merged_df[['user_id', 'item_id', 'rating']], reader)

# Hold out 20% of the ratings for testing; the seed keeps the split repeatable
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

   user_id  item_id  rating  age gender  occupation  movie_id  \
0      196      242       3   49      M      writer       242   
1      186      302       3   39      F   executive       302   
2       22      377       1   25      M      writer       377   
3      244       51       2   28      M  technician        51   
4      166      346       1   47      M    educator       346   

                        title release_date  Action  ...  Fantasy  Film Noir  \
0                Kolya (1996)  24-Jan-1997       0  ...        0          0   
1    L.A. Confidential (1997)  01-Jan-1997       0  ...        0          1   
2         Heavyweights (1994)  01-Jan-1994       0  ...        0          0   
3  Legends of the Fall (1994)  01-Jan-1994       0  ...        0          0   
4         Jackie Brown (1997)  01-Jan-1997       0  ...        0          0   

   Horror  Musical  Mystery  Romance  SciFi  Thriller  War  Western  
0       0        0        0        0      0         0    0      

In [3]:
# Initialize models
# These are models I found in the Surprise library.
# SVD and SVD++ use a random number generator for initialization, so they are
# seeded here (same seed value as the train/test split) to keep the reported
# RMSE/MAE repeatable across Restart & Run All.
models = {
    'SVD': SVD(random_state=42),
    'SVD++': SVDpp(random_state=42),
    'KNNBasic': KNNBasic(),
    'KNNWithMeans': KNNWithMeans(),
    'BaselineOnly': BaselineOnly()
}

In [4]:
# Fit every model on the training split and score it on the held-out split
results, all_predictions = {}, {}  # metrics per model / raw predictions per model

for label, algo in models.items():
    algo.fit(trainset)
    test_predictions = algo.test(testset)

    # Keep the raw predictions so individual users can be inspected later
    all_predictions[label] = test_predictions

    # RMSE is evaluated (and printed) before MAE
    results[label] = {
        'RMSE': accuracy.rmse(test_predictions),
        'MAE': accuracy.mae(test_predictions),
    }

# Show a handful of stored predictions for one model (SVD)
print("\nExample Predictions from SVD Model:")
for sample in all_predictions['SVD'][:5]:
    print(f"User: {sample.uid}, Item: {sample.iid}, True Rating: {sample.r_ui}, Predicted: {sample.est:.4f}")

RMSE: 0.9347
MAE:  0.7365
RMSE: 0.9201
MAE:  0.7215
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9802
MAE:  0.7727
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9500
MAE:  0.7481
Estimating biases using als...
RMSE: 0.9442
MAE:  0.7490

Example Predictions from SVD Model:
User: 907, Item: 143, True Rating: 5.0, Predicted: 5.0000
User: 371, Item: 210, True Rating: 4.0, Predicted: 4.2818
User: 218, Item: 42, True Rating: 4.0, Predicted: 3.4231
User: 829, Item: 170, True Rating: 4.0, Predicted: 3.9436
User: 733, Item: 277, True Rating: 1.0, Predicted: 3.3363


In [5]:
# Tabulate the metrics: one row per model, one column per metric
results_df = pd.DataFrame.from_dict(results, orient='index')
print("\nModel Performance:\n", results_df)


Model Performance:
                   RMSE       MAE
SVD           0.934665  0.736537
SVD++         0.920050  0.721459
KNNBasic      0.980151  0.772692
KNNWithMeans  0.949996  0.748054
BaselineOnly  0.944180  0.749003


In [6]:
def print_user_prediction_errors(all_predictions, model_name, user_id):
    """Print one user's test-set predictions for a model, followed by that user's MAE.

    The report is labelled using the notebook-level ``user_info`` frame (age,
    gender, occupation) and ``film_info`` frame (film titles).

    Args:
        all_predictions: Mapping of model name to a list of prediction objects
            exposing ``uid``, ``iid``, ``r_ui`` and ``est``.
        model_name: Key into ``all_predictions``.
        user_id: User to report on. It is compared with ``pred.uid`` via
            ``str()``, so ``10`` and ``'10'`` select the same predictions.

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: If ``user_id`` or a predicted item id cannot be converted
            with ``int()`` for the ``user_info`` / ``film_info`` lookups.
    """
    # Ensure the model exists in all_predictions
    if model_name not in all_predictions:
        print(f"Model '{model_name}' not found in predictions.")
        return

    # Compare on the string form so int and str ids are treated alike
    wanted_uid = str(user_id)
    user_predictions = [
        pred for pred in all_predictions[model_name] if str(pred.uid) == wanted_uid
    ]

    if not user_predictions:
        print(f"\nNo predictions found for User {user_id} using {model_name}.")
        return

    # A user can have predictions without having a row in user_info; use a
    # generic label in that case instead of letting .iloc[0] raise IndexError.
    profile_rows = user_info[user_info['user_id'] == int(user_id)]
    if profile_rows.empty:
        profile = "details unavailable"
    else:
        details = profile_rows.iloc[0]
        profile = f"{details['age']} years old, {details['gender']}, {details['occupation']}"

    print(f"\nPrediction Errors for User {user_id} ({profile}) using {model_name}:")
    print(f"{'Item':<10} {'Title':<50} {'True Rating':<12} {'Predicted Rating':<18}")
    print("=" * 90)

    # Build the id -> title lookup once rather than filtering film_info for
    # every prediction. keep='first' means the first matching row supplies the
    # title when a movie_id is repeated.
    unique_films = film_info.drop_duplicates(subset='movie_id', keep='first')
    title_by_id = dict(zip(unique_films['movie_id'].tolist(), unique_films['title'].tolist()))

    abs_errors = []
    for pred in user_predictions:
        movie_title = title_by_id.get(int(pred.iid), "Unknown")
        abs_errors.append(abs(pred.r_ui - pred.est))
        print(f"{pred.iid:<10} {movie_title:<50} {pred.r_ui:<12} {pred.est:<18.4f}")

    # Only MAE is reported for the user
    mae = sum(abs_errors) / len(abs_errors)

    print("\nUser-Specific Performance:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")

In [7]:
# Pick which model's predictions to inspect and for which user
model_name, user_id = 'SVD++', 10

print_user_prediction_errors(all_predictions, model_name=model_name, user_id=user_id)


Prediction Errors for User 10 (53 years old, M, lawyer) using SVD++:
Item       Title                                              True Rating  Predicted Rating  
558        Heavenly Creatures (1994)                          4.0          4.0419            
275        Sense and Sensibility (1995)                       4.0          4.2626            
194        Sting, The (1973)                                  4.0          4.4766            
703        Widows' Peak (1994)                                5.0          3.8233            
340        Boogie Nights (1997)                               4.0          3.8776            
245        Devil's Own, The (1997)                            4.0          3.4717            
488        Sunset Blvd. (1950)                                5.0          4.6205            
144        Die Hard (1988)                                    4.0          4.0173            
175        Brazil (1985)                                      3.0          4.1946   