In [72]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [73]:
STAGE_2_PATH = "rec_merged_2.csv"
STAGE_3_PATH = "rec_merged_3.csv"

df_stage2: pd.DataFrame = pd.read_csv(STAGE_2_PATH)
df_stage2['rec_type'].replace(['Rand'], 'Random', inplace=True)

df_stage3: pd.DataFrame = pd.read_csv(STAGE_3_PATH).drop(columns=["preference"])

# For the combined df, drop stage 3's extra column.
df = pd.concat([df_stage2, df_stage3], ignore_index=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_stage2['rec_type'].replace(['Rand'], 'Random', inplace=True)


Unnamed: 0,i_id_c,popularity,loudness,danceability,energy,key,speechiness,acousticness,instrumentalness,valence,...,emo_post_arousal,rec_type,time,weather type,pressure,temperature,humidity,gps_speed,arousal_change,valence_change
0,874.0,22.0,-12.891,0.591,0.340,2.0,0.0298,0.600000,0.000027,0.503,...,-0.347062,GCN,3,2,1003,24,95,0.000000,0.056438,0.115240
1,874.0,22.0,-12.891,0.591,0.340,2.0,0.0298,0.600000,0.000027,0.503,...,-0.206316,GCN,2,0,994,30,73,0.097190,0.046184,0.062349
2,874.0,22.0,-12.891,0.591,0.340,2.0,0.0298,0.600000,0.000027,0.503,...,-0.233082,GCN,3,1,999,24,99,0.000000,0.079418,-0.094844
3,874.0,22.0,-12.891,0.591,0.340,2.0,0.0298,0.600000,0.000027,0.503,...,-0.335536,GCN,2,1,999,28,76,1.117141,-0.120286,-0.112771
4,874.0,22.0,-12.891,0.591,0.340,2.0,0.0298,0.600000,0.000027,0.503,...,0.133190,GCN,3,2,994,22,97,0.000000,-0.020560,0.053698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,562.0,46.0,-5.333,0.670,0.704,6.0,0.0296,0.073400,0.000000,0.542,...,-0.144806,Random,1,0,998,22,42,0.000000,0.339694,0.220053
1242,342.0,42.0,-4.122,0.561,0.959,10.0,0.0504,0.037500,0.907000,0.719,...,0.192147,Random,3,0,1007,18,42,0.000000,-0.036103,-0.030608
1243,245.0,45.0,-15.978,0.469,0.363,0.0,0.0443,0.564000,0.000001,0.451,...,0.203286,Random,2,0,1019,18,21,0.000000,-0.325714,-0.027726
1244,814.0,67.0,-5.903,0.320,0.917,9.0,0.0771,0.071700,0.000000,0.715,...,0.144806,Random,3,0,1019,18,21,0.000000,0.537556,0.253509


In [74]:
df.columns

# Use genre, singer (subset of genre --> choose exclusively), prearousal, prevalence, as user characteristics
# item charactersitcs

# --> predict VA change, or rating


Index(['i_id_c', 'popularity', 'loudness', 'danceability', 'energy', 'key',
       'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo',
       'general_genre_id', 'duration_x', 'F0final_sma_amean',
       'F0final_sma_stddev', 'audspec_lengthL1norm_sma_stddev',
       'pcm_RMSenergy_sma_stddev', 'pcm_fftMag_psySharpness_sma_amean',
       'pcm_fftMag_psySharpness_sma_stddev', 'pcm_zcr_sma_amean',
       'pcm_zcr_sma_stddev', 'singer', 'user_id', 'rating', 'emo_pre_valence',
       'emo_pre_arousal', 'emo_post_valence', 'emo_post_arousal', 'rec_type',
       'time', 'weather type', 'pressure', 'temperature', 'humidity',
       'gps_speed', 'arousal_change', 'valence_change'],
      dtype='object')

In [75]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


scale_columns = [
    # Music high level metadata
    'popularity', 'loudness', 'danceability', 'energy',
    'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 

    # Music low level metadata
    'F0final_sma_amean','F0final_sma_stddev', 
    'audspec_lengthL1norm_sma_stddev',
    'pcm_RMSenergy_sma_stddev', 
    'pcm_fftMag_psySharpness_sma_amean', 'pcm_fftMag_psySharpness_sma_stddev', 
    'pcm_zcr_sma_amean', 'pcm_zcr_sma_stddev', 

    # Situational
    'pressure', 'temperature', 'humidity','gps_speed', 

    # # Categorical discrete
    # 'key', 'general_genre_id', 'time', 'weather type' 
]

df_scaled = pd.DataFrame(scaler.fit_transform(df[scale_columns]), columns=scale_columns, index=df.index)
df_scaled


Unnamed: 0,popularity,loudness,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,F0final_sma_amean,...,audspec_lengthL1norm_sma_stddev,pcm_RMSenergy_sma_stddev,pcm_fftMag_psySharpness_sma_amean,pcm_fftMag_psySharpness_sma_stddev,pcm_zcr_sma_amean,pcm_zcr_sma_stddev,pressure,temperature,humidity,gps_speed
0,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,-0.190569,-0.142376,-0.831447,1.507606,-0.629329,0.969211,-0.322941,0.590064,1.579504,-0.331051
1,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,-0.190569,-0.142376,-0.831447,1.507606,-0.629329,0.969211,-0.842072,1.286844,0.789608,-0.294990
2,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,-0.190569,-0.142376,-0.831447,1.507606,-0.629329,0.969211,-0.553666,0.590064,1.723122,-0.331051
3,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,-0.190569,-0.142376,-0.831447,1.507606,-0.629329,0.969211,-0.553666,1.054584,0.897321,0.083451
4,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,-0.190569,-0.142376,-0.831447,1.507606,-0.629329,0.969211,-0.842072,0.357804,1.651313,-0.331051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,0.236354,0.613502,0.654229,0.388547,-0.480501,-0.754366,-0.577415,-0.014447,0.117516,-0.330045,...,0.260916,-0.124383,-0.005370,-0.563712,0.338234,0.806052,-0.611347,0.357804,-0.323427,-0.331051
1242,0.039957,0.818203,0.086285,1.343905,-0.224035,-0.859430,2.183968,0.565407,-0.105665,1.813560,...,0.721423,0.342771,1.714463,1.228522,1.171185,1.041186,-0.092216,-0.106717,-0.323427,-0.331051
1243,0.187255,-1.185870,-0.393081,-0.889009,-0.299249,0.681417,-0.577411,-0.312565,-1.539121,0.648993,...,-1.114551,0.268201,0.809908,1.352277,0.582386,0.005607,0.599959,-0.106717,-1.077419,-0.331051
1244,1.267436,0.517152,-1.169445,1.186552,0.105178,-0.759341,-0.577415,0.552303,1.963835,-0.407979,...,-0.302817,-0.730093,0.493053,0.556548,0.478095,0.853695,0.599959,-0.106717,-1.077419,-0.331051


In [76]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

cat_columns = [
    'singer'
]

encoder = LabelEncoder()
arr = encoder.fit_transform(df[cat_columns])

df_singers_label = pd.DataFrame(arr, columns=cat_columns, index=df.index)
df_singers_label


onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_arr = onehot_encoder.fit_transform(df[cat_columns])

df_onehot = pd.DataFrame(data=onehot_arr, columns=onehot_encoder.get_feature_names_out())
df_onehot

  y = column_or_1d(y, warn=True)


Unnamed: 0,singer_23 Skidoo,singer_A Tribe Called Quest,singer_Aereogramme,singer_Aerosmith,singer_Alice Deejay,singer_Alice In Chains,singer_Alicia Keys,singer_Amos Lee,singer_Amy Winehouse,singer_Andrew Bird,...,singer_Various Artists,singer_Wang Chung,singer_Washed Out,singer_Wax Tailor,singer_Weeping Tile,singer_Weezer,singer_Westside Connection,singer_Wolfmother,singer_Xtreme,singer_Yael Naïm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# # Concatenate the scaled and encoded dataframes back together
# df_transformed = pd.concat([df_scaled, df_singers_encoded], axis=1)
df_transformed = pd.concat([df_scaled, df_onehot], axis=1)
df_not_transformed = df.drop(columns=df_transformed.columns, errors="ignore") 

In [78]:
df = pd.concat([df_transformed, df_not_transformed], axis=1)
df

Unnamed: 0,popularity,loudness,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,F0final_sma_amean,...,rating,emo_pre_valence,emo_pre_arousal,emo_post_valence,emo_post_arousal,rec_type,time,weather type,arousal_change,valence_change
0,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,3,0.32275,-0.40350,0.437990,-0.347062,GCN,3,2,0.056438,0.115240
1,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,3,0.20275,-0.25250,0.265099,-0.206316,GCN,2,0,0.046184,0.062349
2,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,3,0.16400,-0.31250,0.069156,-0.233082,GCN,3,1,0.079418,-0.094844
3,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,3,0.43550,-0.21525,0.322729,-0.335536,GCN,2,1,-0.120286,-0.112771
4,-0.942026,-0.664061,0.242600,-0.975178,-0.478035,0.786774,-0.577333,-0.142212,-0.043256,-0.810732,...,4,0.47650,0.15375,0.530198,0.133190,GCN,3,2,-0.020560,0.053698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241,0.236354,0.613502,0.654229,0.388547,-0.480501,-0.754366,-0.577415,-0.014447,0.117516,-0.330045,...,4,0.44550,-0.48450,0.665553,-0.144806,Random,1,0,0.339694,0.220053
1242,0.039957,0.818203,0.086285,1.343905,-0.224035,-0.859430,2.183968,0.565407,-0.105665,1.813560,...,3,0.54300,0.22825,0.512392,0.192147,Random,3,0,-0.036103,-0.030608
1243,0.187255,-1.185870,-0.393081,-0.889009,-0.299249,0.681417,-0.577411,-0.312565,-1.539121,0.648993,...,4,0.57075,0.52900,0.543024,0.203286,Random,2,0,-0.325714,-0.027726
1244,1.267436,0.517152,-1.169445,1.186552,0.105178,-0.759341,-0.577415,0.552303,1.963835,-0.407979,...,4,0.02775,-0.39275,0.281259,0.144806,Random,3,0,0.537556,0.253509


In [79]:
targets = ["rating", "valence_change", "arousal_change"]

dropped = [
    'emo_pre_valence', 
    'emo_pre_arousal', 
    'emo_post_valence', 
    'emo_post_arousal', 
    'rec_type',
    'duration_x'
    ]

cont_features = [
    'popularity', 
    'loudness', 
    'danceability', 
    'energy', 
    'speechiness', 
    'acousticness', 
    'instrumentalness', 
    'valence', 
    'tempo',
    'F0final_sma_amean', 'F0final_sma_stddev', 
    'audspec_lengthL1norm_sma_stddev',
    'pcm_RMSenergy_sma_stddev', 
    'pcm_fftMag_psySharpness_sma_amean', 'pcm_fftMag_psySharpness_sma_stddev', 
    'pcm_zcr_sma_amean', 'pcm_zcr_sma_stddev', 
    'pressure', 'temperature', 'humidity', 'gps_speed' # Situational
]

cat_features = [
    'key',
    'general_genre_id',
    'time', 'weather type', # Situational
]

features = cont_features + cat_features

all_singers_columns = list(df_onehot.columns)



In [80]:
df = df.drop(columns=dropped)

df_feat = df[features]
df_feat_with_singers = df[all_singers_columns + features]

df_target = df[targets]

## Collaborative Filtering

Attempt Collaborative Filtering using Surprise. However, this only uses the ids (user, songs, ratings) instead of the other features (as per the nature of collaborative filtering).

In [81]:
from surprise import Dataset
from surprise import Reader

from surprise import NMF
from surprise import SVD
from surprise import SVDpp
from surprise import SlopeOne
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import CoClustering


from surprise.model_selection import train_test_split, cross_validate

In [82]:
reader = Reader()

# User ids, item ids, ratings ***IN THAT ORDER***
id_cols = [
    'user_id', 
    'i_id_c',
    'rating', 
]

# User ids, item ids, ratings ***IN THAT ORDER***
data = Dataset.load_from_df(df[id_cols], reader)
trainset, testset = train_test_split(data, test_size=0.25)

In [83]:
sim_options = {'name': 'cosine',
               'user_based': True 
               }
model = KNNBasic(sim_options=sim_options)
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9242  0.9991  1.0051  0.9621  0.9272  0.9635  0.0342  
MAE (testset)     0.7491  0.8013  0.8159  0.7915  0.7682  0.7852  0.0238  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.92418626, 0.99905892, 1.00509232, 0.96207442, 0.92718918]),
 'test_mae': array([0.74912592, 0.8013448 , 0.81590223, 0.79150162, 0.76822522]),
 'fit_time': (0.00150299072265625,
  0.0,
  0.0010001659393310547,
  0.0012652873992919922,
  0.0010094642639160156),
 'test_time': (0.0030052661895751953,
  0.0029997825622558594,
  0.0019981861114501953,
  0.002056121826171875,
  0.0021314620971679688)}

In [84]:
model = SVDpp()
model.fit(trainset)
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9569  0.9011  0.8533  0.9214  0.9221  0.9110  0.0339  
MAE (testset)     0.7690  0.7344  0.6815  0.7490  0.7461  0.7360  0.0294  
Fit time          0.06    0.07    0.06    0.07    0.08    0.07    0.01    
Test time         0.02    0.01    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([0.95686559, 0.90109881, 0.85333209, 0.92135818, 0.92211734]),
 'test_mae': array([0.76898989, 0.73443136, 0.68146663, 0.7489969 , 0.7461024 ]),
 'fit_time': (0.06338620185852051,
  0.06840825080871582,
  0.05908846855163574,
  0.07048273086547852,
  0.07534456253051758),
 'test_time': (0.02002859115600586,
  0.011655330657958984,
  0.021044015884399414,
  0.019371747970581055,
  0.01793646812438965)}

Try out predictions

In [85]:
from surprise import accuracy
predictions = model.test(testset, verbose=True)

accuracy.rmse(predictions)
accuracy.mae(predictions)

user: 21         item: 916.0      r_ui = 4.00   est = 4.27   {'was_impossible': False}
user: 15         item: 875.0      r_ui = 3.00   est = 3.43   {'was_impossible': False}
user: 19         item: 923.0      r_ui = 4.00   est = 3.60   {'was_impossible': False}
user: 18         item: 757.0      r_ui = 5.00   est = 4.09   {'was_impossible': False}
user: 15         item: 591.0      r_ui = 3.00   est = 2.76   {'was_impossible': False}
user: 15         item: 731.0      r_ui = 3.00   est = 3.06   {'was_impossible': False}
user: 30         item: 739.0      r_ui = 2.00   est = 2.90   {'was_impossible': False}
user: 18         item: 429.0      r_ui = 3.00   est = 2.90   {'was_impossible': False}
user: 26         item: 848.0      r_ui = 2.00   est = 2.97   {'was_impossible': False}
user: 26         item: 266.0      r_ui = 4.00   est = 3.74   {'was_impossible': False}
user: 6          item: 517.0      r_ui = 3.00   est = 3.70   {'was_impossible': False}
user: 25         item: 120.0      r_ui = 4.

0.5476949899781101

Benchmarking each model imported

In [86]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    algorithm.fit(trainset)
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001B9EBDDA290> --> SVDpp
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])])
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing t

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BaselineOnly,0.904513,0.742589,0.001081,0.000203
SVD,0.908803,0.740334,0.007618,0.001319
SVDpp,0.912324,0.73409,0.067782,0.014748
KNNWithMeans,0.934673,0.749935,0.00083,0.002216
KNNBaseline,0.937041,0.754213,0.002,0.006462
SlopeOne,0.949798,0.760643,0.001725,0.003973
KNNWithZScore,0.957276,0.76689,0.001855,0.003224
CoClustering,0.957595,0.769727,0.030169,0.000801
KNNBasic,0.963753,0.785171,0.000621,0.002807
NMF,1.014197,0.800091,0.021659,0.001632


## Collaborative Filtering with Valence Arousal change
Project V-A change over the y=x line to reduce the dimension to 1, then use the resulting value as the rating for the collaborative filtering model.

In [150]:
# Testing projection
df_va_d = df[['valence_change', 'arousal_change']]

d = np.array([1, 1])  # Direction vector for line y = x

project_point_onto_d = lambda v: (np.dot(v, d) / np.linalg.norm(d)**2 )
df_va_d.apply(project_point_onto_d, axis=1)


0       0.085839
1       0.054266
2      -0.007713
3      -0.116528
4       0.016569
          ...   
1241    0.279873
1242   -0.033355
1243   -0.176720
1244    0.395533
1245   -0.064059
Length: 1246, dtype: float64

In [151]:
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate



class CollaborativeFilter:
    id_cols = [
        'user_id', 
        'i_id_c',
    ]
    
    project_point_onto_d = lambda v: (np.dot(v, d) / np.linalg.norm(d)**2 )

    def __init__(self, df: pd.DataFrame, model):
        data_df = df[self.id_cols]
        data_df['va_projected'] = df[['valence_change', 'arousal_change']].apply(project_point_onto_d, axis=1)
        self.model = model

        reader = Reader(rating_scale=(-1,1))

        # User ids, item ids, ratings ***IN THAT ORDER***
        self.dataset = Dataset.load_from_df(data_df[self.id_cols+['va_projected']], reader)
        self.trainset, self.testset = train_test_split(self.dataset, test_size=0.2)
        self.model.fit(self.trainset)

    def cross_validate(self):
        results = cross_validate(self.model, self.dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
        mean = pd.DataFrame.from_dict(results).mean(axis=0)
        print(mean)
    
    def test(self):
        predictions = self.model.test(self.testset, verbose=True)

        accuracy.rmse(predictions)
        accuracy.mae(predictions)

    def predict(self, user_id, i_id_c, real_rating=None):
        pred = self.model.predict(user_id, i_id_c, r_ui=real_rating, verbose=True)
        return pred


In [148]:
collab_filter = CollaborativeFilter(df, SVDpp())
collab_filter.cross_validate()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['va_projected'] = df[['valence_change', 'arousal_change']].apply(project_point_onto_d, axis=1)


Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2047  0.2115  0.1911  0.2189  0.2013  0.2055  0.0094  
MAE (testset)     0.1537  0.1620  0.1403  0.1588  0.1454  0.1520  0.0081  
Fit time          0.07    0.06    0.06    0.06    0.06    0.06    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
test_rmse    0.205509
test_mae     0.152022
fit_time     0.061664
test_time    0.011137
dtype: float64


In [137]:
collab_filter.test()

user: 18         item: 19.0       r_ui = 0.01   est = 0.09   {'was_impossible': False}
user: 29         item: 132.0      r_ui = 0.35   est = 0.10   {'was_impossible': False}
user: 11         item: 629.0      r_ui = 0.05   est = -0.03   {'was_impossible': False}
user: 18         item: 591.0      r_ui = 0.10   est = 0.00   {'was_impossible': False}
user: 13         item: 757.0      r_ui = 0.17   est = 0.04   {'was_impossible': False}
user: 23         item: 332.0      r_ui = 0.10   est = 0.01   {'was_impossible': False}
user: 13         item: 434.0      r_ui = 0.03   est = 0.10   {'was_impossible': False}
user: 26         item: 17.0       r_ui = 0.12   est = -0.01   {'was_impossible': False}
user: 6          item: 863.0      r_ui = 0.20   est = 0.15   {'was_impossible': False}
user: 4          item: 517.0      r_ui = 0.08   est = -0.02   {'was_impossible': False}
user: 18         item: 273.0      r_ui = 0.10   est = 0.02   {'was_impossible': False}
user: 18         item: 375.0      r_ui =

In [149]:
collab_filter.predict(1, 2)

user: 1          item: 2          r_ui = None   est = 0.02   {'was_impossible': False}


Prediction(uid=1, iid=2, r_ui=None, est=0.023079368564635366, details={'was_impossible': False})

## Content-based Filtering

Reference from https://thecleverprogrammer.com/2023/06/05/hybrid-recommendation-system-using-python/

nvm this tutorial is for data with words lol

In [88]:
from surprise import Dataset, Reader, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Prepare dataframes

In [89]:
reader = Reader()

# User ids, item ids, ratings ***IN THAT ORDER***
data = Dataset.load_from_df(df[id_cols], reader)

### Linear kernel similarity

Simple feature similarity metric

In [90]:
from sklearn.metrics.pairwise import linear_kernel

In [91]:
content_similarity = linear_kernel(df_feat, df_feat)
content_similarity

array([[ 29.94888217,  22.26809323,  28.25023713, ...,   6.2006726 ,
         24.0226772 ,  26.0763401 ],
       [ 22.26809323,  20.96754071,  22.57578183, ...,   4.65396833,
         21.47597293,  24.08552561],
       [ 28.25023713,  22.57578183,  27.62545209, ...,   5.9075108 ,
         23.72951541,  25.86818584],
       ...,
       [  6.2006726 ,   4.65396833,   5.9075108 , ..., 160.86051167,
         75.8426994 ,  35.09212599],
       [ 24.0226772 ,  21.47597293,  23.72951541, ...,  75.8426994 ,
        140.25373814, 117.636964  ],
       [ 26.0763401 ,  24.08552561,  25.86818584, ...,  35.09212599,
        117.636964  , 123.95477018]])

### NN

In [92]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self, embedding_size, n_continuous, n_classes):
        super(NN, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(int(num_embeddings), embedding_size) for num_embeddings in n_classes])
        self.linear = nn.Linear(n_continuous + len(n_classes) * embedding_size, 1)
        self.deep = nn.Sequential(
            nn.Linear(n_continuous + len(n_classes) * embedding_size, 256),
            nn.ReLU(),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x_categorical, x_continuous):
        embeddings = [embedding(x_categorical[:,i]) for i, embedding in enumerate(self.embeddings)]
        x = torch.cat(embeddings + [x_continuous], 1)
        deep = self.deep(x)
        return deep


In [94]:
from torch.utils.data import Dataset, DataLoader

class SongDataset(Dataset):
    def __init__(self, X_cat, X_cont, y):
        self.X_cat = X_cat
        self.X_cont = X_cont
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X_cat[idx], self.X_cont[idx], self.y[idx]]

# Prepare datasets
X_cat_train = torch.tensor(train[cat_features].values, dtype=torch.int64)
X_cont_train = torch.tensor(train[cont_features].values, dtype=torch.float32)
y_train = torch.tensor(train['rating'].values, dtype=torch.float32)

train_dataset = SongDataset(X_cat_train, X_cont_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

# Model, loss, and optimizer
model = NN(embedding_size=10, n_continuous=len(cont_features), n_classes=[df[col].max() + 1 for col in cat_features])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    for X_cat, X_cont, y in train_loader:
        optimizer.zero_grad()
        outputs = model(X_cat, X_cont).squeeze(1)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()


In [95]:
X_cat_test = torch.tensor(test[cat_features].values, dtype=torch.int64)
X_cont_test = torch.tensor(test[cont_features].values, dtype=torch.float32)
y_test = torch.tensor(test['rating'].values, dtype=torch.float32)

test_dataset = SongDataset(X_cat_test, X_cont_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=10, shuffle=False)


In [96]:
def evaluate(model, data_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No gradients needed
        for X_cat, X_cont, y in data_loader:
            outputs = model(X_cat, X_cont).squeeze(1)
            loss = criterion(outputs, y)
            total_loss += loss.item()
    avg_loss = total_loss / len(data_loader)
    return avg_loss

test_loss = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}')


Test Loss: 0.9992


### GCN

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=75e550ed-719f-4d72-aa2b-dbe53a6ba1db' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>