In [43]:
# imports
import random as rd
import pandas as pd
import numpy as np
from typing import List, Tuple
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats
from surprise import AlgoBase
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import NMF
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

In [44]:
# constants and initialization
item_threshold = 1 # 1 means no filtering
# single seed shared by the stdlib RNG, NumPy, the hold-out split and the K-fold split
my_seed = 0
rd.seed(my_seed)
np.random.seed(my_seed)
# column that plays the role of the "item"; events are grouped by ('user', predict_col)
predict_col = 'artist'
# share of the item catalogue (taken from the front of the item distribution) counted as "top" items
top_fraction = 0.2
# input: raw listening events, read as a tab-separated file without a header row
user_events_file = 'data/user_events.csv'
# inputs: the three user groups, each read as a CSV and indexed by its 'user_id' column
low_user_file = 'data/low_main_users.csv'
medium_user_file = 'data/medium_main_users.csv'
high_user_file = 'data/high_main_users.csv'
# outputs of the single train/test split run
performance_data_file = 'data/performance_data.csv'
low_mae_data_file = 'data/low_mae_data.csv'
med_mae_data_file = 'data/med_mae_data.csv'
high_mae_data_file = 'data/high_mae_data.csv'
# outputs of the cross-validation run
performance_data_cv_file = 'data/performance_data_cv.csv'
low_mae_data_cv_file = 'data/low_mae_data_cv.csv'
med_mae_data_cv_file = 'data/med_mae_data_cv.csv'
high_mae_data_cv_file = 'data/high_mae_data_cv.csv'
# output of the low-vs-high two-sample t-test on the single split
two_sample_ttest_data_file = 'data/two_sample_ttest_data.csv'
# NOTE(review): the three paths below are not used in the visible part of the notebook -
# presumably targets for further t-test results; confirm against later cells
two_sample_ttest_data_cv_file = 'data/two_sample_ttest_data_cv.csv'
paired_sample_ttest_data_file = 'data/paired_sample_ttest_data.csv'
paired_sample_ttest_data_cv_file = 'data/paired_sample_ttest_data_cv.csv'

In [45]:
# load the raw listening events (tab-separated, no header row in the file)
cols = ['user', 'artist', 'album', 'track', 'timestamp']
df_events = pd.read_csv(
    user_events_file,
    sep='\t',
    names=cols,
)
print(f'No. of user events: {len(df_events)}')

No. of user events: 28718087


In [46]:
# collapse the event log into one row per (user, item) pair holding its interaction count
df_events = (
    df_events
    .groupby(['user', predict_col])
    .size()
    .reset_index(name='count')
)
print(f'No. user-item interactions: {len(df_events)}')

No. user-item interactions: 1755361


In [47]:
# preview the first rows of the aggregated (user, item, count) table
df_events.head()

Unnamed: 0,user,artist,count
0,1021445,12,43
1,1021445,16,1
2,1021445,28,7
3,1021445,29,1
4,1021445,46,1


In [48]:
# keep only user-item pairs with at least item_threshold interactions
df_events = df_events.loc[df_events['count'] >= item_threshold]
print(f'No. filtered user events: {len(df_events)}')
print(f'No. filtered items: {len(df_events[predict_col].unique())}')

No. filtered user events: 1755361
No. filtered items: 352805


In [49]:
# get user distribution
user_dist = df_events['user'].value_counts()
num_users = len(user_dist)
print('Mean artists per user: ' + str(user_dist.mean()))
print('Min artists per user: ' + str(user_dist.min()))
print('Max artists per user: ' + str(user_dist.max()))

Mean artists per user: 585.1203333333333
Min artists per user: 18
Max artists per user: 4011


In [50]:
# get item distribution
item_dist = df_events[predict_col].value_counts()
num_items = len(item_dist)
print('No. items: ' + str(num_items))

No. items: 352805


In [51]:
# get top items
num_top = int(top_fraction * num_items)
top_item_dist = item_dist[:num_top]
print('No. top items: ' + str(len(top_item_dist)))

No. top items: 70561


In [52]:
# read users
low_users = pd.read_csv(low_user_file, sep=',').set_index('user_id')
medium_users = pd.read_csv(medium_user_file, sep=',').set_index('user_id')
high_users = pd.read_csv(high_user_file, sep=',').set_index('user_id')
no_users = len(low_users) + len(medium_users) + len(high_users)
print('No. of users: ' + str(no_users))

No. of users: 3000


In [53]:
# get pop fractions
pop_count = [] # number of top items per user
user_hist = [] # user history sizes
pop_fraq = [] # relative number of top items per user
pop_item_fraq = [] # average popularity of items in user profiles
low_profile_size = 0
low_gap = 0
medium_profile_size = 0
medium_gap = 0
high_profile_size = 0
high_gap = 0
low_count = 0
med_count = 0
high_count = 0
for u, df in df_events.groupby('user'):
    no_user_items = len(set(df[predict_col])) # profile size
    no_user_pop_items = len(set(df[predict_col]) & set(top_item_dist.index)) # top items in profile
    pop_count.append(no_user_pop_items)
    user_hist.append(no_user_items)
    pop_fraq.append(no_user_pop_items / no_user_items)
    # get popularity (= fraction of users interacted with item) of user items and calculate average of it
    user_pop_item_fraq = sum(item_dist[df[predict_col]] / no_users) / no_user_items
    pop_item_fraq.append(user_pop_item_fraq)
    if u in low_users.index: # get user group-specific values
        low_profile_size += no_user_items
        low_gap += user_pop_item_fraq
        low_count += 1
    elif u in medium_users.index:
        medium_profile_size += no_user_items
        medium_gap += user_pop_item_fraq
        med_count += 1
    else:
        high_profile_size += no_user_items
        high_gap += user_pop_item_fraq
        high_count += 1
low_profile_size /= len(low_users)
medium_profile_size /= len(medium_users)
high_profile_size /= len(high_users)
low_gap /= len(low_users)
medium_gap /= len(medium_users)
high_gap /= len(high_users)
print('Low count (for check): ' + str(low_count))
print('Med count (for check): ' + str(med_count))
print('High count (for check): ' + str(high_count))

Low count (for check): 1000
Med count (for check): 1000
High count (for check): 1000


In [54]:
# Rescale every user's interaction counts to the range [1, 1000] independently,
# so that each user's least- and most-played items map to the same rating bounds.
# The per-user frames are collected in a list and concatenated once:
# DataFrame.append copies the whole frame on every call (quadratic in the number
# of users) and no longer exists in pandas >= 2.0.
scaled_groups = []
for user_id, group in df_events.groupby('user'):
    scaler = MinMaxScaler(feature_range=(1, 1000))
    # the scaler expects a 2-D float array: one column holding this user's counts
    scaled_ratings = scaler.fit_transform(group['count'].values.reshape(-1, 1).astype(float))
    new_rows = group.copy()
    new_rows['count'] = scaled_ratings
    scaled_groups.append(new_rows)

# pd.concat rejects an empty list, so fall back to an empty frame when there are no users
scaled_df_events = pd.concat(scaled_groups) if scaled_groups else pd.DataFrame()

scaled_df_events.head()
#scaled_df_events = scaled_df_events.set_index('user') # needed for new python/surprise version

Unnamed: 0,user,artist,count
0,1021445,12,184.222707
1,1021445,16,1.0
2,1021445,28,27.174672
3,1021445,29,1.0
4,1021445,46,1.0


In [55]:
# from here on df_events refers to the per-user rescaled ratings
df_events = scaled_df_events
print(f"Min rating: {df_events['count'].min()!s}")
print(f"Max rating: {df_events['count'].max()!s}")

Min rating: 1.0
Max rating: 1000.0000000000001


In [56]:
# the Reader's rating scale is taken from the observed extremes of the rating column
rating_column = df_events['count']
reader = Reader(rating_scale=(rating_column.min(), rating_column.max()))
df_events.head()

Unnamed: 0,user,artist,count
0,1021445,12,184.222707
1,1021445,16,1.0
2,1021445,28,27.174672
3,1021445,29,1.0
4,1021445,46,1.0


In [57]:
# The columns must correspond to user id, item id and ratings (in that order).
# The columns must correspond to user id, item id and ratings (in that order).
# df_events holds the columns user, <predict_col>, count at this point, with count already rescaled per user.
data = Dataset.load_from_df(df_events, reader)

In [58]:
def get_top_n(predictions, n=10):
    """Return the n highest-estimated items for every user.

    predictions: iterable of 5-tuples (user id, item id, true rating,
        estimated rating, details).
    n: maximum number of items kept per user.

    Returns a defaultdict mapping each user id to a list of
    (item id, estimated rating) pairs, ordered by estimate, highest first.
    """
    per_user = defaultdict(list)
    # group the (item, estimate) pairs by user
    for user, item, _true_rating, estimate, _details in predictions:
        per_user[user].append((item, estimate))
    # rank each user's pairs by estimate and cut the list to n entries
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]
    return per_user

In [59]:
def get_top_n_random(testset, n=10, items=None):
    """Build a random recommendation list of length n for every user in testset.

    testset: iterable of (user id, item id, true rating) triples; only the
        user ids are used.
    n: number of random items drawn per user. (Previously this argument was
        ignored and 10 items were always drawn.)
    items: optional pool to draw from; defaults to the index of the
        module-level item_dist, i.e. all known items.

    Returns a defaultdict mapping each user id to a list of (item id, position)
    pairs, where position is the 0-based draw order. Items are drawn with
    rd.choice, so the same item can appear more than once in a list.
    """
    if items is None:
        items = item_dist.index
    top_n = defaultdict(list)
    for uid, _iid, _true_r in testset:
        # draw only once per user, at the first occurrence in the test set
        if uid not in top_n:
            top_n[uid] = [(rd.choice(items), position) for position in range(n)]
    return top_n

In [60]:
def get_top_n_mp(testset, n=10):
    """Give every user in testset the same most-popular recommendation list.

    testset: iterable of (user id, item id, true rating) triples; only the
        user ids are used.
    n: number of entries taken from the front of the module-level item_dist.

    Returns a defaultdict mapping each user id to a list of
    (item id, count) pairs taken from item_dist.
    """
    recommendations = defaultdict(list)
    for user, _item, _rating in testset:
        # a non-empty list means this user has already been handled
        if recommendations[user]:
            continue
        recommendations[user].extend(item_dist[:n].items())
    return recommendations

In [61]:
def get_mae_of_groups(predictions) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Compute the overall MAE and the MAE of the low / medium / high user groups.

    predictions: list of 5-tuples (user id, item id, true rating, estimated
        rating, details).

    A prediction is assigned to the low group if its user id is in
    low_users.index, to the medium group if it is in medium_users.index, and
    to the high group otherwise.

    Returns a 4-tuple of DataFrames:
      * one-row summary with columns mae_all, mae_low, mae_med, mae_high
      * per-prediction absolute errors of the low group (column 'mae')
      * per-prediction absolute errors of the medium group (column 'mae')
      * per-prediction absolute errors of the high group (column 'mae')

    Prints the overall and per-group MAE. accuracy.mae raises on an empty
    prediction list; an empty group yields NaN for that group's MAE.
    """
    print('All: ')
    mae_all: float = accuracy.mae(predictions)
    low_errors: List[float] = []
    med_errors: List[float] = []
    high_errors: List[float] = []
    for uid, _iid, true_r, est, _details in predictions:
        # The MAE of a single prediction is just its absolute error, so compute
        # it directly rather than calling accuracy.mae once per prediction.
        abs_error = float(abs(true_r - est))
        if uid in low_users.index:
            low_errors.append(abs_error)
        elif uid in medium_users.index:
            med_errors.append(abs_error)
        else:
            high_errors.append(abs_error)

    # np.mean of an empty list warns and returns NaN; return NaN explicitly instead
    mae_low: float = np.mean(low_errors) if low_errors else float('nan')
    mae_med: float = np.mean(med_errors) if med_errors else float('nan')
    mae_high: float = np.mean(high_errors) if high_errors else float('nan')
    print('LowMS: ' + str(mae_low))
    print('MedMS: ' + str(mae_med))
    print('HighMS: ' + str(mae_high))

    summary = pd.DataFrame({'mae_all': [mae_all], 'mae_low': [mae_low], 'mae_med': [mae_med],
                            'mae_high': [mae_high]})
    return (summary, pd.DataFrame({'mae': low_errors}), pd.DataFrame({'mae': med_errors}),
            pd.DataFrame({'mae': high_errors}))

In [62]:
# create item dataframe with normalized item counts
df_item_dist = pd.DataFrame(item_dist)
df_item_dist.columns = ['count']
df_item_dist['count'] /= no_users

In [77]:
def validate(trainset, testset) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Fit every rating-prediction algorithm on trainset and evaluate its MAE on testset.

    trainset / testset: the train and test parts of a split of the dataset,
        passed to each estimator's fit() and test() respectively.

    Returns a 4-tuple of DataFrames, each concatenated over all evaluated
    algorithms and carrying an 'algo' column:
      * summary MAE (mae_all, mae_low, mae_med, mae_high) per algorithm
      * per-prediction absolute errors of the low user group
      * per-prediction absolute errors of the medium user group
      * per-prediction absolute errors of the high user group

    Side effects: adds a zero-filled column per algorithm name to the
    module-level df_item_dist, and prints progress and MAE values.
    """
    sim_users = {'name': 'cosine', 'user_based': True}  # compute cosine similarities between users
    # Random and MostPopular have no estimator here, so no MAE is computed for them.
    non_personalized = ('Random', 'MostPopular')
    algos = [
        ('Random', None),
        ('MostPopular', None),
        ('UserItemAvg', BaselineOnly()),
        ('UserKNN', KNNBasic(sim_options = sim_users, k=40)),
        ('UserKNNAvg', KNNWithMeans(sim_options = sim_users, k=40)),
        ('NMF', NMF(n_factors = 15)),
    ]

    performance_list: List[pd.DataFrame] = []
    low_mae: List[pd.DataFrame] = []
    med_mae: List[pd.DataFrame] = []
    high_mae: List[pd.DataFrame] = []

    for algo_name, algo in algos:
        df_item_dist[algo_name] = 0

        # Compare names by value: `is not 'Random'` tests object identity against a
        # literal, which only works by accident of string interning and raises a
        # SyntaxWarning on Python >= 3.8.
        if algo_name in non_personalized:
            continue

        # get accuracy for personalized approaches
        algo.fit(trainset)
        predictions = algo.test(testset)
        print(algo_name)
        performance, low_performance, med_performance, high_performance = get_mae_of_groups(predictions)

        # label every result frame with the algorithm that produced it
        for frame, collector in (
            (performance, performance_list),
            (low_performance, low_mae),
            (med_performance, med_mae),
            (high_performance, high_mae),
        ):
            frame["algo"] = algo_name
            collector.append(frame)

    performance_data: pd.DataFrame = pd.concat(performance_list)
    low_mae_data: pd.DataFrame = pd.concat(low_mae)
    med_mae_data: pd.DataFrame = pd.concat(med_mae)
    high_mae_data: pd.DataFrame = pd.concat(high_mae)

    return (performance_data, low_mae_data, med_mae_data, high_mae_data)

  if algo_names[i] is not 'Random' and algo_names[i] is not 'MostPopular':
  if algo_names[i] is not 'Random' and algo_names[i] is not 'MostPopular':


In [78]:
# single 80/20 hold-out split, seeded for repeatability
trainset, testset = train_test_split(data, test_size = 0.2, random_state = my_seed)

performance_data, low_mae_data, med_mae_data, high_mae_data = validate(trainset=trainset, testset=testset)

# write each result frame to its CSV file
for _frame, _path in (
    (performance_data, performance_data_file),
    (low_mae_data, low_mae_data_file),
    (med_mae_data, med_mae_data_file),
    (high_mae_data, high_mae_data_file),
):
    _frame.to_csv(_path, encoding='utf-8')

Estimating biases using als...
UserItemAvg
All: 
MAE:  38.5612
LowMS: 42.94802225638076
MedMS: 33.90013072887102
HighMS: 40.68639747115602
Computing the cosine similarity matrix...
Done computing similarity matrix.
UserKNN
All: 
MAE:  45.6320
LowMS: 49.75989995441734
MedMS: 42.483604584035085
HighMS: 45.99103663278319
Computing the cosine similarity matrix...
Done computing similarity matrix.
UserKNNAvg
All: 
MAE:  41.8842
LowMS: 46.58083982804533
MedMS: 37.58534841057563
HighMS: 43.2426341108826
NMF
All: 
MAE:  34.8523
LowMS: 38.415371883576334
MedMS: 30.62654021209281
HighMS: 37.16152437455301


In [79]:
# reload the hold-out results from disk, in the same order as they were written
performance_data, low_mae_data, med_mae_data, high_mae_data = (
    pd.read_csv(_path, encoding='utf-8')
    for _path in (
        performance_data_file,
        low_mae_data_file,
        med_mae_data_file,
        high_mae_data_file,
    )
)

In [80]:
# Two-sample t-test per algorithm: per-prediction absolute errors of the low
# group against those of the high group.
ttest_algos: List[str] = []
p_values_low_high: List[float] = []
t_score_low_high: List[float] = []

for algo in low_mae_data['algo'].unique():
    low_mae = low_mae_data[low_mae_data['algo'] == algo]['mae'].tolist()
    high_mae = high_mae_data[high_mae_data['algo'] == algo]['mae'].tolist()
    t_stat, p = stats.ttest_ind(low_mae, high_mae)

    ttest_algos.append(algo)
    p_values_low_high.append(p)
    t_score_low_high.append(t_stat)

# Store the algorithm name next to each result; without it the rows of the CSV
# can only be matched to algorithms by guessing the iteration order.
two_sample_ttest_data: pd.DataFrame = pd.DataFrame({'algo': ttest_algos, 'p_value': p_values_low_high, 't_score': t_score_low_high})

two_sample_ttest_data.to_csv(two_sample_ttest_data_file, encoding='utf-8')


In [84]:
# 5-fold cross-validation: evaluate all algorithms on every fold and tag each
# result frame with the index of the fold it came from
kf = KFold(n_splits=5, random_state=my_seed)
performance_data_cv_list: List[pd.DataFrame] = []
low_mae_data_cv_list: List[pd.DataFrame] = []
med_mae_data_cv_list: List[pd.DataFrame] = []
high_mae_data_cv_list: List[pd.DataFrame] = []

for idx, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print(f'fold:{idx}')
    performance_data_cv, low_mae_data_cv, med_mae_data_cv, high_mae_data_cv = validate(
        trainset=trainset_cv,
        testset=testset_cv,
    )

    for _frame, _collector in (
        (performance_data_cv, performance_data_cv_list),
        (low_mae_data_cv, low_mae_data_cv_list),
        (med_mae_data_cv, med_mae_data_cv_list),
        (high_mae_data_cv, high_mae_data_cv_list),
    ):
        _frame["fold"] = idx
        _collector.append(_frame)

# stack the per-fold frames
performance_data_cv_data: pd.DataFrame = pd.concat(performance_data_cv_list)
low_mae_data_cv_data: pd.DataFrame = pd.concat(low_mae_data_cv_list)
med_mae_data_cv_data: pd.DataFrame = pd.concat(med_mae_data_cv_list)
high_mae_data_cv_data: pd.DataFrame = pd.concat(high_mae_data_cv_list)

# write each stacked frame to its CSV file
for _frame, _path in (
    (performance_data_cv_data, performance_data_cv_file),
    (low_mae_data_cv_data, low_mae_data_cv_file),
    (med_mae_data_cv_data, med_mae_data_cv_file),
    (high_mae_data_cv_data, high_mae_data_cv_file),
):
    _frame.to_csv(_path, encoding='utf-8')

fold:0
Estimating biases using als...
UserItemAvg
All: 
MAE:  38.5778
LowMS: 43.20939266538006
MedMS: 34.009743511116035
HighMS: 40.354254467042935
Computing the cosine similarity matrix...
Done computing similarity matrix.
UserKNN
All: 
MAE:  45.6260
LowMS: 49.845253223588436
MedMS: 42.7074971216775
HighMS: 45.593557982682206
Computing the cosine similarity matrix...
Done computing similarity matrix.
UserKNNAvg
All: 
MAE:  41.8959
LowMS: 46.79212662418838
MedMS: 37.696166555642954
HighMS: 42.93792752845092
NMF
All: 
MAE:  34.7358
LowMS: 38.47888991164326
MedMS: 30.61361159908839
HighMS: 36.742829501939234
fold:1
Estimating biases using als...
UserItemAvg
All: 
MAE:  38.6710
LowMS: 42.95710287284193
MedMS: 34.30439121452419
HighMS: 40.47191653816627
Computing the cosine similarity matrix...
Done computing similarity matrix.
UserKNN
All: 
MAE:  45.7612
LowMS: 49.5074822479507
MedMS: 43.0304446382831
HighMS: 45.89593262230686
Computing the cosine similarity matrix...
Done computing simil

In [85]:
# reload the cross-validation results from disk, in the same order as they were written
performance_data_cv_data, low_mae_data_cv_data, med_mae_data_cv_data, high_mae_data_cv_data = (
    pd.read_csv(_path, encoding='utf-8')
    for _path in (
        performance_data_cv_file,
        low_mae_data_cv_file,
        med_mae_data_cv_file,
        high_mae_data_cv_file,
    )
)