In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
import functools
import statsmodels.formula.api as smf
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Convenience wrapper: forward every argument unchanged to smf.ols and
    # fit the resulting model immediately, returning what .fit() returns.
    # functools.wraps copies smf.ols' name and docstring onto this wrapper.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()
import scipy.stats as st

In [3]:
# Load the raw ratings table from the working directory. Later cells read its
# 'username', 'anime_id' and 'my_score' columns.
df = pd.read_csv('UserAnimeList.csv')

In [4]:
# Keep only the three columns the recommender uses and discard rows whose
# my_score is 0 (presumably "not rated" entries -- confirm against the dataset).
filtered_df = df.loc[df['my_score'] != 0, ['username', 'anime_id', 'my_score']]

In [None]:
def read_xml(file, username):
    """Parse an XML list export into a ratings frame for a single user.

    Every direct child of the XML root is treated as one candidate row. The
    text of its first sub-element (position 0) is used as the anime id and the
    text of its tenth sub-element (position 9) as the score. Children missing
    either value are dropped, as are rows whose score is 0.

    NOTE(review): positions 0 and 9 presumably match the field order of the
    export format being read -- confirm before using other XML layouts.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.
    username : str
        Value stored in the 'username' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns 'anime_id' (int), 'my_score' (int) and 'username' (str) with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If no child of the root has at least ten sub-elements.
    ValueError
        If a retained id or score is not an integer literal.
    """
    import xml.etree.ElementTree as ET

    # ET.parse opens the file itself, closes it when done and decodes it using
    # the encoding declared in the XML rather than the locale default.
    root = ET.parse(file).getroot()

    data = [[subchild.text for subchild in child] for child in root]
    cols = [child.tag for child in root]

    # One column per root child, one row per sub-element position.
    new_list = pd.DataFrame(data).T
    new_list.columns = cols

    df = (
        new_list.loc[[0, 9]].T
        .dropna()
        .rename({0: 'anime_id', 9: 'my_score'}, axis=1)
        .assign(username=username)
        .astype({'anime_id': int, 'my_score': int, 'username': str})
    )
    df = df.loc[df['my_score'] != 0]
    return df.reset_index(drop=True)

def add_user(full_df, xml_file, username):
    """Return full_df with username's ratings replaced by those in xml_file.

    The ratings are parsed with read_xml, every existing row of full_df whose
    'username' equals username is removed, and the parsed rows are appended
    after the remaining ones. The result gets a fresh integer index; full_df
    itself is not modified.
    """
    imported_ratings = read_xml(xml_file, username)
    belongs_to_others = full_df['username'] != username
    combined = [full_df.loc[belongs_to_others], imported_ratings]
    return pd.concat(combined, ignore_index=True)

In [None]:
# Merge the ratings parsed from the Fro116 XML file into the ratings table.
# add_user first removes any rows already stored under that username.
filtered_df = add_user(filtered_df, 'user_profiles/Fro116.xml', 'Fro116')

In [5]:
# Global mean score over every retained rating.
average_rating = filtered_df['my_score'].mean()

# Per-user and per-anime offsets: each group's mean score minus the global
# mean, stored as one-column frames indexed by username / anime_id.
user_bias = (
    filtered_df.groupby('username')['my_score'].mean() - average_rating
).to_frame('user_bias')
anime_bias = (
    filtered_df.groupby('anime_id')['my_score'].mean() - average_rating
).to_frame('anime_bias')

In [6]:
# Attach both bias columns to every rating, derive the residual score left
# after removing the anime bias, the user bias and the global mean, then
# index the table by username and drop incomplete rows.
filtered_df = (
    filtered_df
    .merge(anime_bias, on=['anime_id'])
    .merge(user_bias, on=['username'])
    .assign(
        normalized_score=lambda ratings: (
            ratings['my_score']
            - ratings['anime_bias'] - ratings['user_bias'] - average_rating
        )
    )
    .set_index('username')
    .dropna()
)

In [7]:
def prepare_prediction(recommendee, neighborhood):
    """Build the per-anime prediction inputs for one user.

    Parameters
    ----------
    recommendee : label
        Row label looked up in the module-level user_bias frame.
    neighborhood : pandas.DataFrame
        Ratings with 'anime_id', 'normalized_score' and 'corr' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by anime_id with two columns:
        'delta' -- the corr-weighted sum of normalized_score divided by the
        sum of absolute corr values for that anime;
        'blp' -- anime bias + the recommendee's user bias + global average
        rating (read from the module-level anime_bias, user_bias and
        average_rating).
        Rows with a missing value in either column are dropped.
    """
    def weighted_deviation(ratings):
        weights = ratings['corr']
        return np.dot(ratings['normalized_score'], weights) / weights.abs().sum()

    predictions = pd.DataFrame()
    predictions['delta'] = neighborhood.groupby('anime_id').apply(weighted_deviation)

    recommendee_bias = user_bias.loc[recommendee].squeeze()
    predictions['blp'] = anime_bias + recommendee_bias + average_rating
    return predictions.dropna()

In [8]:
def get_correlation(df, recommendee):
    """Score every user's similarity to recommendee on normalized scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings indexed by 'username' with 'anime_id' and 'normalized_score'
        columns.
    recommendee : label
        Username to compare every other user against.

    Returns
    -------
    pandas.DataFrame
        Indexed by username with a single 'corr' column. The numerator is the
        dot product of the two users' normalized scores over the anime both
        have rated; the denominator is the product of the two users' score
        norms taken over ALL of their ratings. Users with no shared anime (or
        an undefined ratio) are dropped.
    """
    def shared_dot(pairs):
        return np.dot(pairs['normalized_score_x'], pairs['normalized_score_y'])

    def score_norm(ratings):
        scores = ratings['normalized_score']
        return np.sqrt(np.dot(scores, scores))

    # Pair each of the recommendee's ratings with every rating of the same anime.
    recommendee_rows = df.loc[[recommendee]]
    paired = recommendee_rows.merge(df.reset_index(), on='anime_id')
    numerators = paired.groupby('username').apply(shared_dot)

    norms = df.groupby('username').apply(score_norm)
    denominators = norms * norms.loc[recommendee]

    similarities = pd.DataFrame((numerators / denominators), columns=['corr'])
    return similarities.dropna()

In [9]:
def get_errors(df, pred_df, recommendee):
    """Return the RMSE between a user's actual scores and predicted scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings indexed by username with 'anime_id' and 'my_score' columns.
    pred_df : pandas.DataFrame
        Predictions indexed by anime_id (index named 'anime_id') with a
        'score' column.
    recommendee : label
        Username whose ratings are compared against the predictions.

    Returns
    -------
    float
        Root-mean-squared error over the anime present in both the user's
        ratings and pred_df, or NaN when there is no such anime.
    """
    # Selecting with a list keeps the result a DataFrame even when the user
    # has exactly one rating; a bare label would collapse it to a Series and
    # break the column access and set_index below.
    seen = df.loc[[recommendee]]

    recommendee_df = pred_df.loc[pred_df.index.intersection(seen['anime_id'])]
    recommendee_df = recommendee_df.merge(
        seen.set_index('anime_id')['my_score'], on='anime_id')

    errors = recommendee_df['my_score'] - recommendee_df['score']
    if errors.empty:
        return np.nan
    return np.sqrt(np.dot(errors, errors) / len(errors))

In [10]:
def compute_mean_squared_errors(df, recommendee, neighborhood_sizes, test_split=0.10, seed=None):
    """Evaluate prediction RMSE for one user across several neighborhood sizes.

    A random fraction of the recommendee's ratings is held out. Similarities
    to every other user are computed from the remaining ratings, and for each
    requested neighborhood size a no-intercept linear model
    ('target ~ delta + 0') is fitted on the recommendee's retained ratings and
    scored both in sample and on the held-out ratings.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings indexed by 'username' with 'anime_id', 'my_score' and
        'normalized_score' columns.
    recommendee : label
        Username to evaluate.
    neighborhood_sizes : iterable of int
        Number of most-similar users to use. Each size is applied as a
        negative slice start on the similarity table, so 0 selects every
        remaining user.
    test_split : float, default 0.10
        Fraction of the recommendee's ratings to hold out.
    seed : int or None, default None
        Seed for the hold-out sample. None uses the shared `random` module
        state, which matches the previous behavior.

    Returns
    -------
    pandas.DataFrame
        One row per neighborhood size with float columns 'neighborhood_size',
        'is_rmse', 'is_count', 'oos_rmse', 'oos_count' and 'total_count'.

    Notes
    -----
    prepare_prediction reads the module-level anime_bias, user_bias and
    average_rating. NOTE(review): in this notebook those are computed from all
    ratings, including the ones held out here -- confirm that is acceptable.
    """
    # training/test split
    rng = random if seed is None else random.Random(seed)
    # List selection keeps a DataFrame even if the user has a single rating.
    seen_df = df.loc[[recommendee]]
    num_seen_shows = len(seen_df)
    oos_indices = rng.sample(range(num_seen_shows), int(num_seen_shows * test_split))
    oos_df = seen_df.iloc[oos_indices]
    is_held_out = (
        (df.index.get_level_values('username') == recommendee)
        & df['anime_id'].isin(oos_df['anime_id'])
    )
    is_df = df.loc[~is_held_out]

    corrs = get_correlation(is_df, recommendee)
    corrs['similarity'] = corrs['corr'].abs()
    corrs = corrs.sort_values(by='similarity').dropna()
    corrs = corrs.drop(recommendee)  # makes insample score more meaningful

    columns = ['neighborhood_size', 'is_rmse', 'is_count',
               'oos_rmse', 'oos_count', 'total_count']
    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    records = []
    for neighborhood_size in neighborhood_sizes:
        # corrs is sorted by ascending similarity, so the tail holds the most
        # similar users; a size of 0 yields the slice [0:], i.e. all of them.
        nearest = corrs.iloc[-neighborhood_size:]
        neighborhood = df.merge(nearest, on='username').dropna()
        pred_df = prepare_prediction(recommendee, neighborhood)

        recommendee_seen_shows = is_df.loc[[recommendee]].merge(pred_df, on=['anime_id'])
        recommendee_seen_shows['target'] = (
            recommendee_seen_shows['my_score'] - recommendee_seen_shows['blp'])
        model = lm('target ~ delta + 0', recommendee_seen_shows)

        pred_df['score'] = model.predict(pred_df) + pred_df['blp']
        in_oos = pred_df.index.isin(oos_df['anime_id'])
        is_pred_df = pred_df.loc[~in_oos]
        oos_pred_df = pred_df.loc[in_oos]

        records.append({
            'neighborhood_size': neighborhood_size,
            'is_rmse': get_errors(df, is_pred_df, recommendee),
            'is_count': len(is_pred_df),
            'oos_rmse': get_errors(df, oos_pred_df, recommendee),
            'oos_count': len(oos_pred_df),
            'total_count': len(oos_df),
        })

    # Cast to float so the table keeps the dtypes the append-based version
    # produced (counts were reported as floats).
    return pd.DataFrame(records, columns=columns).astype(float)

In [11]:
%%time
# Candidate neighborhood sizes: the powers of two 2**0 .. 2**int(ln(len(filtered_df))),
# followed by 0. compute_mean_squared_errors uses each size as a negative slice
# start on the similarity table, so the trailing 0 selects every remaining user.
# NOTE(review): np.log is the natural logarithm, so the largest power of two is
# far below the number of rows -- confirm whether np.log2 was intended.
neighborhood_sizes = [2**i for i in range(int(np.log(len(filtered_df))) + 1)] + [0]
# The returned table is the cell's last expression, so it is displayed as the output.
compute_mean_squared_errors(filtered_df, 'Fro116', neighborhood_sizes)

CPU times: user 4min 35s, sys: 52.9 s, total: 5min 28s
Wall time: 6min 20s


Unnamed: 0,neighborhood_size,is_rmse,is_count,oos_rmse,oos_count,total_count
0,1.0,1.13582,21.0,1.759427,1.0,16.0
1,2.0,1.571802,86.0,4.214271,2.0,16.0
2,4.0,1.50516,100.0,3.800794,3.0,16.0
3,8.0,1.553129,110.0,3.442477,4.0,16.0
4,16.0,1.466288,130.0,2.02721,4.0,16.0
5,32.0,1.379385,484.0,1.448169,8.0,16.0
6,64.0,1.397407,927.0,2.037591,11.0,16.0
7,128.0,1.319221,1554.0,1.965931,13.0,16.0
8,256.0,1.133303,2130.0,1.877851,14.0,16.0
9,512.0,0.938076,3109.0,1.792406,14.0,16.0
