In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.cluster import KMeans

# import and run file
# NOTE(review): both steps below load the `data_preparation` notebook.
# The importnb context manager imports it under the module name
# `data_preparation`, and `%run` then executes it again in this notebook's
# own namespace. Later cells use `reviews` and `restaurants`, which are not
# defined anywhere in this notebook, so they presumably come from the `%run`
# step -- TODO confirm whether the importnb import is still needed, since
# the preparation notebook is otherwise executed twice.
from importnb import imports
with imports("ipynb"):
    import data_preparation
%run data_preparation.ipynb

In [2]:
# aggregate data for each unique user
def _most_common(values):
    """Return the most frequent non-missing value of a Series, or NaN if none.

    `Series.mode()` ignores missing values and returns an empty Series when
    every value is missing, so indexing its first element unconditionally
    would raise. When several values tie, the first entry returned by
    `mode()` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Select the reviewer identifier and the demographic columns from `reviews`.
data = reviews[['Reviewer Name', 'Birth Year', 'Marital Status', 'Has Children?',
                'Vegetarian?', 'Weight (lb)', 'Height (in)', 'Average Amount Spent',
                'Preferred Mode of Transport', 'Northwestern Student?']]

# Weight and height are averaged; every other attribute takes its most
# common value. Building the spec from `data.columns` keeps the output
# columns in the same order as the selection above.
mean_columns = {'Weight (lb)', 'Height (in)'}
user_aggregations = {
    column: (column, 'mean' if column in mean_columns else _most_common)
    for column in data.columns
    if column != 'Reviewer Name'
}

# One row per reviewer.
data_users = data.groupby(['Reviewer Name']).agg(**user_aggregations).reset_index()

# One-hot encode the categorical attributes; drop_first drops one level per
# attribute so the indicator columns are not redundant.
data_users = pd.get_dummies(
    data_users,
    columns=['Marital Status', 'Has Children?', 'Vegetarian?', 'Average Amount Spent',
             'Preferred Mode of Transport', 'Northwestern Student?'],
    drop_first=True, dtype=int
)

# Feature matrix without the identifier column.
data_demographics = data_users.drop(columns=['Reviewer Name'])

In [None]:
# user scores of each restaurant
# Average the ratings so each (reviewer, restaurant) pair has a single score.
data_scores = (
    reviews[['Reviewer Name', 'Restaurant Name', 'Rating']]
    .groupby(['Reviewer Name', 'Restaurant Name'])
    .agg(Rating=('Rating', 'mean'))
    .reset_index()
)
# Wide table: one row per restaurant, one column per reviewer (NaN where unrated).
data_scores_table = data_scores.pivot(
    index='Restaurant Name', columns='Reviewer Name', values='Rating'
).reset_index()

# only consider restaurants that we have information on
# Filtering against an exclusion set (instead of list.remove) does not raise
# when the excluded name is absent and removes every occurrence of it.
EXCLUDED_RESTAURANTS = {'Evanston Games & Cafe'}
restaurants_considered = [
    restaurant
    for restaurant in restaurants['Restaurant Name']
    if restaurant not in EXCLUDED_RESTAURANTS
]
data_scores_table = data_scores_table[data_scores_table['Restaurant Name'].isin(restaurants_considered)]

In [3]:
# k-means clustering in order to impute sparseness

N_CLUSTERS = 4
# Fixed seed: without it KMeans initialises randomly, so the cluster labels
# (and every imputation derived from them) would differ between runs.
RANDOM_SEED = 42

# first prepare the data for clustering
numeric_columns = ['Birth Year', 'Weight (lb)', 'Height (in)']
data_users_num = data_demographics[numeric_columns]
data_users_cat = data_demographics.drop(columns=numeric_columns)

# Standardise only the numeric columns, then append the remaining columns unchanged.
scaler = StandardScaler()
data_users_num_scaled = scaler.fit_transform(data_users_num)
data_users_scaled = np.column_stack((data_users_num_scaled, data_users_cat))

# find which cluster each user belongs to
# NOTE(review): max_iter=10 is far below scikit-learn's default; kept as-is, confirm it is intentional.
kmeans_labels = KMeans(
    n_clusters=N_CLUSTERS, n_init=10, max_iter=10, random_state=RANDOM_SEED
).fit_predict(data_users_scaled)



In [None]:
# merge user names with which cluster they belong to
# Build the frame column by column so `cluster` keeps its integer dtype;
# stacking names and labels into a single array would coerce both to object.
data_users_clustered = pd.DataFrame({
    'Reviewer Name': data_users['Reviewer Name'].to_numpy(),
    'cluster': kmeans_labels,
})
data_scores_clustered = data_scores.merge(data_users_clustered, on='Reviewer Name')

# find average rating by restaurant for each cluster
# (rows: restaurants, columns: cluster ids)
avg_scores_clustered = (
    data_scores_clustered
    .groupby(['cluster', 'Restaurant Name'])['Rating']
    .mean()
    .unstack(level=0)
)

# if a cluster has no rating for a restaurant, fall back to that restaurant's
# mean over the clusters that did rate it
avg_scores_clustered = avg_scores_clustered.apply(lambda row: row.fillna(row.mean()), axis=1)

# Long format, one row per (restaurant, cluster). Melting every cluster column
# (rather than a hard-coded [0, 1, 2, 3]) keeps this valid for any cluster
# count, and var_name pins the key column name used by the merge below.
avg_scores_clustered = (
    avg_scores_clustered
    .melt(var_name='cluster', value_name='value', ignore_index=False)
    .reset_index()
)

# merge user name with their cluster data
data_user_avg_scores_clustered = data_users_clustered.merge(avg_scores_clustered, how='right', on='cluster')
data_user_avg_scores_clustered = data_user_avg_scores_clustered[['Reviewer Name', 'Restaurant Name', 'value']]
data_avg_scores_table = data_user_avg_scores_clustered.pivot(
    index='Restaurant Name', columns='Reviewer Name', values='value'
).reset_index()
data_avg_scores_table = data_avg_scores_table[data_avg_scores_table['Restaurant Name'].isin(restaurants_considered)]

# impute missingness in user scores of each restaurant
# fillna(DataFrame) aligns on index and columns, so key both tables by
# restaurant name instead of relying on their leftover positional indices
# happening to match.
data_scores_table_complete = (
    data_scores_table
    .set_index('Restaurant Name')
    .fillna(data_avg_scores_table.set_index('Restaurant Name'))
    .reset_index()
)

In [None]:
# Pairwise cosine distance between users based on their completed rating vectors.
# Transpose so each row is one user's ratings across restaurants.
user_ratings = data_scores_table_complete.drop(columns=['Restaurant Name']).T

# Build the matrix once and pass it as a single argument: scikit-learn then
# treats this as a self-comparison and sets the diagonal to exactly 0,
# instead of leaving floating-point residue on each user's self-distance.
# Labels come from the matrix itself rather than a positional `columns[1:]`.
user_scores_cosine = pd.DataFrame(
    cosine_distances(user_ratings),
    index=user_ratings.index,
    columns=user_ratings.index,
)

In [None]:
def collab_filter_scores(name, n_similar):
    """Recommend the top-rated restaurants of the users most similar to `name`.

    Similarity is read from the module-level `user_scores_cosine` distance
    matrix (smaller distance means more similar); ratings are read from the
    module-level `reviews` frame.

    Parameters
    ----------
    name : str
        Reviewer to recommend for; must be a label of `user_scores_cosine`.
    n_similar : int
        Number of nearest users whose favourites are returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'Reviewer Name', 'Restaurant Name', 'Rating'. One row per
        review in which a similar user gave their own maximum rating (ties
        are kept), ordered by reviewer name. Empty if nothing matches.

    Raises
    ------
    KeyError
        If `name` is not present in `user_scores_cosine`.
    """
    if name not in user_scores_cosine.columns:
        raise KeyError(f"{name!r} is not a reviewer in user_scores_cosine")

    # Rank every other user by distance to `name` and keep the closest ones.
    distances = user_scores_cosine.loc[user_scores_cosine.index != name, name]
    most_similar_users = distances.sort_values(ascending=True).index[:n_similar]

    candidate_reviews = reviews[reviews['Reviewer Name'].isin(most_similar_users)]

    # Broadcast each reviewer's best rating to their rows and keep the rows
    # that reach it. This avoids groupby.apply and the positional 'level_1'
    # column it produced, which broke on empty results.
    best_rating = candidate_reviews.groupby('Reviewer Name')['Rating'].transform('max')
    top_recs = candidate_reviews[candidate_reviews['Rating'] == best_rating]

    # Stable sort keeps the original review order within each reviewer.
    return (
        top_recs[['Reviewer Name', 'Restaurant Name', 'Rating']]
        .sort_values('Reviewer Name', kind='stable')
        .reset_index(drop=True)
    )

In [None]:
# Example: recommendations drawn from the 2 users most similar to this reviewer.
# Left as the cell's last expression so the notebook renders the DataFrame
# as a table instead of printing its plain-text repr.
collab_filter_scores('Timothy Mace', 2)