# Importing Necessary Libraries

In [1]:
import pandas as pd
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [2]:
# Read the ratings file and discard its 'Unnamed: 0' column in one step
df = pd.read_csv('final_whiskey_data.csv').drop(columns=['Unnamed: 0'])
df.shape

(41018, 7)

In [3]:
# Allow up to 10,000 characters per cell so long values such as URLs are not truncated
pd.set_option('display.max_colwidth', 10000)

In [4]:
# Preview the first five rating rows
df.head(5)


Unnamed: 0,rating,url,whiskey,User_ID,Whiskey_ID,price(euro),price(dollar)
0,88,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770-glasgow-single-malt,852,0,50.88,56.99
1,87,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770-glasgow-single-malt,459,0,50.88,56.99
2,86,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770-glasgow-single-malt,1205,0,50.88,56.99
3,85,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770-glasgow-single-malt,547,0,50.88,56.99
4,84,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770-glasgow-single-malt,562,0,50.88,56.99


In [5]:
# Turn URL-style slugs (e.g. "1770-glasgow-single-malt") into display names
# (e.g. "1770 Glasgow Single Malt")
df['whiskey'] = (
    df['whiskey']
    .str.replace('-', ' ')
    .str.title()
)

# Build a DataFrame with only the user, item, and rating columns, as required by Surprise

In [6]:
# Surprise's Dataset.load_from_df expects exactly three columns in the order
# user, item, rating. Select them directly (no separate drop of the url/price
# columns is needed) and copy so later edits to rec_df never touch df.
rec_df = df[['User_ID', 'Whiskey_ID', 'rating']].copy()

# DataFrame with one row per Whiskey_ID, used to look up the details (name, URL, price) of each recommended whiskey

In [7]:
# Lookup table: the first row seen for each Whiskey_ID, with its descriptive columns
rec_match = (
    df.loc[:, ['url', 'whiskey', 'Whiskey_ID', 'price(dollar)', 'price(euro)']]
      .drop_duplicates(subset='Whiskey_ID', keep='first')
)

In [8]:
# Preview the first five rows of the whiskey lookup table
rec_match.head(5)

Unnamed: 0,url,whiskey,Whiskey_ID,price(dollar),price(euro)
0,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770 Glasgow Single Malt,0,56.99,50.88
6,https://www.whiskybase.com/whiskies/whisky/125010/a-dream-of-scotland-islay-cask-strength-bw,A Dream Of Scotland Islay Cask Strength Bw,1,89.59,79.99
53,https://www.whiskybase.com/whiskies/whisky/126951/a-dream-of-scotland-the-old-lady-of-islay-bw,A Dream Of Scotland The Old Lady Of Islay Bw,2,87.25,77.9
85,https://www.whiskybase.com/whiskies/whisky/129452/aberfeldy-1996-ca,Aberfeldy 1996 Ca,3,97.44,87.0
87,https://www.whiskybase.com/whiskies/whisky/131516/aberfeldy-1998,Aberfeldy 1998,4,319.2,285.0


In [9]:
# Largest existing user id (whiskey_rater assigns new users this value + 1)
rec_df['User_ID'].max()

1222

# Fitting the model with our data

In [10]:
from surprise import Dataset, Reader

# Tell Surprise that ratings range from 1 to 100, then wrap the
# (user, item, rating) frame in a Surprise Dataset
reader = Reader(rating_scale=(1, 100))
data = Dataset.load_from_df(rec_df, reader)

In [11]:
from surprise import accuracy
from surprise.model_selection import train_test_split

# Hold out 25% of the ratings to measure how accurately the model predicts a rating;
# the fixed random_state keeps the split repeatable
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=100,
)

In [12]:
# Baseline SVD with default hyperparameters.
# SVD initialises its factors randomly, so pin random_state: without it the
# RMSE reported in the next cell changes on every run and cannot be reproduced.
svd = SVD(random_state=100)
svd.fit(trainset)
# Predict every held-out (user, whiskey) pair so the error can be scored
preds = svd.test(testset)

In [45]:
# Score the model: RMSE of the held-out predictions (printed and returned)
accuracy.rmse(preds, verbose=True)

RMSE: 2.3796


2.3796237775556155

# Test function to see how well our model predicts

In [36]:
def recommend_whiskey(uid, iid, actual):
    """Print the model's estimate for one user/whiskey pair and return that whiskey's details.

    Parameters
    ----------
    uid : user id passed to ``svd.predict``
    iid : whiskey id passed to ``svd.predict`` and looked up in ``rec_match``
    actual : the known rating, shown next to the estimate as ``r_ui``

    Returns
    -------
    The rows of ``rec_match`` whose ``Whiskey_ID`` equals ``iid``.
    """
    # verbose=True makes Surprise print the prediction; the returned object is not used
    svd.predict(uid, iid, actual, verbose=True)
    is_requested = rec_match['Whiskey_ID'] == iid
    return rec_match.loc[is_requested]

In [37]:
# Spot check: user 852 is known to have rated whiskey 0 at 88
recommend_whiskey(uid=852, iid=0, actual=88)

user: 852        item: 0          r_ui = 88.00   est = 89.20   {'was_impossible': False}


Unnamed: 0,url,whiskey,Whiskey_ID,price(dollar),price(euro)
0,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770 Glasgow Single Malt,0,56.99,50.88


# Model Tuning

### GridSearch

In [75]:
## Perform a gridsearch with SVD
# Search over the number of latent factors and the regularisation strength
params = {
    'n_factors': [*range(30, 39), 40],
    'reg_all': [0.02, 0.05, 0.1],
}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)
#rmse: 2.33

In [77]:
# Best score per metric, followed by the parameter combination that achieved it
print(g_s_svd.best_score, g_s_svd.best_params, sep='\n')

{'rmse': 2.3328953721149546, 'mae': 1.5086242093244597}
{'rmse': {'n_factors': 34, 'reg_all': 0.1}, 'mae': {'n_factors': 38, 'reg_all': 0.1}}


In [88]:
## Perform a gridsearch with SVD
# Same grid as before, additionally varying the number of training epochs
params = {
    'n_factors': [*range(30, 39), 40],
    'reg_all': [0.02, 0.05, 0.1],
    'n_epochs': (20, 40, 60),
}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)


In [89]:
# Best score per metric, followed by the parameter combination that achieved it
print(g_s_svd.best_score, g_s_svd.best_params, sep='\n')

{'rmse': 2.3136214053763138, 'mae': 1.4924415619575369}
{'rmse': {'n_factors': 40, 'reg_all': 0.1, 'n_epochs': 40}, 'mae': {'n_factors': 40, 'reg_all': 0.1, 'n_epochs': 40}}


### KNN_basic

In [84]:
# cross validating with KNNBasic
# k and min_k are constructor arguments of the KNN algorithms, not similarity
# options. Tuples of candidate values placed inside sim_options are never read,
# so the model was effectively trained with the defaults (k=40, min_k=1).
# State those values explicitly and keep sim_options to similarity settings only.
# To actually tune k / min_k, pass lists of values through GridSearchCV's param_grid.
knn_basic = KNNBasic(k=40, min_k=1, sim_options={'name': 'pearson', 'user_based': True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [85]:
# Show every cross-validation measure, then the mean RMSE across the folds
for entry in cv_knn_basic.items():
    print(entry)
print('-' * 23)
print(cv_knn_basic['test_rmse'].mean())

('test_rmse', array([2.89629519, 2.84793825, 2.46140974, 2.71759606, 2.55949326]))
('test_mae', array([1.72919834, 1.76436181, 1.67818883, 1.72511326, 1.72328209]))
('fit_time', (1.0760939121246338, 1.2453320026397705, 1.5318851470947266, 1.59450101852417, 1.1700971126556396))
('test_time', (2.4097161293029785, 2.6374258995056152, 2.2420060634613037, 1.6377956867218018, 1.2237699031829834))
-----------------------
2.696546500424127


### KNN_baseline

In [86]:
# cross validating with KNNBaseline
# k and min_k are constructor arguments of the KNN algorithms, not similarity
# options. Tuples of candidate values placed inside sim_options are never read,
# so the model was effectively trained with the defaults (k=40, min_k=1).
# State those values explicitly and keep sim_options to similarity settings only.
# To actually tune k / min_k, pass lists of values through GridSearchCV's param_grid.
knn_baseline = KNNBaseline(k=40, min_k=1, sim_options={'name': 'pearson', 'user_based': True})
cv_knn_baseline = cross_validate(knn_baseline, data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [87]:
# Show every cross-validation measure, then display the mean RMSE across the folds
for entry in cv_knn_baseline.items():
    print(entry)

cv_knn_baseline['test_rmse'].mean()

('test_rmse', array([2.40336211, 2.34713519, 2.29184318, 2.4846802 , 2.36150075]))
('test_mae', array([1.53194464, 1.54741206, 1.54163633, 1.55180007, 1.5497515 ]))
('fit_time', (0.9768247604370117, 0.9011051654815674, 0.8609700202941895, 0.8791882991790771, 0.8943300247192383))
('test_time', (1.7123231887817383, 1.3695590496063232, 1.3944129943847656, 1.422109842300415, 1.380979061126709))


2.3777042860619977

# New model with the best parameters

In [93]:
# Refit SVD with the best hyperparameters found by the grid search above.
# random_state pins the random initialisation so the RMSE in the next cell is
# reproducible and differences from the baseline are not just run-to-run noise.
svd = SVD(n_factors=40, reg_all=0.1, n_epochs=40, random_state=100)
svd.fit(trainset)
# Predict every held-out (user, whiskey) pair so the error can be scored
preds = svd.test(testset)

In [94]:
# Score the tuned model: RMSE of the held-out predictions (printed and returned)
accuracy.rmse(preds, verbose=True)

RMSE: 2.3076


2.307609168044974

In [95]:
def recommend_whiskey(uid, iid, actual):
    """Print the current model's estimate for one rating and return the whiskey's details.

    This re-defines the function from the earlier cell with the same body; because
    ``svd`` is looked up when the function is called, it uses whichever model is
    currently bound to that name.

    Parameters
    ----------
    uid : user id passed to ``svd.predict``
    iid : whiskey id passed to ``svd.predict`` and looked up in ``rec_match``
    actual : the known rating, shown next to the estimate as ``r_ui``

    Returns
    -------
    The rows of ``rec_match`` whose ``Whiskey_ID`` equals ``iid``.
    """
    # Called only for its verbose print-out of the prediction
    svd.predict(uid, iid, actual, verbose=True)
    return rec_match.loc[rec_match['Whiskey_ID'].eq(iid)]

In [96]:
# Same spot check as before, now against the tuned model
recommend_whiskey(uid=852, iid=0, actual=88)

user: 852        item: 0          r_ui = 88.00   est = 88.57   {'was_impossible': False}


Unnamed: 0,url,whiskey,Whiskey_ID,price(dollar),price(euro)
0,https://www.whiskybase.com/whiskies/whisky/128130/1770-glasgow-single-malt,1770 Glasgow Single Malt,0,56.99,50.88


# Helper function to return recommendations to the user. Used in the whiskey_rater function below

In [97]:
## add the new ratings to the original ratings DataFrame
def get_recommendations(user_ratings, num_recs, priceUSD):
    """Retrain SVD with a new user's ratings and print their top recommendations.

    Parameters
    ----------
    user_ratings : list of dict (or DataFrame)
        One entry per rated whiskey with keys 'User_ID', 'Whiskey_ID' and 'rating'.
        All entries are expected to belong to the same (new) user.
    num_recs : int
        Maximum number of recommendations to print.
    priceUSD : int or float
        Budget; only whiskies whose 'price(dollar)' is below this are recommended.

    Returns
    -------
    None. The recommendations are printed.
    """
    new_user_df = pd.DataFrame(user_ratings, columns=['User_ID', 'Whiskey_ID', 'rating'])
    if new_user_df.empty:
        print('No ratings were provided, so no recommendations can be made.')
        return
    # Ratings typed at a prompt may arrive as strings; Surprise needs numbers.
    new_user_df['rating'] = pd.to_numeric(new_user_df['rating'])

    # Train on ALL existing ratings plus the new user's ratings. The training frame
    # must come from rec_df (user, item, rating): rec_match has no user or rating
    # columns, and filtering the training rows by price would discard the new
    # user's rows because they carry no price.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    new_ratings_df = pd.concat(
        [rec_df[['User_ID', 'Whiskey_ID', 'rating']], new_user_df],
        ignore_index=True,
    )
    #load in new df
    new_data = Dataset.load_from_df(new_ratings_df, reader)
    #create new svd object
    svd_new = SVD(n_factors=40, reg_all=0.1, n_epochs=40)
    #re fit the model
    svd_new.fit(new_data.build_full_trainset())

    # Candidates: whiskies within budget that this user has not just rated.
    user_id = new_user_df['User_ID'].iloc[0]
    already_rated = set(new_user_df['Whiskey_ID'])
    within_budget = rec_match['price(dollar)'] < priceUSD
    not_rated = ~rec_match['Whiskey_ID'].isin(already_rated)
    candidates = rec_match[within_budget & not_rated]

    # make predictions for the user
    list_of_whiskies = [
        (w_id, svd_new.predict(user_id, w_id).est)
        for w_id in candidates['Whiskey_ID']
    ]

    # order the predictions from highest to lowest rated
    ranked_whiskies = sorted(list_of_whiskies, key=lambda x: x[1], reverse=True)

    for rec_num, (w_id, _) in enumerate(ranked_whiskies[:num_recs], start=1):
        recommended = candidates[candidates['Whiskey_ID'] == w_id].iloc[0]
        print('Recommendation number:', rec_num)
        print('Whiskey: ' + recommended['whiskey'])
        # Show the dollar price: the budget is in USD (the last column is euros).
        print('Price: ' + str(recommended['price(dollar)']))
        print('URL: ' + recommended['url'])
        print('\n')



In [98]:
def whiskey_rater(df, num):
    """Interactively collect whiskey ratings from a new user, then print recommendations.

    The user is asked how many recommendations they want (1-10) and for a budget,
    then shown random whiskies under that budget until `num` of them have been
    rated (or no unseen whiskies remain). Entering 'n' skips a whiskey.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'whiskey', 'url', 'Whiskey_ID' and 'price(dollar)'.
    num : int
        Number of ratings to collect.

    Returns
    -------
    list of dict
        The collected ratings, each with keys 'User_ID', 'Whiskey_ID' and 'rating'.
    """
    # The new user gets the next unused id
    userID = rec_df.User_ID.max() + 1

    # Number of recommendations: re-ask until a whole number from 1 to 10 is given
    # (previously 0, negatives and non-numeric input were not handled).
    prompt = 'How many recommendations would you like? Please enter a number from 1 to 10:\n'
    while True:
        try:
            num_recs = int(input(prompt).strip())
        except ValueError:
            num_recs = 0
        if 1 <= num_recs <= 10:
            break
        prompt = 'Please enter a whole number from 1 to 10 to continue. \n'

    # Budget: accept decimals, and re-ask when nothing is affordable, because
    # sampling from an empty frame would raise.
    while True:
        try:
            priceUSD = float(input('Please enter your budget for a bottle of whiskey.').strip())
        except ValueError:
            print('Please enter the budget as a number, e.g. 75.')
            continue
        affordable = df[df['price(dollar)'] < priceUSD]
        if not affordable.empty:
            break
        print('No whiskies are available under that budget. Please enter a higher amount.')

    rating_list = []
    shown_ids = set()
    remaining = num
    while remaining > 0:
        # Never show the same whiskey twice (rated or skipped); this also
        # guarantees the loop ends when the affordable pool is exhausted.
        pool = affordable[~affordable['Whiskey_ID'].isin(shown_ids)]
        if pool.empty:
            print('\nThere are no more whiskies within your budget to rate.')
            break
        whiskey = pool.sample(1)
        whiskey_id = whiskey['Whiskey_ID'].values[0]
        shown_ids.add(whiskey_id)

        print('\nPlease rate the following {} whiskies. \n'.format(remaining))
        print('Whiskey: ' + whiskey['whiskey'].values[0])
        print('Price: ' + str(whiskey['price(dollar)'].values[0]))
        print('URL: ' + whiskey['url'].values[0])

        # Check for 'n' BEFORE converting to int: int('n') raises ValueError.
        rating = None
        while True:
            answer = input('How do you rate this whiskey on a scale of 1-100, press n if you are not familiar with it. :\n').strip()
            if answer.lower() == 'n':
                break
            try:
                rating = int(answer)
            except ValueError:
                print('Please enter a whole number from 1 to 100, or n to skip.')
                continue
            # Clamp to the rating scale and keep the value numeric
            rating = min(max(rating, 1), 100)
            break

        if rating is None:
            continue
        rating_list.append({'User_ID': userID, 'Whiskey_ID': whiskey_id, 'rating': rating})
        remaining -= 1

    if not rating_list:
        print('\nNo ratings were collected, so no recommendations can be made.')
        return rating_list

    print('\n'+'-----Making Recommendations-----'+'\n')
    get_recommendations(rating_list, num_recs, priceUSD)
    # Return the ratings so callers that assign the result get something useful
    return rating_list
        

In [99]:
# Ask the user to rate five whiskies, then print their recommendations
user_ratings = whiskey_rater(df, num=5)

How many recommendations would you like? Please enter a number from 1 to 10:
5
Please enter your budget for a bottle of whiskey.1000

Please rate the following 5 whiskies. 

Whiskey: Macallan 1964
Price: 952.0
URL: https://www.whiskybase.com/whiskies/whisky/54993/macallan-1964
How do you rate this whiskey on a scale of 1-100, press n if you are not familiar with it. :
100

Please rate the following 4 whiskies. 

Whiskey: Port Ellen 1969 Gm
Price: 0.0
URL: https://www.whiskybase.com/whiskies/whisky/95396/port-ellen-1969-gm
How do you rate this whiskey on a scale of 1-100, press n if you are not familiar with it. :
80

Please rate the following 3 whiskies. 

Whiskey: Glenfarclas 1967
Price: 436.8
URL: https://www.whiskybase.com/whiskies/whisky/12187/glenfarclas-1967
How do you rate this whiskey on a scale of 1-100, press n if you are not familiar with it. :
80

Please rate the following 2 whiskies. 

Whiskey: Glen Grant 1964 Jm
Price: 436.8
URL: https://www.whiskybase.com/whiskies/whisky