### Importing the dataset

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import joblib

In [13]:
df = pd.read_csv('../dataset/XWines_with_Ratings.csv', low_memory=False)

In [14]:
df.head()

Unnamed: 0,RatingID,UserID,Rating,WineName,Vintage,Type,Elaborate,ABV,Body,Acidity,...,Harmonize12,Grapes1,Grapes2,Grapes3,Grapes4,Grapes5,Grapes6,Grapes7,Grapes8,Grapes9
0,326545,1756594,4.0,Espumante Moscatel,1999,Sparkling,Varietal/100%,8.0,Mediumbodied,High,...,,Muscat/Moscato,,,,,,,,
1,1314107,1219305,2.5,Espumante Moscatel,2007,Sparkling,Varietal/100%,8.0,Mediumbodied,High,...,,Muscat/Moscato,,,,,,,,
2,1446366,2047929,3.5,Espumante Moscatel,2008,Sparkling,Varietal/100%,8.0,Mediumbodied,High,...,,Muscat/Moscato,,,,,,,,
3,1448872,1006545,5.0,Espumante Moscatel,2008,Sparkling,Varietal/100%,8.0,Mediumbodied,High,...,,Muscat/Moscato,,,,,,,,
4,1657104,1400823,2.0,Espumante Moscatel,2008,Sparkling,Varietal/100%,8.0,Mediumbodied,High,...,,Muscat/Moscato,,,,,,,,


### Creating a Pivot Table to Convert User Ratings to a Wide Format

In [15]:
df2 = pd.pivot_table(df, values=['Rating'],
                            index=['WineName', 'UserID'],
                            aggfunc='mean').unstack()
df2 = df2.fillna(0)
df2

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
UserID,1000004,1000010,1000021,1000023,1000024,1000025,1000027,1000029,1000031,1000036,...,2058533,2060068,2060099,2060262,2060383,2061042,2061195,2062232,2062388,2062618
WineName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10 Anos Tawny Porto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Year Old Tawny Port,3.5,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Year Old Tawny Porto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Moscato d Asti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 e Mezzo Primitivo del Salento,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zweigelt Classic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zweigeltrebe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Úrágya Tokaji Furmint,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ürziger Würzgarten Riesling Kabinett,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating Cosine Similarity and Euclidean Distance Matrices

In [16]:
cosine_sim = cosine_similarity(df2)
cosine_sim_df = pd.DataFrame(cosine_sim, index=df2.index, columns=df2.index)
cosine_sim_df

WineName,10 Anos Tawny Porto,10 Year Old Tawny Port,10 Year Old Tawny Porto,101 Moscato d Asti,12 e Mezzo Primitivo del Salento,20 Anos Tawny Porto,20 Year Old Tawny Port,20 Years Old Tawny Porto,21 Gables Chenin Blanc,A Galet Rosado,...,Winemaker s Collection Blanc de Noir,Winemaker s Collection Montepulciano d Abruzzo,Woodthorpe Vineyard Syrah,Zinfandel,Zweigelt,Zweigelt Classic,Zweigeltrebe,Úrágya Tokaji Furmint,Ürziger Würzgarten Riesling Kabinett,Красностоп Золотовский Krasnostop Zolotovskiy
WineName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Anos Tawny Porto,1.000000,0.045213,0.000000,0.000000,0.011653,0.000000,0.021286,0.000000,0.038204,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018372
10 Year Old Tawny Port,0.045213,1.000000,0.025639,0.047145,0.073112,0.010292,0.062177,0.043146,0.061025,0.000000,...,0.000000,0.007823,0.031786,0.01283,0.008708,0.000000,0.018565,0.016206,0.040775,0.028322
10 Year Old Tawny Porto,0.000000,0.025639,1.000000,0.000000,0.006419,0.000000,0.023941,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.061881,0.000000,0.000000,0.000000,0.000000,0.000000
101 Moscato d Asti,0.000000,0.047145,0.000000,1.000000,0.007088,0.000000,0.031213,0.000000,0.039055,0.000000,...,0.000000,0.000000,0.064360,0.00000,0.000000,0.042447,0.000000,0.000000,0.021028,0.020434
12 e Mezzo Primitivo del Salento,0.011653,0.073112,0.006419,0.007088,1.000000,0.012885,0.038477,0.011343,0.033697,0.010544,...,0.016346,0.008747,0.016848,0.00000,0.005451,0.013545,0.000000,0.006284,0.006710,0.031559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zweigelt Classic,0.000000,0.000000,0.000000,0.042447,0.013545,0.000000,0.000000,0.000000,0.041141,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Zweigeltrebe,0.000000,0.018565,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.023530,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
Úrágya Tokaji Furmint,0.000000,0.016206,0.000000,0.000000,0.006284,0.000000,0.000000,0.000000,0.031023,0.000000,...,0.000000,0.000000,0.112958,0.00000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
Ürziger Würzgarten Riesling Kabinett,0.000000,0.040775,0.000000,0.021028,0.006710,0.000000,0.000000,0.029911,0.054026,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.063673,0.000000,0.000000,0.000000,1.000000,0.000000


In [17]:
dist_euc = euclidean_distances(df2)
dist_euc_df = pd.DataFrame(dist_euc, index=df2.index, columns=df2.index)
dist_euc_df

WineName,10 Anos Tawny Porto,10 Year Old Tawny Port,10 Year Old Tawny Porto,101 Moscato d Asti,12 e Mezzo Primitivo del Salento,20 Anos Tawny Porto,20 Year Old Tawny Port,20 Years Old Tawny Porto,21 Gables Chenin Blanc,A Galet Rosado,...,Winemaker s Collection Blanc de Noir,Winemaker s Collection Montepulciano d Abruzzo,Woodthorpe Vineyard Syrah,Zinfandel,Zweigelt,Zweigelt Classic,Zweigeltrebe,Úrágya Tokaji Furmint,Ürziger Würzgarten Riesling Kabinett,Красностоп Золотовский Krasnostop Zolotovskiy
WineName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Anos Tawny Porto,0.000000,92.834530,19.748418,28.600699,85.486841,22.011361,48.523190,24.949950,56.307193,22.710130,...,19.943671,23.318448,26.734809,26.052831,23.895606,21.534855,18.261982,24.989998,32.714676,36.854443
10 Year Old Tawny Port,92.834530,0.000000,92.513513,93.996011,120.117651,93.154442,100.400448,93.240281,104.147732,93.471921,...,92.838570,93.493315,93.872254,94.085068,93.617573,93.193347,92.340403,93.751000,95.281163,97.085014
10 Year Old Tawny Porto,19.748418,92.513513,0.000000,25.980762,84.764379,18.479719,47.106263,21.897488,55.637218,19.306735,...,15.960890,20.018741,23.911294,23.146274,20.099751,17.909495,13.802174,21.943108,30.450780,35.146124
101 Moscato d Asti,28.600699,93.996011,25.980762,0.000000,87.160771,27.739863,51.029403,30.124741,58.510683,28.297526,...,26.129485,28.788018,30.590031,31.044323,29.257478,26.846788,24.869660,30.157918,36.431442,40.388736
12 e Mezzo Primitivo del Salento,85.486841,120.117651,84.764379,87.160771,0.000000,85.202699,94.315958,85.985464,98.673705,85.409309,...,84.695041,85.593516,86.381422,86.520229,85.802098,85.080844,84.501479,86.090069,88.573416,89.488826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zweigelt Classic,21.534855,93.193347,17.909495,26.846788,85.080844,20.377684,48.137823,23.521267,55.733742,21.130547,...,18.124569,21.783021,25.406692,24.688054,22.399777,0.000000,16.255768,23.563743,31.638584,36.180105
Zweigeltrebe,18.261982,92.340403,13.802174,24.869660,84.501479,16.881943,46.765372,20.566964,54.936327,17.783419,...,14.080128,18.553975,22.699119,21.891779,19.274335,16.255768,0.000000,20.615528,29.508473,34.332929
Úrágya Tokaji Furmint,24.989998,93.751000,21.943108,30.157918,86.090069,24.000000,49.779514,26.720778,57.148928,24.642443,...,22.118996,25.204166,26.753504,27.753378,25.739075,23.563743,20.615528,0.000000,34.084454,38.337319
Ürziger Würzgarten Riesling Kabinett,32.714676,95.281163,30.450780,36.431442,88.573416,31.964824,54.071712,33.581989,60.072872,32.449961,...,30.577770,32.878564,35.383612,34.871192,32.330326,31.638584,29.508473,34.084454,0.000000,43.766425


### Testing the Model

In [38]:
def find_similar_wines_combined(wines, count=10):
    recommendation = {}

    wines_cosine = [wine for wine in wines if wine in cosine_sim_df.columns]
    wines_summed_cosine = cosine_sim_df[wines_cosine].apply(lambda row: np.sum(row), axis=1)
    wines_summed_cosine = wines_summed_cosine.sort_values(ascending=False)
    ranked_wines_cosine = wines_summed_cosine.index[wines_summed_cosine.index.isin(wines_cosine) == False]
    ranked_wines_cosine = ranked_wines_cosine.tolist()
    ranked_wines_cosine = ranked_wines_cosine[:count]

    wines_euclidean = [wine for wine in wines if wine in dist_euc_df.columns]
    wines_summed_euclidean = dist_euc_df[wines_euclidean].apply(lambda row: np.sum(row), axis=1)
    wines_summed_euclidean = wines_summed_euclidean.sort_values()
    ranked_wines_euclidean = wines_summed_euclidean.index[wines_summed_euclidean.index.isin(wines_euclidean) == False]
    ranked_wines_euclidean = ranked_wines_euclidean.tolist()
    ranked_wines_euclidean = ranked_wines_euclidean[:count]

    combined_ranking = {}
    for i, wine in enumerate(ranked_wines_cosine):
        combined_ranking[wine] = combined_ranking.get(wine, 0) + i

    for i, wine in enumerate(ranked_wines_euclidean):
        combined_ranking[wine] = combined_ranking.get(wine, 0) + i

    sorted_combined_ranking = sorted(combined_ranking.items(), key=lambda x: x[1])

    top_10_wine_indices = [df.index[df['WineName'] == wine].tolist()[0] for wine, _ in sorted_combined_ranking[:count]]
    top_10_wines = df.loc[top_10_wine_indices, ['WineName']].copy()

    recommendation = {
        'WineName': df['WineName'].iloc[top_10_wine_indices],
    }

    return pd.DataFrame(recommendation)

In [22]:
name = df['WineName'].sample(1).values[0]

In [39]:
display(f'Recommendation for: {name}')
find_similar_wines_combined(['Vecchia Grappa Moscato'], 10)

'Recommendation for: Rouge  Gaston Hochar '

Unnamed: 0,WineName
8897,STORIA Grappa Merlot
1683,Cave Cabernet Sauvignon
76810,Garnacha Rosé
114947,A Rosé Is A Rosé Is A Rosé
149986,Grüner Veltliner Alte Reben
75832,Perlé Nero
76771,Fragolino
8308,Reserva Chardonnay
24787,Trollinger Lemberger Trocken
7279,Colheita Tardia Malvasia Bianca


### Exporting the model

In [32]:
joblib.dump((cosine_sim_df, dist_euc_df, df), '../models/collaborative_recommender_model.pkl', compress=('zlib', 3))

['../models/collaborative_recommender_model.pkl']