In [1]:
import joblib
import pandas as pd

In [2]:
# CONFIG
WINE_ID = 184656
VINTAGE = 2019

In [3]:
pertinent_wine_ratings = pd.read_parquet(f'data/pertinent_wine_ratings.parquet')
pertinent_ratings_non_null = pd.read_parquet(f'data/pertinent_ratings_non_null.parquet')
aggregated_doc_vector = pd.read_csv(f'data/aggregated_doc_vector.csv')
normalized_wine_data = pd.read_parquet(f'data/normalized_wine_data.parquet')

In [4]:
checked_exist_df = pertinent_wine_ratings[(pertinent_wine_ratings['WineID'] == WINE_ID) &
                                         (pertinent_wine_ratings['Vintage'] == VINTAGE)]
checked_exist_df

Unnamed: 0,WineID,Vintage,RegionID,MinRating,MaxRating,AverageRating,CountRating,WineName,Type,Elaborate,ABV,Body,Acidity,Review
897456,184656,2019,3748,2.5,5.0,3.759259,27,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,


In [5]:
checked_review_df = pertinent_ratings_non_null[(pertinent_ratings_non_null['WineID'] == WINE_ID) &
                                                (pertinent_ratings_non_null['Vintage'] == VINTAGE)]
checked_review_df

Unnamed: 0,WineID,Vintage,RegionID,MinRating,MaxRating,AverageRating,CountRating,WineName,Type,Elaborate,ABV,Body,Acidity,Review


In [6]:
wine_text_review_vec = aggregated_doc_vector.copy()
wine_text_review_vec

Unnamed: 0,WineID,Vintage,doc2vec0,doc2vec1,doc2vec2,doc2vec3,doc2vec4,doc2vec5,doc2vec6,doc2vec7,...,doc2vec90,doc2vec91,doc2vec92,doc2vec93,doc2vec94,doc2vec95,doc2vec96,doc2vec97,doc2vec98,doc2vec99
0,102356,2018,0.181159,-0.006130,-0.164011,-0.150916,0.177292,-0.146109,0.199605,0.346257,...,0.371919,0.190709,0.270117,0.079797,0.156480,0.086788,0.022629,-0.143865,-0.143997,-0.150261
1,106708,2003,-0.046031,0.007001,0.182302,-0.208886,-0.107941,-0.386056,-0.135855,0.102471,...,0.512995,0.186848,-0.004358,-0.169124,0.043081,0.084224,0.270940,0.133638,-0.051553,-0.104737
2,106708,2005,-0.015140,-0.017896,0.018712,-0.101483,-0.053692,-0.252250,-0.049264,0.326491,...,0.502280,0.520902,0.110765,0.125864,0.124525,0.089667,0.409296,-0.049737,-0.122226,-0.067335
3,106791,2020,-0.035097,-0.135215,-0.156048,0.056848,0.240830,-0.106365,0.129632,0.320851,...,0.359150,0.147614,0.070216,0.008941,-0.000330,-0.047424,0.202104,0.010392,-0.200048,-0.132026
4,107349,2012,0.075850,-0.094027,-0.153697,-0.129016,0.123982,-0.189865,-0.151056,0.280583,...,0.286978,-0.029751,0.106150,-0.019260,0.186659,0.113496,0.066132,-0.048486,-0.032261,-0.127213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,184812,2016,0.105353,0.114603,-0.068879,-0.275820,0.064449,-0.227751,0.112075,0.215521,...,0.304142,0.311230,-0.001769,0.138936,0.262625,0.025617,0.053793,-0.078375,0.012033,-0.255612
447,186518,2018,0.054457,0.254923,-0.218906,0.057634,0.002584,-0.130813,0.075198,0.219169,...,0.272835,0.179924,0.228031,-0.010529,0.228839,0.169786,0.081230,-0.148209,-0.133244,-0.104150
448,189574,2017,0.041410,-0.066205,-0.035016,-0.017218,0.080182,-0.310808,0.022132,0.371762,...,0.356697,0.149032,0.092841,-0.053855,0.185922,-0.090348,0.129835,-0.013776,-0.101161,-0.142861
449,193535,2020,-0.050885,-0.010638,-0.165054,-0.119412,0.037416,-0.266895,-0.012540,0.304013,...,0.184686,0.025130,0.061286,0.110427,-0.000857,0.020063,0.039312,0.063119,-0.100568,0.076510


In [7]:
reference_wine_composition_and_weather = normalized_wine_data[(normalized_wine_data['WineID'] == WINE_ID) &
                                                                  (normalized_wine_data['Vintage'] == VINTAGE)]

In [8]:
have_text_review = False
if not checked_review_df.empty:
    have_text_review = True
    reference_wine_text_review = wine_text_review_vec[(wine_text_review_vec['WineID'] == WINE_ID) &
                                                        (wine_text_review_vec['Vintage'] == VINTAGE)]

In [9]:
input_wine_composition_and_weather = reference_wine_composition_and_weather.drop(['WineID', 'Vintage', 'WineName'],
                                                                                     axis=1).to_numpy().reshape(1, -1)

In [10]:
if have_text_review:
    input_wine_text_review = reference_wine_text_review.drop(['WineID', 'Vintage'], axis=1).to_numpy().reshape(1, -1)

In [11]:
wine_composition_weather_tree = joblib.load('data/wine_composition_weather_tree.joblib')
wine_text_review_tree = joblib.load('data/wine_text_review_tree.joblib')

In [12]:
dist_composition_weather, ind_composition_weather = wine_composition_weather_tree.query(
    input_wine_composition_and_weather, k=len(normalized_wine_data)
)

if have_text_review:
    dist_text_review, ind_text_review = wine_text_review_tree.query(
        input_wine_text_review, k=len(wine_text_review_vec)
    )

In [13]:
dict_composition_weather_wine = dict(zip(ind_composition_weather[0], dist_composition_weather[0]))

if have_text_review:
    dict_text_review_wine = dict(zip(ind_text_review[0], dist_text_review[0]))

In [14]:
normalized_wine_name_data = normalized_wine_data[['WineID', 'Vintage', 'WineName']]
normalized_wine_name_data['distance_no_text'] = normalized_wine_data.index.map(dict_composition_weather_wine)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normalized_wine_name_data['distance_no_text'] = normalized_wine_data.index.map(dict_composition_weather_wine)


In [15]:
wine_name_data = wine_text_review_vec[['WineID', 'Vintage']]
if have_text_review:
    wine_name_data['distance_text'] = wine_text_review_vec.index.map(dict_text_review_wine)

In [16]:
merged_wine_data = pd.merge(normalized_wine_name_data, wine_name_data, on=['WineID', 'Vintage'], how='left')

if have_text_review:
    # Fill null values with 0 before adding 'ScoreDay1' and 'ScoreDay2'
    merged_wine_data['distance'] = merged_wine_data['distance_no_text'].fillna(0) + merged_wine_data['distance_text'].fillna(0)

    # Drop the redundant 'ScoreDay1' and 'ScoreDay2' columns if needed
    merged_wine_data = merged_wine_data.drop(['distance_no_text', 'distance_text'], axis=1)

In [17]:
if have_text_review:
    distance_column = 'distance'
else:
    distance_column = 'distance_no_text'

In [18]:
nb_wines = 10

In [19]:
sorted_df = merged_wine_data.sort_values(by=[distance_column])

top_wines_plus_one = sorted_df.head(nb_wines + 1)

In [20]:
# Remove the wine itself from the list
if WINE_ID in top_wines_plus_one['WineID'].values and VINTAGE in top_wines_plus_one['Vintage'].values:
    top_wines = top_wines_plus_one[(top_wines_plus_one['WineID'] != WINE_ID) |
                                       (top_wines_plus_one['Vintage'] != VINTAGE)]
else:
    top_wines = top_wines_plus_one.head(nb_wines)

In [21]:
top_wines

Unnamed: 0,WineID,Vintage,WineName,distance_no_text
549314,184656,2020,Fistful of Love Spätburgunder Rosé,0.178116
549311,184656,2017,Fistful of Love Spätburgunder Rosé,0.271675
549312,184656,2018,Fistful of Love Spätburgunder Rosé,0.276938
83648,107237,2019,Weißer Burgunder,0.280627
83624,106848,2019,Grauer Burgunder vom Kalkstein Trocken,0.280633
83690,108914,2019,Weißburgunder Trocken,0.280734
83673,108077,2019,Grauburgunder Trocken,0.281253
83670,107991,2019,Grauburgunder S,0.281904
83640,107039,2019,Weissburgunder Trocken,0.282198
83667,107808,2019,Grauer Burgunder Trocken,0.282549


In [22]:
wine_df = pd.read_csv('data/wines.csv')
wine_df

Unnamed: 0,WineID,WineName,Type,Elaborate,ABV,Body,Acidity,RegionID,WineryID
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,1001,10001
1,100002,Ancellotta,Red,Varietal/100%,12.0,Medium-bodied,Medium,1001,10001
2,100003,Cabernet Sauvignon,Red,Varietal/100%,12.0,Full-bodied,High,1001,10002
3,100004,Virtus Moscato,White,Varietal/100%,12.0,Medium-bodied,Medium,1001,10003
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,11.0,Full-bodied,Medium,1001,10000
...,...,...,...,...,...,...,...,...,...
100641,200791,Rulandské Šedé Výběr z Hroznů,White,Varietal/100%,13.0,Medium-bodied,Medium,2295,67056
100642,200792,Top Collection Merlot Pozdní Sběr,Dessert,Varietal/100%,13.5,Medium-bodied,High,2295,66978
100643,200793,Falter Ego Gelber Muskateller,White,Varietal/100%,12.5,Medium-bodied,High,2086,62547
100644,200794,Reisberg Riesling,White,Varietal/100%,12.5,Medium-bodied,High,2093,62795


In [23]:
region_df = pd.read_parquet('data/regions.parquet')
region_df

Unnamed: 0,RegionID,RegionName,Country,Code,Latitude,Longitude
0,2534,Sennoy (Сенной),Russia,RU,45.288974,36.997299
1,2529,Krasnodar (Краснодарский),Russia,RU,45.036035,38.974571
2,2536,Taman Peninsula (Таманский полуостров),Russia,RU,45.185556,36.791111
3,2528,Gelendzhik (Геленджик),Russia,RU,44.562035,38.090457
4,2525,Anapa (Анапа),Russia,RU,44.887967,37.299109
...,...,...,...,...,...,...
2155,2473,Utrecht,Netherlands,NL,52.090737,5.121420
2156,2470,Gelderland,Netherlands,NL,52.045155,5.871823
2157,2474,Wognum,Netherlands,NL,52.681841,5.022845
2158,2471,Limburg,Netherlands,NL,51.442724,6.060873


In [24]:
# Join the wine_df and region_df
wine_region_df = pd.merge(wine_df, region_df, on='RegionID')
wine_region_df

Unnamed: 0,WineID,WineName,Type,Elaborate,ABV,Body,Acidity,RegionID,WineryID,RegionName,Country,Code,Latitude,Longitude
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,1001,10001,Serra Gaúcha,Brazil,BR,-14.235004,-51.925280
1,100002,Ancellotta,Red,Varietal/100%,12.0,Medium-bodied,Medium,1001,10001,Serra Gaúcha,Brazil,BR,-14.235004,-51.925280
2,100003,Cabernet Sauvignon,Red,Varietal/100%,12.0,Full-bodied,High,1001,10002,Serra Gaúcha,Brazil,BR,-14.235004,-51.925280
3,100004,Virtus Moscato,White,Varietal/100%,12.0,Medium-bodied,Medium,1001,10003,Serra Gaúcha,Brazil,BR,-14.235004,-51.925280
4,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,11.0,Full-bodied,Medium,1001,10000,Serra Gaúcha,Brazil,BR,-14.235004,-51.925280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100641,200665,Karydies Assyrtiko,White,Varietal/100%,11.0,Medium-bodied,High,2335,67589,Dodecanese,Greece,GR,36.337620,27.174706
100642,200668,Cabernet Sauvignon Rosé,Rosé,Varietal/100%,13.0,Full-bodied,Medium,2329,67533,Chania,Greece,GR,35.513830,24.018037
100643,200674,Vranec Barrique,Red,Varietal/100%,13.8,Full-bodied,Medium,2452,68598,Ohrid,North Macedonia,MK,41.123098,20.801648
100644,200759,Reserve Petit Verdot,Red,Varietal/100%,13.8,Full-bodied,High,2424,68356,Sharon,Israel,IL,31.046051,34.851612


In [25]:
rating_wine_df = pd.merge(top_wines, wine_region_df, on=['WineID'])
rating_wine_df

Unnamed: 0,WineID,Vintage,WineName_x,distance_no_text,WineName_y,Type,Elaborate,ABV,Body,Acidity,RegionID,WineryID,RegionName,Country,Code,Latitude,Longitude
0,184656,2020,Fistful of Love Spätburgunder Rosé,0.178116,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,3748,59651,Gothenburg,Sweden,SE,57.70887,11.97456
1,184656,2017,Fistful of Love Spätburgunder Rosé,0.271675,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,3748,59651,Gothenburg,Sweden,SE,57.70887,11.97456
2,184656,2018,Fistful of Love Spätburgunder Rosé,0.276938,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,3748,59651,Gothenburg,Sweden,SE,57.70887,11.97456
3,107237,2019,Weißer Burgunder,0.280627,Weißer Burgunder,White,Varietal/100%,12.5,Medium-bodied,High,1100,14400,Nahe,Germany,DE,53.793639,10.135674
4,106848,2019,Grauer Burgunder vom Kalkstein Trocken,0.280633,Grauer Burgunder vom Kalkstein Trocken,White,Varietal/100%,12.5,Medium-bodied,High,1100,14392,Nahe,Germany,DE,53.793639,10.135674
5,108914,2019,Weißburgunder Trocken,0.280734,Weißburgunder Trocken,White,Varietal/100%,12.5,Medium-bodied,High,1100,14423,Nahe,Germany,DE,53.793639,10.135674
6,108077,2019,Grauburgunder Trocken,0.281253,Grauburgunder Trocken,White,Varietal/100%,12.0,Medium-bodied,High,1100,14409,Nahe,Germany,DE,53.793639,10.135674
7,107991,2019,Grauburgunder S,0.281904,Grauburgunder S,White,Varietal/100%,13.0,Medium-bodied,High,1100,14409,Nahe,Germany,DE,53.793639,10.135674
8,107039,2019,Weissburgunder Trocken,0.282198,Weissburgunder Trocken,White,Varietal/100%,12.0,Medium-bodied,High,1100,14409,Nahe,Germany,DE,53.793639,10.135674
9,107808,2019,Grauer Burgunder Trocken,0.282549,Grauer Burgunder Trocken,White,Varietal/100%,12.5,Medium-bodied,High,1100,14436,Nahe,Germany,DE,53.793639,10.135674


In [27]:
# Select some columns
rating_wine_df = rating_wine_df[['WineID', 'Vintage', 'distance_no_text', 'WineName_x', 'RegionName', 'Country']]

In [28]:
# Rename the columns
rating_wine_df = rating_wine_df.rename(columns={'WineName_x': 'WineName'})

In [29]:
new_rating_wine_df = pd.merge(rating_wine_df, pertinent_wine_ratings, on=['WineID', 'Vintage'])
new_rating_wine_df

Unnamed: 0,WineID,Vintage,distance_no_text,WineName_x,RegionName,Country,RegionID,MinRating,MaxRating,AverageRating,CountRating,WineName_y,Type,Elaborate,ABV,Body,Acidity,Review
0,184656,2020,0.178116,Fistful of Love Spätburgunder Rosé,Gothenburg,Sweden,3748,2.5,4.5,3.583333,12,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,
1,184656,2017,0.271675,Fistful of Love Spätburgunder Rosé,Gothenburg,Sweden,3748,3.5,4.5,3.928571,7,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,
2,184656,2018,0.276938,Fistful of Love Spätburgunder Rosé,Gothenburg,Sweden,3748,1.5,4.0,3.45,10,Fistful of Love Spätburgunder Rosé,Rosé,Varietal/100%,11.5,Medium-bodied,High,
3,107237,2019,0.280627,Weißer Burgunder,Nahe,Germany,1100,2.5,4.5,3.785714,14,Weißer Burgunder,White,Varietal/100%,12.5,Medium-bodied,High,
4,106848,2019,0.280633,Grauer Burgunder vom Kalkstein Trocken,Nahe,Germany,1100,2.0,5.0,3.731707,82,Grauer Burgunder vom Kalkstein Trocken,White,Varietal/100%,12.5,Medium-bodied,High,
5,108914,2019,0.280734,Weißburgunder Trocken,Nahe,Germany,1100,3.5,4.5,3.8,5,Weißburgunder Trocken,White,Varietal/100%,12.5,Medium-bodied,High,
6,108077,2019,0.281253,Grauburgunder Trocken,Nahe,Germany,1100,3.0,5.0,3.866667,15,Grauburgunder Trocken,White,Varietal/100%,12.0,Medium-bodied,High,
7,107991,2019,0.281904,Grauburgunder S,Nahe,Germany,1100,3.0,4.0,3.7,5,Grauburgunder S,White,Varietal/100%,13.0,Medium-bodied,High,
8,107039,2019,0.282198,Weissburgunder Trocken,Nahe,Germany,1100,2.0,5.0,3.617647,17,Weissburgunder Trocken,White,Varietal/100%,12.0,Medium-bodied,High,
9,107808,2019,0.282549,Grauer Burgunder Trocken,Nahe,Germany,1100,1.0,5.0,3.625,16,Grauer Burgunder Trocken,White,Varietal/100%,12.5,Medium-bodied,High,


In [31]:
# Select some columns
new_rating_wine_df = new_rating_wine_df[['WineID', 'Vintage', 'distance_no_text', 'WineName_x', 'Type', 'Elaborate',
                                         'ABV', 'Body', 'Acidity', 'MinRating', 'MaxRating', 'AverageRating',
                                            'RegionName', 'Country']]

In [32]:
# Rename the columns
new_rating_wine_df = new_rating_wine_df.rename(columns={'WineName_x': 'WineName'})

In [33]:
new_rating_wine_df.to_csv(f'data/compare_wine_{WINE_ID}.csv', index=False)