In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

In [100]:
# Hyper-Parameters
PERTINENT_YEAR = 1960
MIN_NBCOUNT_TO_BE_PERTINENT = 4
WEIGTH_COMPOSITION_VS_WEATHER = 1
NUMBER_OF_WINES_TO_RECOMMEND = 10

# Read data and clean data

In [3]:
test= pq.read_table('./data/wine_xwine_vivino.parquet').to_pandas()
print(test.shape)

#save to csv
test[['WineID', 'Vintage', 'Review']].to_csv('./data/test_nb_review.csv', index=False)

(10859, 16)


In [4]:
wine_ratings = pq.read_table('./data/all_wine_xwine_vivino.parquet').to_pandas()
print(wine_ratings.shape)
wine_ratings.head()

(1015470, 14)


Unnamed: 0,WineID,Vintage,RegionID,MinRating,MaxRating,AverageRating,CountRating,WineName,Type,Elaborate,ABV,Body,Acidity,Review
0,100001,1988,1001,3.0,5.0,4.0,2,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,
1,100001,1999,1001,4.0,5.0,4.5,2,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,
2,100001,2007,1001,2.5,2.5,2.5,1,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,
3,100001,2008,1001,1.0,5.0,3.630952,42,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,
4,100001,2009,1001,4.0,4.0,4.0,1,Espumante Moscatel,Sparkling,Varietal/100%,7.5,Medium-bodied,High,


In [5]:
#cast Vintage to int
wine_ratings['Vintage'] = wine_ratings['Vintage'].astype(int)
print(wine_ratings.columns)

Index(['WineID', 'Vintage', 'RegionID', 'MinRating', 'MaxRating',
       'AverageRating', 'CountRating', 'WineName', 'Type', 'Elaborate', 'ABV',
       'Body', 'Acidity', 'Review'],
      dtype='object')


In [6]:
#keep only where CountRating >= 4 and Vintage >= 1960
pertinent_wine_ratings = wine_ratings[wine_ratings['CountRating'] >= MIN_NBCOUNT_TO_BE_PERTINENT]
pertinent_wine_ratings = pertinent_wine_ratings[pertinent_wine_ratings['Vintage'] >= PERTINENT_YEAR]
print(pertinent_wine_ratings.shape)

(590946, 14)


# Add weather features to the table

In [7]:
# Add weather data to the dataframe by joining with ../weather_data/agg_monthly.parquet
weather_data = pd.read_parquet('./data/agg_quarterly.parquet') 

# Take into account only the weather data from 1960
weather_data = weather_data[weather_data['year'] >= PERTINENT_YEAR]

# Take into account only columns: RegionID, year, avg_temperature, avg_sunshine_duration, avg_precipitation, avg_rain, avg_humidity, avg_soil_temperature, avg_soil_moisture
weather_data = weather_data[['RegionID', 'year', 'quarter', 'avg_temperature', 'avg_sunshine_duration', 'avg_precipitation', 'avg_rain', 'avg_humidity', 'avg_soil_temperature', 'avg_soil_moisture']]
# Create a new dataframe with the data for each region and year, and with a group of column for each quarter
for index, row in weather_data.iterrows():
    quarter = int(row['quarter'])
    weather_data.at[index, f'avg_temperature_q{quarter}'] = row['avg_temperature']
    weather_data.at[index, f'avg_sunshine_duration_q{quarter}'] = row['avg_sunshine_duration']
    weather_data.at[index, f'avg_precipitation_q{quarter}'] = row['avg_precipitation']
    weather_data.at[index, f'avg_rain_q{quarter}'] = row['avg_rain']
    weather_data.at[index, f'avg_humidity_q{quarter}'] = row['avg_humidity']
    weather_data.at[index, f'avg_soil_temperature_q{quarter}'] = row['avg_soil_temperature']
    weather_data.at[index, f'avg_soil_moisture_q{quarter}'] = row['avg_soil_moisture']
    
# Drop the columns that are not needed anymore
weather_data = weather_data.drop(['quarter', 'avg_temperature', 'avg_sunshine_duration', 'avg_precipitation', 'avg_rain', 'avg_humidity', 'avg_soil_temperature', 'avg_soil_moisture'], axis=1)

# Group the data by RegionID and year and take the max of each column
weather_data = weather_data.groupby(['RegionID', 'year']).max().reset_index()

In [8]:
# rename the column RegionID to Region
weather_data.head()

Unnamed: 0,RegionID,year,avg_temperature_q1,avg_sunshine_duration_q1,avg_precipitation_q1,avg_rain_q1,avg_humidity_q1,avg_soil_temperature_q1,avg_soil_moisture_q1,avg_temperature_q2,...,avg_humidity_q3,avg_soil_temperature_q3,avg_soil_moisture_q3,avg_temperature_q4,avg_sunshine_duration_q4,avg_precipitation_q4,avg_rain_q4,avg_humidity_q4,avg_soil_temperature_q4,avg_soil_moisture_q4
0,1000,1960,20.543956,39951.309011,3.235165,3.235165,82.087912,20.421978,0.46338,13.495604,...,81.035326,13.525861,0.474124,18.98587,34659.49087,4.454348,4.454348,79.113678,17.784466,0.441081
1,1000,1961,21.013333,34062.769889,4.664444,4.664444,82.59213,20.753333,0.424137,15.089011,...,81.089674,15.644928,0.416095,19.41087,35479.610978,6.15,6.15,81.992301,18.734375,0.472089
2,1000,1962,20.167778,37664.319778,2.975556,2.975556,78.656944,20.257315,0.399439,13.474725,...,83.466938,13.249366,0.432844,17.871739,38923.577935,3.016304,3.016304,75.85462,16.668614,0.43059
3,1000,1963,20.976667,32839.231111,6.416667,6.416667,83.199537,20.654398,0.447973,14.848352,...,81.668931,15.242165,0.480356,18.468478,31610.565109,7.048913,7.048913,82.24683,17.791033,0.473822
4,1000,1964,20.557143,38296.825604,2.035165,2.035165,76.513736,20.588782,0.378099,13.912088,...,85.091938,13.004801,0.472649,16.822826,38146.66,2.823913,2.823913,78.979167,16.364629,0.449564


In [9]:
print(weather_data.shape)

(135702, 30)


In [10]:
#Join the weather data with the pertinent_wine_ratings dataframe
pertinent_wine_ratings_with_weather = pertinent_wine_ratings.merge(weather_data, left_on=['RegionID', 'Vintage'], right_on=['RegionID', 'year'])
# Drop the column year
pertinent_wine_ratings_with_weather = pertinent_wine_ratings_with_weather.drop(['year'], axis=1)

print(pertinent_wine_ratings_with_weather.shape)

(589126, 42)


In [11]:
print(pertinent_wine_ratings_with_weather.columns)

Index(['WineID', 'Vintage', 'RegionID', 'MinRating', 'MaxRating',
       'AverageRating', 'CountRating', 'WineName', 'Type', 'Elaborate', 'ABV',
       'Body', 'Acidity', 'Review', 'avg_temperature_q1',
       'avg_sunshine_duration_q1', 'avg_precipitation_q1', 'avg_rain_q1',
       'avg_humidity_q1', 'avg_soil_temperature_q1', 'avg_soil_moisture_q1',
       'avg_temperature_q2', 'avg_sunshine_duration_q2',
       'avg_precipitation_q2', 'avg_rain_q2', 'avg_humidity_q2',
       'avg_soil_temperature_q2', 'avg_soil_moisture_q2', 'avg_temperature_q3',
       'avg_sunshine_duration_q3', 'avg_precipitation_q3', 'avg_rain_q3',
       'avg_humidity_q3', 'avg_soil_temperature_q3', 'avg_soil_moisture_q3',
       'avg_temperature_q4', 'avg_sunshine_duration_q4',
       'avg_precipitation_q4', 'avg_rain_q4', 'avg_humidity_q4',
       'avg_soil_temperature_q4', 'avg_soil_moisture_q4'],
      dtype='object')


# Create and train a doc2vec model on the colume 'review'

In [12]:
#drop null values in review column
pertinent_ratings_non_null = pertinent_wine_ratings.dropna(subset=['Review'])
print(pertinent_ratings_non_null.shape)

(9019, 14)


In [13]:
reviews = pertinent_ratings_non_null['Review']
unique_reviews = reviews.unique()

#convert to pandas series
unique_reviews = pd.Series(unique_reviews)
print(len(unique_reviews))

8699


In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) | set(stopwords.words('french')) | set(stopwords.words('spanish'))
import string

def clean_review_for_doc2vec(review):
    #Clean the unique reviews series with lower case and remove punctuation
    review = review.lower()
    review = review.translate(str.maketrans('', '', string.punctuation))
    #Remove stop words in each review
    review = review.split()
    review = [word for word in review if not word in stop_words]
    review = ' '.join(review)
    #remove emojis in each review
    review = review.encode('ascii', 'ignore').decode('ascii')
    return review

unique_reviews_no_punctuation_no_stop_word = unique_reviews.apply(clean_review_for_doc2vec)

In [15]:
print(unique_reviews[1])
print(unique_reviews_no_punctuation_no_stop_word[1])

Qta. Manoella’s flagship cuvée; a field blend of 30+ varieties; 120+yo vines; 20m in 50% new oak. Tinta Francisca is unusually a high proportion of the blend.Deep ruby. Fantastic fragrance w. tobacco; leather & stylish fruit of black cherry & layered blackberry w. herbal notes. Developing w. loganberry & mulberry.Dry. V. rich & pure w. cherry; raspberry; loganberry; blueberry & black cherry. Generous m+ fine tannins; m+ acid; high alc; discreet oak & a huge finish. Outstanding wine w. power & elegance. 
qta manoellas flagship cuve field blend 30 varieties 120yo vines 20m 50 new oak tinta francisca unusually high proportion blenddeep ruby fantastic fragrance w tobacco leather stylish fruit black cherry layered blackberry w herbal notes developing w loganberry mulberrydry v rich pure w cherry raspberry loganberry blueberry black cherry generous fine tannins acid high alc discreet oak huge finish outstanding wine w power elegance


In [16]:
#Find the longest review in unique_reviews
max_len = 0
for x in unique_reviews_no_punctuation_no_stop_word:
    if len(x.split()) > max_len:
        max_len = len(x.split())
print(max_len)

69


In [17]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from nltk.tokenize import word_tokenize

# preproces the documents, and create TaggedDocuments
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,
               doc in enumerate(unique_reviews_no_punctuation_no_stop_word)]

  "class": algorithms.Blowfish,


In [18]:
# train the Doc2vec model
model = Doc2Vec(vector_size=100,
                min_count=5, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [19]:
# save the model
model.save("./data/doc2vec.model")

In [20]:
# load the model
model= Doc2Vec.load("./data/doc2vec.model")

In [21]:
cleaned_reviews = reviews.apply(clean_review_for_doc2vec)
print(len(cleaned_reviews))

9019


In [22]:
# get the document vectors
document_vectors = [model.infer_vector(
    word_tokenize(doc)) for doc in cleaned_reviews]

In [23]:
#convert document vectors to dataframe
document_vectors_df = pd.DataFrame(document_vectors, columns=['doc2vec'+str(i) for i in range(len(document_vectors[0]))])
print(document_vectors_df.shape)

#find the min of each column
print(min(document_vectors_df.min()))
print(max(document_vectors_df.max()))

(9019, 100)
-2.999178886413574
2.788770914077759


In [24]:
#get the wineID and review columns from pertinent_wine_ratings
key = pertinent_ratings_non_null[['WineID', 'Vintage']]
print(key.shape)
#concatenate key and document_vectors_df vertically
wine_with_only_text_review = pd.concat([key.reset_index(drop=True),document_vectors_df.reset_index(drop=True)], axis=1)
print(wine_with_only_text_review.shape)


(9019, 2)
(9019, 102)


In [25]:
#aggregate the document vectors by wineID and vintage -> average
aggregated_doc_vector = wine_with_only_text_review.groupby(['WineID', 'Vintage']).mean().reset_index()

aggregated_doc_vector.head()

Unnamed: 0,WineID,Vintage,doc2vec0,doc2vec1,doc2vec2,doc2vec3,doc2vec4,doc2vec5,doc2vec6,doc2vec7,...,doc2vec90,doc2vec91,doc2vec92,doc2vec93,doc2vec94,doc2vec95,doc2vec96,doc2vec97,doc2vec98,doc2vec99
0,102356,2018,0.29022,-0.011338,-0.189252,-0.05782,0.153639,-0.122259,0.11335,0.38834,...,0.331812,0.201766,0.190081,0.071151,0.321292,0.101718,-0.149797,-0.185488,-0.119952,-0.275387
1,106708,2003,-0.089383,0.054382,0.062755,-0.179166,0.044109,-0.455499,-0.143945,0.204784,...,0.4961,0.151388,0.166101,-0.242069,0.141854,0.108114,0.255348,0.294401,-0.062213,-0.166894
2,106708,2005,-0.033086,-0.065051,0.019677,-0.084368,0.150466,-0.355369,-0.059719,0.38141,...,0.421397,0.141658,0.148582,-0.049243,0.229048,0.177663,0.361918,0.166308,-0.115154,-0.123542
3,106791,2020,-0.011586,-0.204684,-0.081152,0.020905,0.067649,-0.161948,0.049679,0.308426,...,0.422729,0.132614,0.181151,-0.063518,-0.040888,0.092357,0.174423,0.052632,-0.240256,-0.030956
4,107349,2012,0.185639,-0.085105,-0.110855,-0.1194,-0.046668,-0.165694,-0.122176,0.299707,...,0.218933,0.043578,0.156835,0.005912,0.009032,0.091719,-0.022649,-0.12033,0.051879,-0.147983


In [26]:
print(aggregated_doc_vector.shape)

(451, 102)


# Comparison models

Let's separate the data that can be calculate with a distance from the others

In [27]:
data_to_normalize = pertinent_wine_ratings_with_weather.drop(['RegionID', 'MinRating', 'MaxRating','Type','CountRating', 'Review'], axis=1)
data_to_normalize.dropna(inplace=True)
data_to_normalize.drop_duplicates(inplace=True)
print(data_to_normalize.shape) 

(580596, 36)


In [28]:
print(data_to_normalize.columns)

Index(['WineID', 'Vintage', 'AverageRating', 'WineName', 'Elaborate', 'ABV',
       'Body', 'Acidity', 'avg_temperature_q1', 'avg_sunshine_duration_q1',
       'avg_precipitation_q1', 'avg_rain_q1', 'avg_humidity_q1',
       'avg_soil_temperature_q1', 'avg_soil_moisture_q1', 'avg_temperature_q2',
       'avg_sunshine_duration_q2', 'avg_precipitation_q2', 'avg_rain_q2',
       'avg_humidity_q2', 'avg_soil_temperature_q2', 'avg_soil_moisture_q2',
       'avg_temperature_q3', 'avg_sunshine_duration_q3',
       'avg_precipitation_q3', 'avg_rain_q3', 'avg_humidity_q3',
       'avg_soil_temperature_q3', 'avg_soil_moisture_q3', 'avg_temperature_q4',
       'avg_sunshine_duration_q4', 'avg_precipitation_q4', 'avg_rain_q4',
       'avg_humidity_q4', 'avg_soil_temperature_q4', 'avg_soil_moisture_q4'],
      dtype='object')


### Encode categorical data

In [29]:
#create the dictionary for the categorical variables

acid = data_to_normalize['Acidity'].unique()
print(acid)

body = data_to_normalize['Body'].unique()
print(body)

elaborate = data_to_normalize['Elaborate'].unique()
print(elaborate)

acid_dict = {'Low': 1, 'Medium': 2, 'High': 3}
body_dict = {'Very light-bodied': 1, 'Light-bodied': 2, 'Medium-bodied': 3, 'Full-bodied': 4, 'Very full-bodied': 5}
elaborat_dict = {'Varietal/100%':1, 'Varietal/>75%': 2, 'Assemblage/Blend' : 3,
                 'Assemblage/Meritage Red Blend':4, 'Assemblage/Meritage White Blend': 5, 
                 'Assemblage/Rhône Red Blend':6, 'Assemblage/Bordeaux Red Blend':7 , 
                 'Assemblage/Bourgogne Red Blend': 8, 'Assemblage/Bourgogne White Blend': 9, 'Assemblage/Portuguese White Blend': 10, 
                 'Assemblage/Portuguese Red Blend': 11, 'Assemblage/Port Blend': 12,  
                 'Assemblage/Provence Rosé Blend' :13, 'Assemblage/Champagne Blend': 14, 'Assemblage/Valpolicella Red Blend': 15,
                 'Assemblage/Chianti Red Blend': 16, 'Assemblage/Tuscan Red Blend': 17, 'Assemblage/Rioja Red Blend': 18, 
                 'Assemblage/Rioja White Blend' : 19, 'Assemblage/Priorat Red Blend': 20,
                 'Assemblage/Cava Blend': 21, 'Assemblage/Soave White Blend': 22
                 }

['High' 'Medium' 'Low']
['Medium-bodied' 'Full-bodied' 'Light-bodied' 'Very light-bodied'
 'Very full-bodied']
['Varietal/100%' 'Assemblage/Bordeaux Red Blend' 'Assemblage/Blend'
 'Assemblage/Portuguese Red Blend' 'Assemblage/Portuguese White Blend'
 'Varietal/>75%' 'Assemblage/Port Blend' 'Assemblage/Champagne Blend'
 'Assemblage/Rhône Red Blend' 'Assemblage/Tuscan Red Blend'
 'Assemblage/Chianti Red Blend' 'Assemblage/Valpolicella Red Blend'
 'Assemblage/Bourgogne Red Blend' 'Assemblage/Meritage Red Blend'
 'Assemblage/Provence Rosé Blend' 'Assemblage/Soave White Blend'
 'Assemblage/Rioja Red Blend' 'Assemblage/Rioja White Blend'
 'Assemblage/Cava Blend' 'Assemblage/Priorat Red Blend'
 'Assemblage/Bourgogne White Blend' 'Assemblage/Meritage White Blend']


In [30]:
## Label Encoding
data_to_normalize['Body'] = data_to_normalize['Body'].map(body_dict)
data_to_normalize['Acidity'] = data_to_normalize['Acidity'].map(acid_dict)
data_to_normalize['Elaborate'] = data_to_normalize['Elaborate'].map(elaborat_dict)

data_to_normalize.head()

Unnamed: 0,WineID,Vintage,AverageRating,WineName,Elaborate,ABV,Body,Acidity,avg_temperature_q1,avg_sunshine_duration_q1,...,avg_humidity_q3,avg_soil_temperature_q3,avg_soil_moisture_q3,avg_temperature_q4,avg_sunshine_duration_q4,avg_precipitation_q4,avg_rain_q4,avg_humidity_q4,avg_soil_temperature_q4,avg_soil_moisture_q4
0,100001,2008,3.630952,Espumante Moscatel,1,7.5,3,3,25.573626,30360.556484,...,37.162138,27.622464,0.343988,27.552174,35326.731848,5.734783,5.734783,70.893116,28.655389,0.375222
1,100005,2008,2.946429,Maison de Ville Cabernet-Merlot,7,11.0,4,2,25.573626,30360.556484,...,37.162138,27.622464,0.343988,27.552174,35326.731848,5.734783,5.734783,70.893116,28.655389,0.375222
2,100006,2008,3.378788,Reserva Cabernet Sauvignon,1,12.5,4,3,25.573626,30360.556484,...,37.162138,27.622464,0.343988,27.552174,35326.731848,5.734783,5.734783,70.893116,28.655389,0.375222
3,100016,2008,2.713415,Acquasantiera Tinto Seco Fino,3,11.0,3,1,25.573626,30360.556484,...,37.162138,27.622464,0.343988,27.552174,35326.731848,5.734783,5.734783,70.893116,28.655389,0.375222
4,100025,2008,3.5,Espumante Método Charmat Brut,3,12.0,3,3,25.573626,30360.556484,...,37.162138,27.622464,0.343988,27.552174,35326.731848,5.734783,5.734783,70.893116,28.655389,0.375222


### Normalize the data

In [31]:
# The wine_dataset must contain the following columns: ['WineID','Vintage', 'WineName']. 
# The other columns are the features.
def normalize_wine_data(wine_dataset):
    ## Select Features and Target
    features = wine_dataset.drop(['WineID','Vintage', 'WineName'], axis=1)
    targets = wine_dataset[['WineID','Vintage', 'WineName']]
    ## Normalize Features
    for column in features.columns:
        features[column] = (features[column] - features[column].min()) / (features[column].max() - features[column].min())
    ## Return normalized dataset
    normalized_df = pd.concat([targets.reset_index(drop=True),features.reset_index(drop=True)], axis=1)
    return normalized_df

In [32]:
# Normalize the data
normalized_wine_data = normalize_wine_data(data_to_normalize)
print(normalized_wine_data.shape)
print(normalized_wine_data.head())

(580596, 36)
   WineID  Vintage                         WineName  AverageRating  Elaborate  \
0  100001     2008               Espumante Moscatel       0.657738   0.000000   
1  100005     2008  Maison de Ville Cabernet-Merlot       0.486607   0.285714   
2  100006     2008       Reserva Cabernet Sauvignon       0.594697   0.000000   
3  100016     2008    Acquasantiera Tinto Seco Fino       0.428354   0.095238   
4  100025     2008    Espumante Método Charmat Brut       0.625000   0.095238   

        ABV  Body  Acidity  avg_temperature_q1  avg_sunshine_duration_q1  ...  \
0  0.156250  0.50      1.0            0.895089                  0.564763  ...   
1  0.229167  0.75      0.5            0.895089                  0.564763  ...   
2  0.260417  0.75      1.0            0.895089                  0.564763  ...   
3  0.229167  0.50      0.0            0.895089                  0.564763  ...   
4  0.250000  0.50      1.0            0.895089                  0.564763  ...   

   avg_humidi

In [33]:
normalized_wine_name_data = normalized_wine_data[['WineID', 'Vintage', 'WineName']]
normalized_wine_calculate_data = normalized_wine_data.drop(['WineID', 'Vintage', 'WineName'], axis=1)
print(normalized_wine_calculate_data.shape)

(580596, 33)


In [101]:
normalized_wine_calculate_data[['Elaborate', 'ABV', 'Body', 'Acidity']] = normalized_wine_calculate_data[['Elaborate', 'ABV', 'Body', 'Acidity']] * WEIGTH_COMPOSITION_VS_WEATHER

### Now let's add Doc2Vec vector

In [34]:
wine_text_review_vec = aggregated_doc_vector.copy()
wine_name_data = wine_text_review_vec[['WineID', 'Vintage']]
wine_vec = wine_text_review_vec.drop(['WineID', 'Vintage'], axis=1)

print(wine_vec.shape)

(451, 100)


# Now, let's compare using KD Tree

In [85]:
wine_id = 142614
vintage = 2017

In [86]:
have_text_review = False
if ((normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)).any():
    reference_wine_composition_and_weather = normalized_wine_data.loc[(normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)] 
    if ((wine_text_review_vec['WineID'] == wine_id) & (wine_text_review_vec['Vintage'] == vintage)).any():
        print("Wine ID found wirh text review")
        have_text_review = True
        reference_wine_text_review = wine_text_review_vec.loc[(wine_text_review_vec['WineID'] == wine_id) & (wine_text_review_vec['Vintage'] == vintage)]
    else:
        print("Wine ID found with no text review")
else:
    print("Wine ID not found")

Wine ID found with no text review


In [87]:
input_wine_composition_and_weather = reference_wine_composition_and_weather.drop(['WineID','Vintage', 'WineName'], axis=1).to_numpy().reshape(1,-1)
print(input_wine_composition_and_weather)

if have_text_review:
    input_wine_text_review = reference_wine_text_review.drop(['WineID','Vintage'], axis=1).to_numpy().reshape(1,-1)
    print(input_wine_text_review.shape)

[[0.5945122  0.66666667 0.25       0.75       1.         0.54405385
  0.40569251 0.09012778 0.08766184 0.81232023 0.32473132 0.68001344
  0.59010407 0.81412296 0.24153938 0.26840835 0.67183994 0.53969611
  0.52174345 0.67435485 0.83549295 0.17830114 0.17830114 0.69440304
  0.63617212 0.51273096 0.55646113 0.39139931 0.12128472 0.1163311
  0.85341554 0.4286231  0.62489747]]


In [88]:
from sklearn.neighbors import KDTree
# Build the KD Tree
wine_composition_weather_tree = KDTree(normalized_wine_calculate_data, metric='euclidean')

if have_text_review:
    wine_text_review_tree = KDTree(wine_vec, metric='euclidean')

In [89]:
# Query the KD Tree
dist_composition_weather, ind_composition_weather = wine_composition_weather_tree.query(input_wine_composition_and_weather, k=len(normalized_wine_calculate_data))

if have_text_review:
    dist_text_review, ind_text_review = wine_text_review_tree.query(input_wine_text_review, k=len(wine_vec))

# Print the results
print(dist_composition_weather.shape)
print(ind_composition_weather)

if have_text_review:
    print(dist_text_review.shape)
    print(ind_text_review.shape)

(1, 580596)
[[370483 296005 384335 ... 492925 492926 496281]]


In [90]:
# create a dictionary with ind as key and dist as value
dict_composition_weather_wine = dict(zip(ind_composition_weather[0], dist_composition_weather[0]))
print(dict_composition_weather_wine[53405])

if have_text_review:
    dict_text_review_wine = dict(zip(ind_text_review[0], dist_text_review[0]))

1.2056353469036445


In [91]:
# merge dictionary with normalized_wine_data with the value of the key is the index of the row in normalized_wine_data
# and the value is the distance
normalized_wine_name_data['distance1'] = normalized_wine_data.index.map(dict_composition_weather_wine)
print(normalized_wine_name_data.head())

# find max of distance1
max_distance1 = normalized_wine_name_data['distance1'].max()
print(max_distance1)

   WineID  Vintage                         WineName  distance1
0  100001     2008               Espumante Moscatel   1.722410
1  100005     2008  Maison de Ville Cabernet-Merlot   1.689428
2  100006     2008       Reserva Cabernet Sauvignon   1.700446
3  100016     2008    Acquasantiera Tinto Seco Fino   1.965715
4  100025     2008    Espumante Método Charmat Brut   1.684317
2.476296459913074


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normalized_wine_name_data['distance1'] = normalized_wine_data.index.map(dict_composition_weather_wine)


In [92]:
if have_text_review:
    wine_name_data['distance2'] = wine_text_review_vec.index.map(dict_text_review_wine)
    print(wine_name_data.head())

    # find max of distance2
    max_distance2 = wine_name_data['distance2'].max()
    print(max_distance2)

In [93]:
# Merge the datasets based on 'Name' and 'Age'
merged_wine_data = pd.merge(normalized_wine_name_data, wine_name_data, on=['WineID', 'Vintage'],how='left')

if have_text_review:
    # Fill null values with 0 before adding 'ScoreDay1' and 'ScoreDay2'
    merged_wine_data['distance'] = merged_wine_data['distance1'].fillna(0) + merged_wine_data['distance2'].fillna(0)

    # Drop the redundant 'ScoreDay1' and 'ScoreDay2' columns if needed
    merged_wine_data = merged_wine_data.drop(['distance1', 'distance2'], axis=1)

In [94]:
print(merged_wine_data.shape)
print(merged_wine_data.head())

(580596, 4)
   WineID  Vintage                         WineName  distance1
0  100001     2008               Espumante Moscatel   1.722410
1  100005     2008  Maison de Ville Cabernet-Merlot   1.689428
2  100006     2008       Reserva Cabernet Sauvignon   1.700446
3  100016     2008    Acquasantiera Tinto Seco Fino   1.965715
4  100025     2008    Espumante Método Charmat Brut   1.684317


In [103]:
# Sort the DataFrame by the euclidean_distance column and display the 10 nearest wines
ref_wine_name = normalized_wine_data.loc[(normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)]['WineName'].values[0]
print("reference wine is "+ ref_wine_name + " in year " + str(vintage))
print("The 10 most similar wines are:")

if have_text_review:
    distance_column = 'distance'
else:
    distance_column = 'distance1'

sorted_df = merged_wine_data.sort_values(by=[distance_column])

top_10 = sorted_df.head(NUMBER_OF_WINES_TO_RECOMMEND+1)

for index, row in top_10.iterrows():
    print(row['WineName'] + " in year " + str(row['Vintage']) +" with ID " + str(row['WineID']) +" with distance " + str(row[distance_column]))

reference wine is Bardolino Classico in year 2017
The 10 most similar wines are:
Bardolino Classico in year 2017 with ID 142614 with distance 0.0
Bardolino Classico in year 2015 with ID 142614 with distance 0.25147150330818696
Bardolino Classico Il Torcolo in year 2011 with ID 148473 with distance 0.26373627238310493
Essere Bardolino in year 2011 with ID 142680 with distance 0.2637490941709859
Bardolino in year 2011 with ID 140692 with distance 0.26445598156897204
Frescaripa Bardolino Classico in year 2011 with ID 136901 with distance 0.26464826669248975
Bardolino Classico in year 2011 with ID 141128 with distance 0.2654915190272815
Bardolino Classico in year 2011 with ID 139109 with distance 0.265499043501791
Bardolino in year 2011 with ID 141941 with distance 0.26567088863038285
Bardolino in year 2011 with ID 143728 with distance 0.26567088863038285
Bardolino Classico in year 2011 with ID 137919 with distance 0.2656957914982054
