In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

In [None]:
PERTINENT_YEAR = 1960

# Read data and clean data

In [None]:
test= pq.read_table('./data/wine_xwine_vivino.parquet').to_pandas()
print(test.shape)

#save to csv
test[['WineID', 'Vintage', 'Review']].to_csv('./data/test_nb_review.csv', index=False)

In [None]:
wine_ratings = pq.read_table('./data/all_wine_xwine_vivino.parquet').to_pandas()
print(wine_ratings.shape)
wine_ratings.head()

In [None]:
#cast Vintage to int
wine_ratings['Vintage'] = wine_ratings['Vintage'].astype(int)
print(wine_ratings.columns)

In [None]:
#keep only where CountRating >= 4 and Vintage >= 1960
pertinent_wine_ratings = wine_ratings[wine_ratings['CountRating'] >=4]
pertinent_wine_ratings = pertinent_wine_ratings[pertinent_wine_ratings['Vintage'] >= PERTINENT_YEAR]
print(pertinent_wine_ratings.shape)

# Add weather features to the table

In [None]:
# Add weather data to the dataframe by joining with ../weather_data/agg_monthly.parquet
weather_data = pd.read_parquet('./data/agg_quarterly.parquet') 

# Take into account only the weather data from 1960
weather_data = weather_data[weather_data['year'] >= PERTINENT_YEAR]

# Take into account only columns: RegionID, year, avg_temperature, avg_sunshine_duration, avg_precipitation, avg_rain, avg_humidity, avg_soil_temperature, avg_soil_moisture
weather_data = weather_data[['RegionID', 'year', 'quarter', 'avg_temperature', 'avg_sunshine_duration', 'avg_precipitation', 'avg_rain', 'avg_humidity', 'avg_soil_temperature', 'avg_soil_moisture']]
# Create a new dataframe with the data for each region and year, and with a group of column for each quarter
for index, row in weather_data.iterrows():
    quarter = int(row['quarter'])
    weather_data.at[index, f'avg_temperature_q{quarter}'] = row['avg_temperature']
    weather_data.at[index, f'avg_sunshine_duration_q{quarter}'] = row['avg_sunshine_duration']
    weather_data.at[index, f'avg_precipitation_q{quarter}'] = row['avg_precipitation']
    weather_data.at[index, f'avg_rain_q{quarter}'] = row['avg_rain']
    weather_data.at[index, f'avg_humidity_q{quarter}'] = row['avg_humidity']
    weather_data.at[index, f'avg_soil_temperature_q{quarter}'] = row['avg_soil_temperature']
    weather_data.at[index, f'avg_soil_moisture_q{quarter}'] = row['avg_soil_moisture']
    
# Drop the columns that are not needed anymore
weather_data = weather_data.drop(['quarter', 'avg_temperature', 'avg_sunshine_duration', 'avg_precipitation', 'avg_rain', 'avg_humidity', 'avg_soil_temperature', 'avg_soil_moisture'], axis=1)

# Group the data by RegionID and year and take the max of each column
weather_data = weather_data.groupby(['RegionID', 'year']).max().reset_index()

In [None]:
# rename the column RegionID to Region
weather_data.head()

In [None]:
print(weather_data.shape)

In [None]:
#Join the weather data with the pertinent_wine_ratings dataframe
pertinent_wine_ratings_with_weather = pertinent_wine_ratings.merge(weather_data, left_on=['RegionID', 'Vintage'], right_on=['RegionID', 'year'])
# Drop the column year
pertinent_wine_ratings_with_weather = pertinent_wine_ratings_with_weather.drop(['year'], axis=1)

print(pertinent_wine_ratings_with_weather.shape)

In [None]:
print(pertinent_wine_ratings_with_weather.columns)

# Create and train a doc2vec model on the colume 'review'

In [None]:
#drop null values in review column
pertinent_ratings_non_null = pertinent_wine_ratings.dropna(subset=['Review'])
print(pertinent_ratings_non_null.shape)

In [None]:
reviews = pertinent_ratings_non_null['Review']
unique_reviews = reviews.unique()

#convert to pandas series
unique_reviews = pd.Series(unique_reviews)
print(len(unique_reviews))

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) | set(stopwords.words('french')) | set(stopwords.words('spanish'))
import string

def clean_review_for_doc2vec(review):
    #Clean the unique reviews series with lower case and remove punctuation
    review = review.lower()
    review = review.translate(str.maketrans('', '', string.punctuation))
    #Remove stop words in each review
    review = review.split()
    review = [word for word in review if not word in stop_words]
    review = ' '.join(review)
    #remove emojis in each review
    review = review.encode('ascii', 'ignore').decode('ascii')
    return review

unique_reviews_no_punctuation_no_stop_word = unique_reviews.apply(clean_review_for_doc2vec)

In [None]:
print(unique_reviews[1])
print(unique_reviews_no_punctuation_no_stop_word[1])

In [None]:
#Find the longest review in unique_reviews
max_len = 0
for x in unique_reviews_no_punctuation_no_stop_word:
    if len(x.split()) > max_len:
        max_len = len(x.split())
print(max_len)

In [None]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from nltk.tokenize import word_tokenize

# preproces the documents, and create TaggedDocuments
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                              tags=[str(i)]) for i,
               doc in enumerate(unique_reviews_no_punctuation_no_stop_word)]

In [None]:
# train the Doc2vec model
model = Doc2Vec(vector_size=100,
                min_count=5, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [None]:
# save the model
model.save("./data/doc2vec.model")

In [None]:
# load the model
model= Doc2Vec.load("./data/doc2vec.model")

In [None]:
cleaned_reviews = reviews.apply(clean_review_for_doc2vec)
print(len(cleaned_reviews))

In [None]:
# get the document vectors
document_vectors = [model.infer_vector(
    word_tokenize(doc)) for doc in cleaned_reviews]

In [None]:
#convert document vectors to dataframe
document_vectors_df = pd.DataFrame(document_vectors, columns=['doc2vec'+str(i) for i in range(len(document_vectors[0]))])
print(document_vectors_df.shape)

#find the min of each column
print(min(document_vectors_df.min()))
print(max(document_vectors_df.max()))

In [None]:
#get the wineID and review columns from pertinent_wine_ratings
key = pertinent_ratings_non_null[['WineID', 'Vintage']]
print(key.shape)
#concatenate key and document_vectors_df vertically
wine_with_only_text_review = pd.concat([key.reset_index(drop=True),document_vectors_df.reset_index(drop=True)], axis=1)
print(wine_with_only_text_review.shape)


In [None]:
#aggregate the document vectors by wineID and vintage -> average
aggregated_doc_vector = wine_with_only_text_review.groupby(['WineID', 'Vintage']).mean().reset_index()

aggregated_doc_vector.head()

In [None]:
print(aggregated_doc_vector.shape)

# Comparison models

Let's separate the data that can be calculate with a distance from the others

In [None]:
data_to_normalize = pertinent_wine_ratings_with_weather.drop(['RegionID', 'MinRating', 'MaxRating','Type','CountRating', 'Review'], axis=1)
data_to_normalize.dropna(inplace=True)
data_to_normalize.drop_duplicates(inplace=True)
print(data_to_normalize.shape) 

In [None]:
print(data_to_normalize.columns)

### Encode categorical data

In [None]:
#create the dictionary for the categorical variables

acid = data_to_normalize['Acidity'].unique()
print(acid)

body = data_to_normalize['Body'].unique()
print(body)

elaborate = data_to_normalize['Elaborate'].unique()
print(elaborate)

acid_dict = {'Low': 1, 'Medium': 2, 'High': 3}
body_dict = {'Very light-bodied': 1, 'Light-bodied': 2, 'Medium-bodied': 3, 'Full-bodied': 4, 'Very full-bodied': 5}
elaborat_dict = {'Varietal/100%':1, 'Varietal/>75%': 2, 'Assemblage/Blend' : 3,
                 'Assemblage/Meritage Red Blend':4, 'Assemblage/Meritage White Blend': 5, 
                 'Assemblage/Rhône Red Blend':6, 'Assemblage/Bordeaux Red Blend':7 , 
                 'Assemblage/Bourgogne Red Blend': 8, 'Assemblage/Bourgogne White Blend': 9, 'Assemblage/Portuguese White Blend': 10, 
                 'Assemblage/Portuguese Red Blend': 11, 'Assemblage/Port Blend': 12,  
                 'Assemblage/Provence Rosé Blend' :13, 'Assemblage/Champagne Blend': 14, 'Assemblage/Valpolicella Red Blend': 15,
                 'Assemblage/Chianti Red Blend': 16, 'Assemblage/Tuscan Red Blend': 17, 'Assemblage/Rioja Red Blend': 18, 
                 'Assemblage/Rioja White Blend' : 19, 'Assemblage/Priorat Red Blend': 20,
                 'Assemblage/Cava Blend': 21, 'Assemblage/Soave White Blend': 22
                 }

In [None]:
## Label Encoding
data_to_normalize['Body'] = data_to_normalize['Body'].map(body_dict)
data_to_normalize['Acidity'] = data_to_normalize['Acidity'].map(acid_dict)
data_to_normalize['Elaborate'] = data_to_normalize['Elaborate'].map(elaborat_dict)

data_to_normalize.head()

### Normalize the data

In [None]:
# The wine_dataset must contain the following columns: ['WineID','Vintage', 'WineName']. 
# The other columns are the features.
def normalize_wine_data(wine_dataset):
    ## Select Features and Target
    features = wine_dataset.drop(['WineID','Vintage', 'WineName'], axis=1)
    targets = wine_dataset[['WineID','Vintage', 'WineName']]
    ## Normalize Features
    for column in features.columns:
        features[column] = (features[column] - features[column].min()) / (features[column].max() - features[column].min())
    ## Return normalized dataset
    normalized_df = pd.concat([targets.reset_index(drop=True),features.reset_index(drop=True)], axis=1)
    return normalized_df

In [None]:
# Normalize the data
normalized_wine_data = normalize_wine_data(data_to_normalize)
print(normalized_wine_data.shape)
print(normalized_wine_data.head())

In [190]:
normalized_wine_name_data = normalized_wine_data[['WineID', 'Vintage', 'WineName']]
normalized_wine_calculate_data = normalized_wine_data.drop(['WineID', 'Vintage', 'WineName'], axis=1)
print(normalized_wine_calculate_data.shape)

(580596, 33)


### Now let's add Doc2Vec vector

In [191]:
wine_text_review_vec = aggregated_doc_vector.copy()
wine_name_data = wine_text_review_vec[['WineID', 'Vintage']]
wine_vec = wine_text_review_vec.drop(['WineID', 'Vintage'], axis=1)

print(wine_vec.shape)

(451, 100)


# Now, let's compare using KD Tree

In [None]:
wine_id = 102356
vintage = 2016

In [None]:
if ((normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)).any():
    reference_wine_composition_and_weather = normalized_wine_data.loc[(normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)] 
    if ((wine_text_review_vec['WineID'] == wine_id) & (wine_text_review_vec['Vintage'] == vintage)).any():
        print("Wine ID found")
        reference_wine_text_review = wine_text_review_vec.loc[(wine_text_review_vec['WineID'] == wine_id) & (wine_text_review_vec['Vintage'] == vintage)]
    else:
        print("Wine ID found with no text review")
else:
    print("Wine ID not found")

In [None]:
input_wine_composition_and_weather = reference_wine_composition_and_weather.drop(['WineID','Vintage', 'WineName'], axis=1).to_numpy().reshape(1,-1)
print(input_wine_composition_and_weather)

input_wine_text_review = reference_wine_text_review.drop(['WineID','Vintage'], axis=1).to_numpy().reshape(1,-1)
print(input_wine_text_review.shape)

In [None]:
from sklearn.neighbors import KDTree
# Build the KD Tree
wine_composition_weather_tree = KDTree(normalized_wine_calculate_data, metric='euclidean')
wine_text_review_tree = KDTree(wine_vec, metric='euclidean')

In [None]:
# Query the KD Tree
dist_composition_weather, ind_composition_weather = wine_composition_weather_tree.query(input_wine_composition_and_weather, k=len(normalized_wine_calculate_data))
dist_text_review, ind_text_review = wine_text_review_tree.query(input_wine_text_review, k=len(wine_vec))

# Print the results
print(dist_composition_weather.shape)
print(ind_composition_weather)
print(dist_text_review.shape)
print(ind_text_review.shape)

In [None]:
# create a dictionary with ind as key and dist as value
dict_composition_weather_wine = dict(zip(ind_composition_weather[0], dist_composition_weather[0]))
print(dict_composition_weather_wine[53405])

dict_text_review_wine = dict(zip(ind_text_review[0], dist_text_review[0]))

In [None]:
# merge dictionary with normalized_wine_data with the value of the key is the index of the row in normalized_wine_data
# and the value is the distance
normalized_wine_name_data['distance1'] = normalized_wine_data.index.map(dict_composition_weather_wine)
print(normalized_wine_name_data.head())

# find max of distance1
max_distance1 = normalized_wine_name_data['distance1'].max()
print(max_distance1)

In [None]:
wine_name_data['distance2'] = wine_text_review_vec.index.map(dict_text_review_wine)
print(wine_name_data.head())

# find max of distance2
max_distance2 = wine_name_data['distance2'].max()
print(max_distance2)

In [None]:
# Merge the datasets based on 'Name' and 'Age'
merged_wine_data = pd.merge(normalized_wine_name_data, wine_name_data, on=['WineID', 'Vintage'],how='left')

# Fill null values with 0 before adding 'ScoreDay1' and 'ScoreDay2'
merged_wine_data['distance'] = merged_wine_data['distance1'].fillna(0) + merged_wine_data['distance2'].fillna(0)

# Drop the redundant 'ScoreDay1' and 'ScoreDay2' columns if needed
merged_wine_data = merged_wine_data.drop(['distance1', 'distance2'], axis=1)

In [None]:
print(merged_wine_data.shape)
print(merged_wine_data.head())


In [192]:
# Sort the DataFrame by the euclidean_distance column and display the 10 nearest wines
ref_wine_name = normalized_wine_data.loc[(normalized_wine_data['WineID'] == wine_id) & (normalized_wine_data['Vintage'] == vintage)]['WineName'].values[0]
print("reference wine is "+ ref_wine_name + " in year " + str(vintage))
print("The 10 most similar wines are:")

sorted_df = merged_wine_data.sort_values(by=['distance'])
top_10 = sorted_df.head(11)

for index, row in top_10.iterrows():
    print(row['WineName'] + " in year " + str(row['Vintage']) + " with distance " + str(row['distance']))

reference wine is Douro Quinta da Manoella VV Tinto in year 2016
The 10 most similar wines are:
Vinhas Velhas Limited Release Touriga Nacional in year 2018 with distance 0.0
Douro Quinta da Manoella VV Tinto in year 2018 with distance 0.0
Douro in year 2018 with distance 0.0029761904761904656
Reserva Vinhas Velhas in year 2018 with distance 0.004999999999999893
Touriga Nacional Reserva in year 2018 with distance 0.011904761904761862
Reserva in year 2018 with distance 0.014731391274719766
Douro Reserva Red in year 2018 with distance 0.015625
Guyot Tinto in year 2018 with distance 0.017857142857142794
Syrah in year 2018 with distance 0.017857142857142794
Altitude in year 2018 with distance 0.01877891289304159
Reserva Touriga Nacional in year 2018 with distance 0.01877891289304159
