In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

In [None]:
# Untouched copy of the joined wine / aggregated-rating table, kept for reference.
original_wine_ratings = (
    pq.read_table('./data/join_wine_agg_rating.parquet')
    .to_pandas()
)
print(original_wine_ratings.shape)
original_wine_ratings.head()

In [None]:
# Working copy of the same parquet table; the cells below filter and transform this one.
wine_ratings = (
    pq.read_table('./data/join_wine_agg_rating.parquet')
    .to_pandas()
)

In [None]:
# Path of the CSV export, then the CSV itself loaded into a DataFrame.
file = "./data/winemag-data_first150k.csv"
data = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Show every row whose WineName is exactly "Sauvignon Blanc".
wine_ratings[wine_ratings['WineName'].eq("Sauvignon Blanc")]

In [None]:
# Count the rows whose 'WineID' column equals 100001
int((wine_ratings['WineID'] == 100001).sum())

In [None]:
# Keep only the wines with at least 5 ratings (CountRating strictly greater than 4).
pertinent_wine_ratings = wine_ratings.loc[wine_ratings['CountRating'].gt(4)]
print(pertinent_wine_ratings.shape)
pertinent_wine_ratings.head()

In [None]:
## Label Encoding

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Categorical columns that are replaced by 1-based integer codes.
categorical_columns = ['Type', 'Body', 'Acidity', 'Elaborate']

# Encode every column first, then attach them all at once with `assign`.
# `assign` returns a new DataFrame, so nothing is written into the filtered
# slice of `wine_ratings` (the previous `.loc[:, col] = ...` writes raised
# SettingWithCopyWarning), and each column is replaced outright so it takes the
# integer dtype of the codes instead of possibly staying `object` (recent
# pandas versions try to set `.loc[:, col]` in place, keeping the old dtype).
# The single encoder is re-fitted per column, so after this cell `le` holds the
# mapping of the last column ('Elaborate') only.
encoded_columns = {
    column: le.fit_transform(pertinent_wine_ratings[column]) + 1
    for column in categorical_columns
}
pertinent_wine_ratings = pertinent_wine_ratings.assign(**encoded_columns)

pertinent_wine_ratings.head()

In [None]:
## Select Features and Target
# X: everything except the identifier / name columns.
X = pertinent_wine_ratings.drop(columns=['WineID', 'Vintage', 'WineName', 'RegionID'])
# Y: the two columns that identify a wine (WineID + Vintage).
Y = pertinent_wine_ratings.loc[:, ['WineID', 'Vintage']]

In [None]:
# Preview the first five rows of the feature frame.
X.head(5)

In [None]:
# Preview the first five rows of the identifier frame.
Y.head(5)

In [None]:
## Normalize Features
# Min-max scale every column of X: (value - min) / (max - min).
for column in X.columns:
    column_min = X[column].min()
    column_range = X[column].max() - column_min
    if column_range == 0:
        # A constant column would give 0/0 = NaN for every row, and NaN then
        # propagates into any distance computed from it. Such a column carries
        # no information, so map it to 0.0 instead.
        X[column] = 0.0
    else:
        X[column] = (X[column] - column_min) / column_range

In [None]:
# Preview the first five rows of X after normalization.
X.head(5)

In [None]:
## Concatenate Features and Target for the comparison
# Prepend the identifier columns of Y to X. Any of those columns already present
# in X are dropped first, so re-running this cell does not duplicate them
# (a duplicated 'WineID' would make X['WineID'] a DataFrame and break the
# lookups below). On the first run the drop is a no-op.
X = pd.concat([Y, X.drop(columns=Y.columns, errors='ignore')], axis=1)

In [None]:
# Preview the first five rows of X with the identifier columns prepended.
X.head(5)

In [None]:
# Wine (ID + vintage) that every other wine is compared against.
wine_id = 100001
vintage = '2014'

# Build the match mask once and reuse it for both the existence check and the
# row selection.
is_reference = (X['WineID'] == wine_id) & (X['Vintage'] == vintage)

if is_reference.any():
    print("Wine ID found")
    # Row(s) of X matching both the wine ID and the vintage.
    reference_row = X.loc[is_reference]
    print(reference_row)
else:
    print("Wine ID not found")
    # Discard any reference_row left over from an earlier run of this cell so
    # the cells below fail loudly instead of silently comparing against a
    # stale wine.
    reference_row = None

# Diagnostics: show the Python type stored in 'Vintage' and whether the
# requested vintage occurs at all (helps to spot a str/int mismatch).
print(type(X['Vintage'].iloc[0]))

if (X['Vintage'] == vintage).any():
    print("Vintage found")


In [None]:
# Calculate the Euclidean distance between the reference wine and each of the other wines

from scipy.spatial import distance

# Feature columns that take part in the similarity comparison.
FEATURE_COLUMNS = ('Type', 'ABV', 'Body', 'Acidity', 'Elaborate', 'AverageRating')


def euclidean_distance(row, reference=None, features=FEATURE_COLUMNS):
    """Return the Euclidean distance between one wine and a reference wine.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        The wine to compare; must provide a value for every name in `features`.
    reference : DataFrame, optional
        Frame whose first row is the reference wine. When omitted, the
        notebook-level `reference_row` is used, so the one-argument form
        `X.apply(euclidean_distance, axis=1)` keeps working.
    features : sequence of str, optional
        Column names used for the comparison; defaults to FEATURE_COLUMNS.

    Returns
    -------
    float
        Euclidean distance between the two feature vectors (values are cast
        to float before the comparison).
    """
    if reference is None:
        # Fall back to the global selected earlier in the notebook.
        reference = reference_row

    wine = [float(row[name]) for name in features]
    # `.values[0]` picks the first row of the reference frame.
    reference_wine = [float(reference[name].values[0]) for name in features]

    return distance.euclidean(wine, reference_wine)


# One distance per wine: evaluate euclidean_distance on every row of X and
# store the result as a new column.
distances = X.apply(euclidean_distance, axis='columns')
X['euclidean_distance'] = distances

# Show the first rows together with their distances
print(X.head())

In [None]:
# Sort the DataFrame by the euclidean_distance column and display the 10 nearest wines

# Name of the reference wine, looked up in the un-normalized ratings table.
is_reference_rating = (pertinent_wine_ratings['WineID'] == wine_id) & (pertinent_wine_ratings['Vintage'] == vintage)
ref_wine_name = pertinent_wine_ratings.loc[is_reference_rating, 'WineName'].values[0]
print(f"reference wine is {ref_wine_name} in year {vintage}")
print("The 10 most similar wines are:")

# Leave the reference wine itself out (its distance to itself is 0, so it would
# otherwise be listed as its own nearest neighbour), then keep the 10 closest
# of the remaining wines.
is_other_wine = ~((X['WineID'] == wine_id) & (X['Vintage'] == vintage))
Z = X.loc[is_other_wine].sort_values(by=['euclidean_distance']).head(10)

# X only carries identifiers and normalized features, so the readable name is
# fetched from pertinent_wine_ratings by (WineID, Vintage).
for neighbour_id, neighbour_vintage in zip(Z['WineID'], Z['Vintage']):
    r = pertinent_wine_ratings.loc[
        (pertinent_wine_ratings['WineID'] == neighbour_id)
        & (pertinent_wine_ratings['Vintage'] == neighbour_vintage)
    ]
    print(f"{r['WineName'].values[0]} in year {r['Vintage'].values[0]}")