In [35]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.utils import to_categorical, plot_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler

In [4]:
#%pip install pydot

In [2]:
wine_reviews = pd.read_csv("data/wine_reviews.csv", index_col=0)
wine_reviews.shape

(119924, 8)

In [3]:
wine_reviews.columns

Index(['country', 'description', 'points', 'price', 'province', 'taster_name',
       'variety', 'year'],
      dtype='object')

In [3]:
wine_reviews.variety.fillna("NA", inplace=True)
wine_reviews.taster_name.fillna("NA", inplace=True)
wine_reviews.year.fillna(wine_reviews.year.min(),inplace=True)
wine_reviews.price.fillna(wine_reviews.price.median(), inplace=True)

In [37]:
wine_reviews.sample(5)

Unnamed: 0,country,description,points,price,province,taster_name,variety,year
115411,US,Fruity and forward with ripe blackberry and ch...,88,25.0,New York,Anna Lee C. Iijima,Red Blend,2010.0
109678,US,Menthol and pepper ride over dark cherry and s...,85,20.0,Virginia,Alexander Peartree,Bordeaux-style Red Blend,2013.0
53765,Germany,Sharp acidity offsets juicy white peach and ap...,90,19.0,Pfalz,Anna Lee C. Iijima,Riesling,2015.0
31885,US,"Concannon's Reserve Petite swirls in plum, dar...",87,40.0,California,Virginie Boone,Petite Sirah,2008.0
107120,Italy,A blend of 60% Sangiovese and 40% Cabernet Sau...,89,65.0,Tuscany,Kerin O’Keefe,Red Blend,2009.0


In [4]:
wine_reviews[wine_reviews.year.isna()]

Unnamed: 0,country,description,points,price,province,taster_name,variety,year


In [5]:
text_col = ['description']
price_col_name = 'price'
numerical_col = [price_col_name]
categorical_col = ['country','province','taster_name','variety','year']

X = wine_reviews[['description','price','country','province','taster_name','variety', 'year']].copy()
y = wine_reviews['points'].copy()

In [6]:
# Convert categorical features to one-hot encoding
for col in categorical_col:
    X[col] = pd.Categorical(X[col])
    X[col] = X[col].cat.codes

In [68]:
X.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
102242,"New leather, wild berry, violet, white pepper ...",58.0,22,286,9,382,71
127785,Pungent aromas of field greens and citrus lead...,19.0,15,195,14,511,60
51628,From a very good if not great Champagne vintag...,139.0,15,75,16,121,67
16233,Aromas include dry apricot and papaya with a h...,18.0,37,132,12,7,71
76913,Produced in partnership between Schulz Cellars...,18.0,40,51,19,223,72


In [9]:
X.year.min()

1821.0

In [73]:
X[X.year.isnull()]

Unnamed: 0,description,price,country,province,taster_name,variety,year


In [7]:
X['price'] = np.log(X['price'])
scaler = StandardScaler()
X[['price']] = scaler.fit_transform(X[['price']].values)

In [22]:
X.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
11208,Very terroir-driven from chalk soil in this gr...,2.384253,15,47,16,437,2011.0
26805,"Tangy and showing some good aging, this is a r...",-0.05216,31,294,16,446,1821.0
121534,"A new wine for Aegerter, this is ripe and full...",-0.838306,15,183,16,125,2013.0
121029,"Toffee, exotic spice, vanilla, raspberry and c...",0.73493,40,411,17,696,2012.0
43754,"This wine is dense, tough and dark—a black plu...",-0.05216,31,294,16,446,2010.0


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
country_len = wine_reviews.country.nunique()
province_len = wine_reviews.province.nunique()
tester_len = wine_reviews.taster_name.nunique()
variety_len = wine_reviews.variety.nunique()
year_len  = wine_reviews.year.nunique()

### Model without textual columns

In [9]:
print(f'country:{country_len}  province {province_len}  tester {tester_len}  variety {variety_len}  year {year_len}')

country:43  province 425  tester 20  variety 701  year 78


In [43]:
results_df = pd.DataFrame(columns=['name','classifier','alpha','reduction','train_r2','train_mse', 'test_r2', 'test_mse'])

def append_to_results(name, classifier, alpha, reduction, y_test, y_predict, y_train, y_train_predicted):
    test_r2 = r2_score(y_test, y_predicted), 
    test_mse = mean_squared_error(y_test, y_predicted)
    train_r2 = r2_score(y_train, y_train_predicted)
    train_mse = mean_squared_error(y_train, y_train_predicted)
    
    return results_df.append({'name':name, 'classifier':classifier, 'alpha': alpha, 'reduction':reduction, 'test_r2': test_r2, 'test_mse':test_mse, 'train_r2': train_r2, "train_mse": train_mse}, ignore_index=True)

def append_row_to_results(name, classifier, alpha, reduction, test_r2, test_mse, train_r2, train_mse):
    return results_df.append({'name':name, 'classifier':classifier, 'alpha': alpha, 'reduction':reduction, 'test_r2': test_r2, 'test_mse':test_mse, 'train_r2': train_r2, "train_mse": train_mse}, ignore_index=True)


## Model With Text

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6
from tensorflow.keras import layers

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
X_train.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
28171,This falls sbetween a rosé and an orange wine ...,0.034663,40,268,15,433,74
25907,"Tight and structured, this wine has minerality...",-0.659546,15,47,16,125,74
86353,Prominent yet not overpowering oak smoke aroma...,-0.143399,40,51,11,125,72
87206,This wine is made to be enjoyed young and frui...,-1.170851,31,108,16,447,73
91790,Just too raisiny and stewed for real satisfact...,-0.417344,40,51,14,325,64


### Try to concatenate Categories as OneHotEncoder and text as Embeddings

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, 
                             output_dim=128, 
                             embeddings_initializer="uniform", 
                             input_length=max_length, 
                             name="embedding_1") 

embedding

<keras.layers.core.embedding.Embedding at 0x1f0968fa130>

In [36]:
categorical_col = ['country','province','taster_name','variety','year']

ct = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown="ignore"), categorical_col)
], remainder='passthrough')

In [15]:
max_vocab_length = 10000 
max_length = 50 # 

text_vectorizer = layers.TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [62]:
X_test_categories = X_test.drop('description', axis=1)
X_train_categories = X_train.drop('description', axis=1)
X_test_cats = ct.transform(X_test_categories)
X_train_cats = ct.transform(X_train_categories)

X_test_cats = X_test_cats.toarray()
X_train_cats = X_train_cats.toarray()

In [63]:
X_test_cats.shape, X_train_cats.shape

((23985, 1220), (95939, 1220))

In [27]:
# Define the graph model
# Categorical inputs
# country

In [136]:
#text
text_input = Input(shape=(1,), dtype="string")
text_vector = text_vectorizer(text_input)
text_embedding = embedding(text_vector)
#text_flatten = Flatten()(text_embedding)
text_embedding = layers.LSTM(64)(text_embedding)

cat_input = Input(shape=(1220,), dtype="float32")
# Numerical inputs
#num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([text_embedding, cat_input])
#concatenated = Concatenate()([flat_embed_country, flat_embed_province, flat_embed_tester, flat_embed_variety, flat_embed_year, num_input])


# Dense layers for classification
x = Dense(16, activation='relu')(concatenated)
x = Dense(8, activation='relu')(x)
outputs = Dense(1, activation="linear")(x)

# Create the model
#model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, text_input, num_input], outputs=outputs)
#model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, cat_input_year, num_input], outputs=outputs)

# Create the model
model2 = tf.keras.Model(inputs=[text_input, cat_input], outputs = outputs)

In [138]:
model2.compile(loss=tf.keras.losses.mse,
                          optimizer=tf.keras.optimizers.Adam(),
                          metrics=['mse'])

In [None]:
train_X_data = [np.array(X_train['description']), X_train_cats]
#train_X_data = [X_train['country'], X_train['province'], X_train['taster_name'], X_train['variety'], X_train[[price_col_name]]]

#model.compile(optimizer=Adam(learning_rate=0.01),loss='mean_squared_error',metrics=[Accuracy()])

# Train the model
model2.fit([np.array(X_train['description']), X_train_cats], y_train,
          batch_size=64,
          epochs=10)

# Evaluate the model
loss, accuracy = model2.evaluate(train_X_data, y_train)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


## Try Veriety + Price model

In [102]:
#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=X.variety.nunique(), output_dim=8)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_variety, num_input])

# Dense layers for classification
x = Dense(32, activation='tanh')(concatenated)
x = Dense(16, activation='tanh')(x)
outputs = Dense(1, activation="linear")(x)

# Create the model
model = Model(inputs=[cat_input_variety, num_input], outputs=outputs)
model.summary()


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_43 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 embedding_18 (Embedding)       (None, 1, 8)         5608        ['input_43[0][0]']               
                                                                                                  
 flatten_19 (Flatten)           (None, 8)            0           ['embedding_18[0][0]']           
                                                                                                  
 input_44 (InputLayer)          [(None, 1)]          0           []                               
                                                                                            

In [81]:
data = [X_train['variety'], X_train['price']]
data

[125300    437
 110511    610
 87721     557
 19638     581
 8950      387
          ... 
 118734    125
 129926    557
 111067    214
 860        71
 15929     610
 Name: variety, Length: 95939, dtype: int16,
 125300    0.671731
 110511    4.397961
 87721    -0.659546
 19638    -1.433326
 8950     -0.344252
             ...   
 118734   -0.143399
 129926    0.633869
 111067   -1.583078
 860       0.780133
 15929     0.780133
 Name: price, Length: 95939, dtype: float64]

In [104]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(
    monitor='val_mean_squared_error',
    min_delta=0.01,
    patience=3,
    verbose=0,
    mode='min',
    baseline=None,
    restore_best_weights=True
)

In [103]:
X_train_small = X_train.head(100000) 

model.compile(optimizer=Adam(),
              loss='mean_squared_error',
              metrics=[Accuracy()])
data = [X_train_small['variety'],  X_train_small[numerical_col]]
# Train the model
model.fit( data, y_train.head(100000),
          batch_size=16,
          epochs=10,
          callbacks=[early_stopping_monitor])

# Evaluate the model
loss, accuracy = model.evaluate(data, y_train.head(100000))
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 5.674920558929443
Test Accuracy: 0.0


In [105]:
y_predict = model.predict([X_test['variety'],  X_test[numerical_col]])



In [106]:
print(f'r2 score: {r2_score(y_test, y_predict)}')
print(f'mse score:{mean_squared_error(y_test, y_predict)}')

r2 score: 0.3896874386426289
mse score:5.838069889308666


## Try Country + tester + Veriety + Price

In [123]:
#taster_name
cat_input_tester = Input(shape=(1,))
embedding_tester = Embedding(input_dim=tester_len, output_dim=5)(cat_input_tester)
flat_embed_tester = Flatten()(embedding_tester)

#country
cat_input_country = Input(shape=(1,))
embedding_country = Embedding(input_dim=country_len, output_dim=7)(cat_input_country)
flat_embed_country = Flatten()(embedding_country)

#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=X.variety.nunique(), output_dim=8)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_tester, flat_embed_country, flat_embed_variety, num_input])

# Dense layers for classification
x = Dense(32, activation='selu')(concatenated)
x = Dense(16, activation='tanh')(x)
outputs = Dense(1, activation="linear")(x)

# Create the model
model = Model(inputs=[cat_input_tester, cat_input_country, cat_input_variety, num_input], outputs=outputs)
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_66 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_67 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_68 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 embedding_35 (Embedding)       (None, 1, 5)         100         ['input_66[0][0]']               
                                                                                           

In [124]:

model.compile(optimizer=Adam(),
              loss='mean_squared_error',
              metrics=[Accuracy()])
data = [X_train['taster_name'], X_train['country'], X_train['variety'],  X_train[numerical_col]]
# Train the model
model.fit( data, y_train,
          batch_size=32,
          epochs=10,
          callbacks=[early_stopping_monitor])

# Evaluate the model
loss, accuracy = model.evaluate(data, y_train.head(100000))
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 5.168735027313232
Test Accuracy: 0.0


In [125]:
y_predict = model.predict([X_test['taster_name'], X_test['country'], X_test['variety'],  X_test[numerical_col]])
print(f'r2 score: {r2_score(y_test, y_predict)}')
print(f'mse score:{mean_squared_error(y_test, y_predict)}')

r2 score: 0.4403039317981632
mse score:5.353887450172057


## All Categories and Price

In [128]:
#province
cat_input_province = Input(shape=(1,))
embedding_province = Embedding(input_dim=province_len, output_dim=22)(cat_input_province)
flat_embed_province = Flatten()(embedding_province)

#year
cat_input_year = Input(shape=(1,))
embedding_year = Embedding(input_dim=year_len, output_dim=9)(cat_input_year)
flat_embed_year = Flatten()(embedding_year)

#taster_name
cat_input_tester = Input(shape=(1,))
embedding_tester = Embedding(input_dim=tester_len, output_dim=5)(cat_input_tester)
flat_embed_tester = Flatten()(embedding_tester)

#country
cat_input_country = Input(shape=(1,))
embedding_country = Embedding(input_dim=country_len, output_dim=7)(cat_input_country)
flat_embed_country = Flatten()(embedding_country)

#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=X.variety.nunique(), output_dim=8)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_province, flat_embed_year, flat_embed_tester, flat_embed_country, flat_embed_variety, num_input])

# Dense layers for classification
x = Dense(64, activation='tanh')(concatenated)
x = Dense(32, activation='selu')(x)
outputs = Dense(1, activation="linear")(x)

# Create the model
model = Model(inputs=[cat_input_province, cat_input_year, cat_input_tester, cat_input_country, cat_input_variety, num_input], outputs=outputs)
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_76 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_77 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_78 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_79 (InputLayer)          [(None, 1)]          0           []                               
                                                                                           

In [129]:
data = [X_train['province'],  X_train['year'],X_train['taster_name'], X_train['country'], X_train['variety'], X_train[['price']]]

model.compile(optimizer=Adam(),
              loss='mean_squared_error',
              metrics=[Accuracy()])

# Train the model
model.fit(data, y_train,
          batch_size=16,
          epochs=10,
          callbacks=[early_stopping_monitor])

# Evaluate the model
loss, accuracy = model.evaluate(data, y_train)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 4.889308452606201
Test Accuracy: 0.0


In [130]:
y_predict = model.predict([X_test['province'],  X_test['year'], X_test['taster_name'], X_test['country'], X_test['variety'],  X_test[numerical_col]])
print(f'r2 score: {r2_score(y_test, y_predict)}')
print(f'mse score:{mean_squared_error(y_test, y_predict)}')

r2 score: 0.45841018762273733
mse score:5.180688349202432


In [135]:
predicted_df1 = pd.DataFrame()
predicted_df1["true"] = y_test
predicted_df1["predicted"] = y_predict
predicted_df1['error'] = abs(predicted_df1["true"] - predicted_df1['predicted'])
predicted_df1.sort_values(by="error", ascending=False).head(10)


Unnamed: 0,true,predicted,error
16521,81,91.914276,10.914276
40928,83,93.628624,10.628624
109528,80,89.952042,9.952042
111758,99,89.216034,9.783966
116094,99,89.264656,9.735344
48895,98,88.606903,9.393097
56177,81,90.134567,9.134567
14733,81,90.134018,9.134018
111750,80,89.121658,9.121658
69200,94,84.882668,9.117332
