In [1]:
import pandas as pd
import numpy as np
import math

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# load data

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
#Drop 'id', 'twitter_handle','description', 'designation', 'title' since unnecessary for  the prediction process.
train_df = train_df.drop(['id', 'taster_twitter_handle', 'taster_name','title','region_2'], axis = 1)
test_df = test_df.drop(['id', 'index','taster_twitter_handle', 'taster_name', 'title', 'region_2'], axis = 1) 

train_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,PORTUGUESE RED,J. Portugal Ramos
1,France,"A solid, chunky wine, with a structure that is...",,88.041695,28.0,Bordeaux,Lalande de Pomerol,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier
2,France,"This is powerful and concentrated, with the hi...",,94.085021,130.0,Bordeaux,Saint-Émilion,BORDEAUX-STYLE RED BLEND,Château Figeac
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34.0,California,Santa Barbara County,PETITE SIRAH,Jaffurs
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24.0,Washington,Horse Heaven Hills,ROSé,Syncline


#  check the missing values inside our data

In [10]:
train_df.isna().sum()

country           47
description        0
designation    52266
points             0
price              0
province          47
region_1       28534
variety            1
winery             0
dtype: int64

# Fill the missing values with the most seen in the data

In [12]:
train_df.country.fillna('US', axis = 0, inplace =True)
train_df.province.fillna('California', axis = 0, inplace =True)
train_df.region_1.fillna('Unknow', axis = 0, inplace =True) 
train_df.variety.fillna('PINOT NOIR ', axis = 0, inplace =True)
train_df.designation.fillna('Unknow ', axis = 0, inplace =True)

In [13]:
train_df.isnull().sum()

country        0
description    0
designation    0
points         0
price          0
province       0
region_1       0
variety        0
winery         0
dtype: int64

In [14]:
test_df.isnull().sum()

country           17
description        0
designation    24824
points             0
price          83210
province          17
region_1       13883
variety            0
winery             0
dtype: int64

In [16]:
test_df.country.fillna('US', axis = 0, inplace =True)
test_df.province.fillna('California', axis = 0, inplace =True)
test_df.region_1.fillna('Unknow', axis = 0, inplace =True) 
#train_df.region_2.fillna('Central Coast', axis = 0, inplace =True)
#train_df.winery.fillna('Williams Selyem ', axis = 0, inplace =True)
test_df.variety.fillna('PINOT NOIR ', axis = 0, inplace =True)
test_df.designation.fillna('Unknow ', axis = 0, inplace =True)
#train_df.title.fillna('Gloria Ferrer NV Sonoma Brut Sparkling (Sonoma County)', axis = 0, inplace =True)

In [17]:
test_df.isnull().sum()

country            0
description        0
designation        0
points             0
price          83210
province           0
region_1           0
variety            0
winery             0
dtype: int64

# Use lableEncoder for the main part of the data processing

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
le = LabelEncoder()

In [20]:
train_df.dropna(inplace = True)

In [21]:
train_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,Unknow,PORTUGUESE RED,J. Portugal Ramos
1,France,"A solid, chunky wine, with a structure that is...",Unknow,88.041695,28.0,Bordeaux,Lalande de Pomerol,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier
2,France,"This is powerful and concentrated, with the hi...",Unknow,94.085021,130.0,Bordeaux,Saint-Émilion,BORDEAUX-STYLE RED BLEND,Château Figeac
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34.0,California,Santa Barbara County,PETITE SIRAH,Jaffurs
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24.0,Washington,Horse Heaven Hills,ROSé,Syncline


In [22]:
le_1 = LabelEncoder()
le_1.fit(train_df["country"].values.tolist()+ test_df["country"].values.tolist())
train_df['countrylabel'] = le_1.transform(train_df["country"].values)
test_df['countrylabel'] = le_1.transform(test_df["country"].values)

le_2 = LabelEncoder()
le_2.fit(train_df["designation"].values.tolist()+ test_df["designation"].values.tolist())
train_df['designationlabel'] = le_2.transform(train_df["designation"].values)
test_df['designationlabel'] = le_2.transform(test_df["designation"].values)

le_3 = LabelEncoder()
le_3.fit(train_df["description"].values.tolist()+ test_df["description"].values.tolist())
train_df['descriptionlabel'] = le_3.transform(train_df["description"].values)
test_df['descriptionlabel'] = le_3.transform(test_df["description"].values)

le_4 = LabelEncoder()
le_4.fit(train_df["province"].values.tolist()+ test_df["province"].values.tolist())
train_df['provincelabel'] = le_4.transform(train_df["province"].values)
test_df['provincelabel'] = le_4.transform(test_df["province"].values)

le_5 = LabelEncoder()
le_5.fit(train_df["region_1"].values.tolist()+ test_df["region_1"].values.tolist())
train_df['region_1label'] = le_5.transform(train_df["region_1"].values)
test_df['region_1label'] = le_5.transform(test_df["region_1"].values)

le_8 = LabelEncoder()
le_8.fit(train_df["variety"].values.tolist()+ test_df["variety"].values.tolist())
train_df['varietylabel'] = le_8.transform(train_df["variety"].values)
test_df['varietylabel'] = le_8.transform(test_df["variety"].values)

le_9 = LabelEncoder()
le_9.fit(train_df["winery"].values.tolist()+ test_df["winery"].values.tolist())
train_df['winerylabel'] = le_9.transform(train_df["winery"].values)
test_df['winerylabel'] = le_9.transform(test_df["winery"].values)


In [23]:
train_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery,countrylabel,designationlabel,descriptionlabel,provincelabel,region_1label,varietylabel,winerylabel
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,Unknow,PORTUGUESE RED,J. Portugal Ramos,34,42042,124840,8,1155,480,9470
1,France,"A solid, chunky wine, with a structure that is...",Unknow,88.041695,28.0,Bordeaux,Lalande de Pomerol,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier,15,40935,13510,38,582,62,4345
2,France,"This is powerful and concentrated, with the hi...",Unknow,94.085021,130.0,Bordeaux,Saint-Émilion,BORDEAUX-STYLE RED BLEND,Château Figeac,15,40935,131784,38,983,62,3564
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34.0,California,Santa Barbara County,PETITE SIRAH,Jaffurs,44,39632,86081,53,1014,443,9532
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24.0,Washington,Horse Heaven Hills,ROSé,Syncline,44,25325,142349,468,536,523,15765


# Drop the features that have been encoded

In [24]:
train_df = train_df.drop(['country','description', 'province','region_1', 'winery', 'designation', 'variety' ], axis = 1)

In [25]:
train_df.head()

Unnamed: 0,points,price,countrylabel,designationlabel,descriptionlabel,provincelabel,region_1label,varietylabel,winerylabel
0,88.870874,20.0,34,42042,124840,8,1155,480,9470
1,88.041695,28.0,15,40935,13510,38,582,62,4345
2,94.085021,130.0,15,40935,131784,38,983,62,3564
3,89.869797,34.0,44,39632,86081,53,1014,443,9532
4,89.017651,24.0,44,25325,142349,468,536,523,15765


In [27]:
train_df.isnull().sum()

points              0
price               0
countrylabel        0
designationlabel    0
descriptionlabel    0
provincelabel       0
region_1label       0
varietylabel        0
winerylabel         0
dtype: int64

In [28]:
test_df = test_df.drop(['country','description', 'province','region_1', 'winery', 'designation', 'variety'], axis = 1)
test_df.head()

Unnamed: 0,points,price,countrylabel,designationlabel,descriptionlabel,provincelabel,region_1label,varietylabel,winerylabel
0,95.036469,,44,386,100464,53,1091,82,14649
1,90.966405,,15,17995,32814,350,292,597,10714
2,88.964358,,44,13908,87100,53,1023,72,1925
3,89.960356,,44,40935,152214,53,1060,130,10716
4,88.075501,,44,40935,80531,468,300,465,3153


# create the NN that we are going to use

In [29]:
import tensorflow.keras as keras
from  tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Reshape
from  tensorflow.keras.models import Model

In [30]:
main_input_country = Input(shape=(1,), dtype='int32', name='main_input_country')

# This embedding layer will encode the input
x1 = Embedding(output_dim = 16, input_dim= len(le_1.classes_), input_length=1)(main_input_country)
x1 = Reshape((16, ))(x1)
x1 = Dense(256, activation='relu')(x1)


main_input_designation = Input(shape=(1,), dtype='int32', name='main_input_designation')
x2 = Embedding(output_dim = 16, input_dim= len(le_2.classes_), input_length=1)(main_input_designation)
x2 = Reshape((16, ))(x2)
x2 = Dense(256, activation='relu')(x2)


main_input_description = Input(shape=(1,), dtype='int32', name='main_input_description')
x3 = Embedding(output_dim = 16, input_dim= len(le_3.classes_), input_length=1)(main_input_description)
x3 = Reshape((16, ))(x3)
x3 = Dense(256, activation='relu')(x3)

main_input_province = Input(shape=(1,), dtype='int32', name='main_input_province')
x4 = Embedding(output_dim = 16, input_dim= len(le_4.classes_), input_length=1)(main_input_province)
x4 = Reshape((16, ))(x4)
x4 = Dense(256, activation='relu')(x4)


main_input_region_1 = Input(shape=(1,), dtype='int32', name='main_input_region_1')
x5 = Embedding(output_dim = 16, input_dim= len(le_5.classes_), input_length=1)(main_input_region_1)
x5 = Reshape((16, ))(x5)
x5 = Dense(256, activation='relu')(x5)

# main_input_region_2 = Input(shape=(1,), dtype='int32', name='main_input_region_2')
# x6 = Embedding(output_dim = 16, input_dim= len(le_6.classes_), input_length=1)(main_input_region_2)
# x6 = Flatten()(x6)
# x6 = Dense(256, activation='relu')(x6)

# main_input_title= Input(shape=(1,), dtype='int32', name='main_input_title')
# x7 = Embedding(output_dim = 16, input_dim= len(le_7.classes_), input_length=1)(main_input_title)
# x7 = Flatten()(x7)
# x7 = Dense(256, activation='relu')(x7)


main_input_variety = Input(shape=(1,), dtype='int32', name='main_input_variety')
x8= Embedding(output_dim = 16, input_dim= len(le_8.classes_), input_length=1)(main_input_variety)
x8 = Reshape((16, ))(x8)
x8 = Dense(256, activation='relu')(x8) 


main_input_winery = Input(shape=(1,), dtype='int32', name='main_input_winery')
x9 = Embedding(output_dim = 16, input_dim= len(le_9.classes_), input_length=1)(main_input_winery)
x9 = Reshape((16, ))(x9)
x9 = Dense(256, activation='relu')(x9)



main_input_points= Input(shape=(1,), name='main_input_points')
x10 = Dense(256, activation='relu')(main_input_points)

# main_input_latent_description_0 = Input(shape=(1,), name='main_input_latent_description_0')
# x11 = Dense(16, activation='relu')(main_input_latent_description_0)

# main_input_latent_description_1 = Input(shape=(1,), name='main_input_latent_description_1')
# x12 = Dense(16, activation='relu')(main_input_latent_description_0)

# main_input_latent_description_2 = Input(shape=(1,), name='main_input_latent_description_2')
# x13= Dense(16, activation='relu')(main_input_latent_description_0)

# main_input_latent_description_3 = Input(shape=(1,), name='main_input_latent_description_3')
# x14 = Dense(16, activation='relu')(main_input_latent_description_3)


merged_vector = keras.layers.Concatenate()([x1, x2, x3, x4, x5, x8, x9, x10])
merged_vector = Dense(256, activation='relu')(merged_vector)
merged_vector = Dense(1024, activation='relu')(merged_vector)
output  = Dense(1, activation='linear')(merged_vector) 

In [31]:
predictions = Dense(1, activation='linear')(merged_vector)

# We define a trainable model linking the inputs to the predictions
model = Model(inputs=[ main_input_points,  main_input_country , main_input_designation, main_input_description, main_input_province, main_input_region_1, main_input_variety , main_input_winery ], outputs = predictions)

In [32]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input_country (InputLayer) [(None, 1)]          0                                            
__________________________________________________________________________________________________
main_input_designation (InputLa [(None, 1)]          0                                            
__________________________________________________________________________________________________
main_input_description (InputLa [(None, 1)]          0                                            
__________________________________________________________________________________________________
main_input_province (InputLayer [(None, 1)]          0                                            
______________________________________________________________________________________________

In [33]:
from tensorflow.keras import backend as K

In [34]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [35]:
model.compile(optimizer='adam', loss= root_mean_squared_error )

In [36]:
columns = train_df.columns.tolist()

In [37]:
columns.remove('price')
#columns.remove('winerylabel')

In [38]:
columns

['points',
 'countrylabel',
 'designationlabel',
 'descriptionlabel',
 'provincelabel',
 'region_1label',
 'varietylabel',
 'winerylabel']

# Train our model

In [39]:
model.fit(np.split(train_df[columns].values, 8, axis  = 1), train_df.price.values, epochs= 10, validation_split = 0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 140000 samples, validate on 35000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe66c55c898>

In [40]:
test_df.head()

Unnamed: 0,points,price,countrylabel,designationlabel,descriptionlabel,provincelabel,region_1label,varietylabel,winerylabel
0,95.036469,,44,386,100464,53,1091,82,14649
1,90.966405,,15,17995,32814,350,292,597,10714
2,88.964358,,44,13908,87100,53,1023,72,1925
3,89.960356,,44,40935,152214,53,1060,130,10716
4,88.075501,,44,40935,80531,468,300,465,3153


In [41]:
y_pred = model.predict(np.split(test_df[columns].values, 8, axis  = 1))

# load the test data for the test

In [44]:
test_df = pd.read_csv('test.csv')

In [45]:
test_df['price'] = y_pred

In [46]:
test_df[['id', 'price']].to_csv('Kaggle_6.csv', index=False)

# Make prediction

In [48]:
y_pred_train = model.predict(np.split(train_df[columns].values, 8, axis  = 1))

In [49]:
y_actual_train = train_df["price"].values

In [50]:
y_data = pd.DataFrame(dict(y_true=y_actual_train, y_pred=y_pred_train.reshape(-1,)))

In [51]:
y_data

Unnamed: 0,y_true,y_pred
0,20.0,23.990189
1,28.0,30.809565
2,130.0,138.105270
3,34.0,32.668510
4,24.0,23.984982
5,13.0,11.147862
6,35.0,36.705254
7,42.0,39.633930
8,38.0,39.374805
9,14.0,16.173489
