# Load dependency

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, IterativeImputerte

# Load data

In [4]:
coffee_data = '../data/arabica_data_cleaned.csv'

cf = pd.read_csv(coffee_data)
cf = cf.iloc[:, 1:] # ship the number column

In [5]:
cf.head()

Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,Region,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,oromia,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


In [7]:
feature_selected = [
    'Country.of.Origin', 'Variety', 
    'Processing.Method', 'Moisture', 
    'unit_of_measurement', 'Color', 
    'altitude_mean_meters', 'Total.Cup.Points'
]
cf = cf[feature_selected]
cf

Unnamed: 0,Country.of.Origin,Variety,Processing.Method,Moisture,unit_of_measurement,Color,altitude_mean_meters,Total.Cup.Points
0,Ethiopia,,Washed / Wet,0.12,m,Green,2075.00,90.58
1,Ethiopia,Other,Washed / Wet,0.12,m,Green,2075.00,89.92
2,Guatemala,Bourbon,,0.00,m,,1700.00,89.75
3,Ethiopia,,Natural / Dry,0.11,m,Green,2000.00,89.00
4,Ethiopia,Other,Washed / Wet,0.12,m,Green,2075.00,88.83
...,...,...,...,...,...,...,...,...
1306,Mexico,Bourbon,Washed / Wet,0.11,m,,900.00,68.33
1307,Haiti,Typica,Natural / Dry,0.14,m,Blue-Green,350.00,67.92
1308,Nicaragua,Caturra,Other,0.13,m,Green,1100.00,63.08
1309,Guatemala,Catuai,Washed / Wet,0.10,ft,Green,1417.32,59.83


In [8]:
cf.columns = cf.columns.str.replace('.', '_')
cf.columns = cf.columns.str.title()
cf

  cf.columns = cf.columns.str.replace('.', '_')


Unnamed: 0,Country_Of_Origin,Variety,Processing_Method,Moisture,Unit_Of_Measurement,Color,Altitude_Mean_Meters,Total_Cup_Points
0,Ethiopia,,Washed / Wet,0.12,m,Green,2075.00,90.58
1,Ethiopia,Other,Washed / Wet,0.12,m,Green,2075.00,89.92
2,Guatemala,Bourbon,,0.00,m,,1700.00,89.75
3,Ethiopia,,Natural / Dry,0.11,m,Green,2000.00,89.00
4,Ethiopia,Other,Washed / Wet,0.12,m,Green,2075.00,88.83
...,...,...,...,...,...,...,...,...
1306,Mexico,Bourbon,Washed / Wet,0.11,m,,900.00,68.33
1307,Haiti,Typica,Natural / Dry,0.14,m,Blue-Green,350.00,67.92
1308,Nicaragua,Caturra,Other,0.13,m,Green,1100.00,63.08
1309,Guatemala,Catuai,Washed / Wet,0.10,ft,Green,1417.32,59.83


In [9]:
## data cleaning for units
unit_to_convert = cf['Unit_Of_Measurement'].eq('ft')
cf.loc[unit_to_convert, ['Altitude_Mean_Meters']] /= 3.281
cf['Unit_Of_Measurement'].replace(to_replace='ft', value='m', inplace=True)
# cf.drop(columns='unit_of_measurement')

In [10]:
cf.drop(columns="Unit_Of_Measurement", inplace=True)
cf

Unnamed: 0,Country_Of_Origin,Variety,Processing_Method,Moisture,Color,Altitude_Mean_Meters,Total_Cup_Points
0,Ethiopia,,Washed / Wet,0.12,Green,2075.000000,90.58
1,Ethiopia,Other,Washed / Wet,0.12,Green,2075.000000,89.92
2,Guatemala,Bourbon,,0.00,,1700.000000,89.75
3,Ethiopia,,Natural / Dry,0.11,Green,2000.000000,89.00
4,Ethiopia,Other,Washed / Wet,0.12,Green,2075.000000,88.83
...,...,...,...,...,...,...,...
1306,Mexico,Bourbon,Washed / Wet,0.11,,900.000000,68.33
1307,Haiti,Typica,Natural / Dry,0.14,Blue-Green,350.000000,67.92
1308,Nicaragua,Caturra,Other,0.13,Green,1100.000000,63.08
1309,Guatemala,Catuai,Washed / Wet,0.10,Green,431.978055,59.83


# split and preprocessing

In [11]:
X = cf.iloc[:, cf.columns != "Total_Cup_Points"]
y = cf["Total_Cup_Points"]
seed = 42

X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_other, y_other, test_size=0.25, shuffle=True, random_state=seed)

In [12]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(786, 6)
(262, 6)
(263, 6)


In [15]:
oh_ftr_list = ['Country_Of_Origin', 'Variety',
               'Processing_Method', 'Color']
std_ftr_list = ["Moisture", "Altitude_Mean_Meters"]

categorical_transformer = Pipeline(
    steps=
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ]
)

numerical_transformer = Pipeline(
    steps=[
    ("scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, oh_ftr_list), 
        ('num', numerical_transformer, std_ftr_list)
    ]
)

X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)
X_val_prep = preprocessor.transform(X_val)
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['cat__Country_Of_Origin_Brazil', 'cat__Country_Of_Origin_Burundi',
       'cat__Country_Of_Origin_China', 'cat__Country_Of_Origin_Colombia',
       'cat__Country_Of_Origin_Costa Rica',
       'cat__Country_Of_Origin_El Salvador',
       'cat__Country_Of_Origin_Ethiopia',
       'cat__Country_Of_Origin_Guatemala', 'cat__Country_Of_Origin_Haiti',
       'cat__Country_Of_Origin_Honduras',
       'cat__Country_Of_Origin_Indonesia', 'cat__Country_Of_Origin_Japan',
       'cat__Country_Of_Origin_Kenya', 'cat__Country_Of_Origin_Laos',
       'cat__Country_Of_Origin_Malawi',
       'cat__Country_Of_Origin_Mauritius',
       'cat__Country_Of_Origin_Mexico', 'cat__Country_Of_Origin_Myanmar',
       'cat__Country_Of_Origin_Nicaragua',
       'cat__Country_Of_Origin_Panama', 'cat__Country_Of_Origin_Peru',
       'cat__Country_Of_Origin_Philippines',
       'cat__Country_Of_Origin_Rwanda', 'cat__Country_Of_Origin_Taiwan',
       'cat__Country_Of_Origin_Tanzania, United Republic Of',
       '

# model construction

In [16]:
import tensorflow as tf

In [20]:
learning_rate_schedule = tf.keras.optimizer

297     83.75
3       89.00
966     81.25
579     82.75
288     83.83
        ...  
249     84.00
1166    79.58
1148    79.75
1130    79.92
1230    78.33
Name: Total_Cup_Points, Length: 786, dtype: float64

In [61]:
model = tf.keras.Sequential(
    layers=[
        tf.keras.layers.Dense(units=128, activation=tf.keras.layers.LeakyReLU(alpha=0.2)),
        tf.keras.layers.BatchNormalization(),
#         tf.keras.layers.Dense(units=32, activation=tf.keras.layers.LeakyReLU(alpha=0.2)),
#         tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(units=1),
        tf.keras.layers.ActivityRegularization(l1=2.0),
        tf.keras.layers.Rescaling(scale=5, offset=82)
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-1),
    loss=tf.keras.losses.mean_absolute_error,
    metrics=tf.keras.metrics.mean_absolute_error
)

model.fit(
    x=X_train_prep,
    y=y_train,
    batch_size=5,
    epochs=100,
    validation_data=(X_val_prep, y_val),
)

Epoch 1/100


2022-12-01 00:52:25.744528: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-01 00:52:28.018594: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x356bb7a30>

In [59]:
X_train_prep[8]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        , -0.40198878, -0.01

In [27]:
model.predict(X_test_prep)



2022-11-30 22:51:28.951194: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
      