In [100]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# 1. Loading the data

In [85]:
df = pd.read_csv('avocado.csv')
df

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,8,2018-01-28,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


---

# 2. Preprocessing

## 2.1. Handling missing data

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    18249 non-null  int64  
 1   Date          18249 non-null  object 
 2   AveragePrice  18249 non-null  float64
 3   Total Volume  18249 non-null  float64
 4   4046          18249 non-null  float64
 5   4225          18249 non-null  float64
 6   4770          18249 non-null  float64
 7   Total Bags    18249 non-null  float64
 8   Small Bags    18249 non-null  float64
 9   Large Bags    18249 non-null  float64
 10  XLarge Bags   18249 non-null  float64
 11  type          18249 non-null  object 
 12  year          18249 non-null  int64  
 13  region        18249 non-null  object 
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB


## 2.2. Extracting the target

In [87]:
target = df.pop('AveragePrice')
target

0        1.33
1        1.35
2        0.93
3        1.08
4        1.28
         ... 
18244    1.63
18245    1.71
18246    1.87
18247    1.93
18248    1.62
Name: AveragePrice, Length: 18249, dtype: float64

# 2.3. Removing unnecessary columns

In [88]:
df.drop(['Date', 'Unnamed: 0'], axis=1, inplace=True)

## 2.4. Splitting the data

In [89]:
train_data, test_data = model_selection.train_test_split(df, train_size=0.6, random_state=3)
train_data

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
9042,99982.71,23281.10,29539.62,544.24,46617.75,22506.27,23986.48,125.00,conventional,2018,Spokane
2730,5617752.22,2686203.38,1907834.16,123261.81,900452.87,719727.54,180618.10,107.23,conventional,2015,West
10406,2141.08,1251.61,48.24,0.00,841.23,827.49,13.74,0.00,organic,2015,MiamiFtLauderdale
9250,13132.64,3946.70,4109.96,836.42,4239.56,4225.93,13.63,0.00,organic,2015,BaltimoreWashington
15559,208558.03,3639.54,80400.82,4.82,124204.92,72148.53,52056.39,0.00,organic,2017,GreatLakes
...,...,...,...,...,...,...,...,...,...,...,...
6400,129820.93,1645.17,64057.69,21287.24,42830.83,26704.25,7110.17,9016.41,conventional,2017,GrandRapids
15288,9058.74,525.02,1736.52,0.00,6797.20,4338.58,2458.62,0.00,organic,2017,Columbus
11513,64156.46,33408.72,10410.41,0.00,20337.33,19943.33,394.00,0.00,organic,2015,SouthCentral
1688,372062.30,15106.58,233808.22,1885.79,121261.71,109196.16,11757.42,308.13,conventional,2015,Philadelphia


## 2.5. Handling categorical attributes

In [90]:
train_data_types = pd.get_dummies(train_data.type, prefix='type')
train_data_regions = pd.get_dummies(train_data.region, prefix='region')
test_data_types = pd.get_dummies(test_data.type, prefix='type')
test_data_regions = pd.get_dummies(test_data.region, prefix='region')

train_data = train_data.drop(['type', 'region'], axis=1)
test_data = test_data.drop(['type', 'region'], axis=1)

train_data_regions

Unnamed: 0,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,region_Chicago,region_CincinnatiDayton,...,region_SouthCarolina,region_SouthCentral,region_Southeast,region_Spokane,region_StLouis,region_Syracuse,region_Tampa,region_TotalUS,region_West,region_WestTexNewMexico
9042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10406,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9250,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11513,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.6. Standardization of the cardinal data

In [91]:
scaler = StandardScaler()

train_data_std = pd.DataFrame(scaler.fit_transform(train_data.values),
                             columns=train_data.columns, index=train_data.index)
test_data_std = pd.DataFrame(scaler.fit_transform(test_data.values),
                             columns=test_data.columns, index=test_data.index)
test_data_std

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
2258,-0.121311,-0.109471,-0.102850,-0.099151,-0.148586,-0.121598,-0.216404,-0.168030,-1.203683
11669,-0.237662,-0.227029,-0.235299,-0.209685,-0.233556,-0.235291,-0.216534,-0.168030,-1.203683
14336,-0.222923,-0.219725,-0.226886,-0.206004,-0.202337,-0.223764,-0.127049,-0.168030,-0.140667
8514,-0.215345,-0.197564,-0.234835,-0.128278,-0.202269,-0.205076,-0.182071,-0.167635,1.985363
8545,-0.187751,-0.217643,-0.159689,-0.208940,-0.163992,-0.217274,0.003478,-0.139576,1.985363
...,...,...,...,...,...,...,...,...,...
15556,-0.203551,-0.224927,-0.213363,-0.209613,-0.145251,-0.129227,-0.180644,-0.168030,0.922348
7621,-0.149925,-0.171453,-0.116866,-0.110890,-0.152415,-0.128299,-0.211744,-0.168030,0.922348
975,0.061197,0.147528,0.134510,-0.168688,-0.115053,-0.083793,-0.195466,-0.168030,-1.203683
3421,0.077591,0.256071,-0.068215,0.838448,-0.055498,-0.014901,-0.163467,-0.167849,-0.140667


## 2.7. Final stage

In [101]:
train_X = pd.concat([train_data_std, train_data_types, train_data_regions], axis=1)
test_X = pd.concat([test_data_std, test_data_types, test_data_regions], axis=1)

train_X

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year,type_conventional,...,region_SouthCarolina,region_SouthCentral,region_Southeast,region_Spokane,region_StLouis,region_Syracuse,region_Tampa,region_TotalUS,region_West,region_WestTexNewMexico
9042,-0.222658,-0.215822,-0.226697,-0.209709,-0.200807,-0.219695,-0.121985,-0.175679,1.960883,1,...,0,0,0,1,0,0,0,0,0,0
2730,1.470487,1.979281,1.422164,0.974945,0.731874,0.775356,0.584128,-0.176781,-1.233222,1,...,0,0,0,0,0,0,0,0,1,0
10406,-0.252681,-0.233982,-0.252586,-0.214963,-0.250811,-0.250634,-0.230056,-0.183426,-1.233222,0,...,0,0,0,0,0,0,0,0,0,0
9250,-0.249308,-0.231760,-0.249021,-0.206889,-0.247099,-0.245784,-0.230057,-0.183426,-1.233222,0,...,0,0,0,0,0,0,0,0,0,0
15559,-0.189341,-0.232013,-0.182049,-0.214917,-0.116055,-0.148847,0.004558,-0.183426,0.896181,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6400,-0.213502,-0.233657,-0.196396,-0.009467,-0.204944,-0.213704,-0.198065,0.375374,0.896181,1,...,0,0,0,0,0,0,0,0,0,0
15288,-0.250558,-0.234581,-0.251104,-0.214963,-0.244305,-0.245623,-0.219035,-0.183426,0.896181,0,...,0,0,0,0,0,0,0,0,0,0
11513,-0.233651,-0.207474,-0.243490,-0.214963,-0.229514,-0.223353,-0.228342,-0.183426,-1.233222,0,...,0,1,0,0,0,0,0,0,0,0
1688,-0.139169,-0.222561,-0.047380,-0.196759,-0.119270,-0.095974,-0.177115,-0.164330,-1.233222,1,...,0,0,0,0,0,0,0,0,0,0


In [102]:
train_y, test_y = model_selection.train_test_split(target, train_size=0.6, random_state=3)

---

# 3. The model

## 3.1. Creation of the model

In [107]:
input_shape = train_X.shape[1:]

model = Sequential()
model.add(Dense(input_shape[0]))
model.add(Dense(32))
model.add(Dense(16))
model.add(Dense(1))

model.compile(loss='mse', optimizer='sgd')

## 3.2. Training

In [108]:
model.fit(train_X, train_y, epochs=30, validation_split=0.15)

Epoch 1/30


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fc3df3c2850>

## 3.3. Testing

In [110]:
model.evaluate(test_X, test_y)
predictions = model.predict(test_X)
print(predictions[:3])

[[1.1456723]
 [1.6681132]
 [1.6481962]]
