In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
# Load the avocado data; the CSV's unnamed first column becomes the index.
df = pd.read_csv('avocado.csv', index_col='Unnamed: 0')

# Regression target, extracted before its column is removed from the features.
y = df['Total Volume'].to_numpy()

# Drop the PLU columns, the per-size bag columns and the target itself.
# 'Total Bags' and 'Date' stay in the frame.
df = df.drop(
    columns=[
        '4046', '4225', '4770',
        'Total Volume',
        'Small Bags', 'Large Bags', 'XLarge Bags',
    ]
)
df

Unnamed: 0,Date,AveragePrice,Total Bags,type,year,region
0,2015-12-27,1.33,8696.87,conventional,2015,Albany
1,2015-12-20,1.35,9505.56,conventional,2015,Albany
2,2015-12-13,0.93,8145.35,conventional,2015,Albany
3,2015-12-06,1.08,5811.16,conventional,2015,Albany
4,2015-11-29,1.28,6183.95,conventional,2015,Albany
...,...,...,...,...,...,...
7,2018-02-04,1.63,13498.67,organic,2018,WestTexNewMexico
8,2018-01-28,1.71,9264.84,organic,2018,WestTexNewMexico
9,2018-01-21,1.87,9394.11,organic,2018,WestTexNewMexico
10,2018-01-14,1.93,10969.54,organic,2018,WestTexNewMexico


In [3]:
# Parse the ISO-formatted date strings in one vectorized call instead of a
# per-row Python strptime; the explicit format keeps parsing strict.
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Calendar features derived from the parsed date.
df['month'] = parsed_dates.dt.month
df['day'] = parsed_dates.dt.day
# monday = 0
# NOTE(review): the rows displayed below all show day of week 6 -- if every
# sample falls on the same weekday this feature is constant; confirm.
df['day of week'] = parsed_dates.dt.dayofweek

# Names of the date-derived columns ('year' is already present in the CSV).
dates = ['year', 'month', 'day', 'day of week']

# The raw date is no longer needed once its components are extracted.
df = df.drop(columns='Date')
df

Unnamed: 0,AveragePrice,Total Bags,type,year,region,month,day,day of week
0,1.33,8696.87,conventional,2015,Albany,12,27,6
1,1.35,9505.56,conventional,2015,Albany,12,20,6
2,0.93,8145.35,conventional,2015,Albany,12,13,6
3,1.08,5811.16,conventional,2015,Albany,12,6,6
4,1.28,6183.95,conventional,2015,Albany,11,29,6
...,...,...,...,...,...,...,...,...
7,1.63,13498.67,organic,2018,WestTexNewMexico,2,4,6
8,1.71,9264.84,organic,2018,WestTexNewMexico,1,28,6
9,1.87,9394.11,organic,2018,WestTexNewMexico,1,21,6
10,1.93,10969.54,organic,2018,WestTexNewMexico,1,14,6


In [4]:
# Fit the encoder on the two categorical columns, then expand them into a
# dense indicator matrix with one column per learned category.
enc = OneHotEncoder()
enc.fit(df[['type', 'region']])
ohe_features = enc.transform(df[['type', 'region']]).toarray()

# Show the categories learned for each encoded column.
enc.categories_

[array(['conventional', 'organic'], dtype=object),
 array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston',
        'BuffaloRochester', 'California', 'Charlotte', 'Chicago',
        'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver',
        'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton',
        'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville',
        'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale',
        'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork',
        'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia',
        'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland',
        'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento',
        'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina',
        'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse',
        'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object)]

In [5]:
# Every column other than the two categorical ones goes in as-is.
numeric_features = df.drop(columns=['type', 'region']).to_numpy()
print(numeric_features.shape, ohe_features.shape)

# Final design matrix: numeric columns first, one-hot columns to their right.
X = np.hstack((numeric_features, ohe_features))
print(X.shape, y.shape)

(18249, 6) (18249, 56)
(18249, 62) (18249,)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Create the gradient boosting regressor. The fixed seed makes the fit
# reproducible when the notebook is restarted and re-run.
gb_regr = GradientBoostingRegressor(random_state=42)

# Train the model using the training set
gb_regr.fit(X_train, y_train)

# Make predictions using the testing set
gb_y_pred = gb_regr.predict(X_test)

# R^2 on the held-out set. A regressor's .score() is this same R^2 computed
# from a fresh predict(), so the value is computed once and reused below.
test_r2 = r2_score(y_test, gb_y_pred)

# The mean squared error
print(f'Mean squared error: {mean_squared_error(y_test, gb_y_pred)}')
# The coefficient of determination: 1 is perfect prediction
print(f'Coefficient of determination: {test_r2}')
# Training set score
print(f'Training set score: {gb_regr.score(X_train, y_train)}')
# Testing set score
print(f'Test set score: {test_r2}')

Mean squared error: 210487701126.8486
Coefficient of determination: 0.9841338185985168
Training set score: 0.9951313473516881
Test set score: 0.9841338185985168


In [8]:
# joblib helpers for persistence; only `dump` is used in this cell.
from joblib import dump, load
# Serialize the fitted gradient boosting model to 'model.joblib'.
# NOTE(review): the fitted OneHotEncoder (`enc`) is not saved here -- if the
# model is loaded outside this notebook the same encoding will presumably be
# needed to build its inputs; confirm whether it should be persisted as well.
dump(gb_regr, 'model.joblib') 

['model.joblib']