### 1. Import the Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle

### 2. Import the Dataset

In [2]:
df = pd.read_csv("avocado.csv")

### 3. Data Preprocessing

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    18249 non-null  int64  
 1   Date          18249 non-null  object 
 2   AveragePrice  18249 non-null  float64
 3   Total Volume  18249 non-null  float64
 4   4046          18249 non-null  float64
 5   4225          18249 non-null  float64
 6   4770          18249 non-null  float64
 7   Total Bags    18249 non-null  float64
 8   Small Bags    18249 non-null  float64
 9   Large Bags    18249 non-null  float64
 10  XLarge Bags   18249 non-null  float64
 11  type          18249 non-null  object 
 12  year          18249 non-null  int64  
dtypes: float64(9), int64(2), object(2)
memory usage: 1.8+ MB


In [5]:
df = df.drop(['Unnamed: 0', 'Date', 'Total Volume', 'Total Bags', 'type'], axis=1)

In [6]:
df = df.rename(columns={'AveragePrice': 'Price', '4046': 'PLU 4046', '4225': 'PLU 4225', '4770': 'PLU 4770', 
                        'year': 'Year'})

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        18249 non-null  float64
 1   PLU 4046     18249 non-null  float64
 2   PLU 4225     18249 non-null  float64
 3   PLU 4770     18249 non-null  float64
 4   Small Bags   18249 non-null  float64
 5   Large Bags   18249 non-null  float64
 6   XLarge Bags  18249 non-null  float64
 7   Year         18249 non-null  int64  
dtypes: float64(7), int64(1)
memory usage: 1.1 MB


### 4. Splitting the Dataset

In [8]:
X = df.drop('Price', axis=1)
Y = df['Price']

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2)

### 5. Grid Search CV

In [12]:
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

##### 5.1 Create the GridSearchCV Object

In [13]:
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv = 2, refit = True, verbose = 3, n_jobs=10)

##### 5.2 Perform Grid Search

In [14]:
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 108 candidates, totalling 216 fits


##### 5.3 Get the best model and its parameters

In [15]:
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


##### 5.4 Predictions on the testing set

In [16]:
y_pred = best_model.predict(X_test)

##### 5.5 Evaluate Performance

In [17]:
mse = np.mean((y_test - y_pred)**2)
print('Mean Squared Error: {:.2f}'.format(mse))

Mean Squared Error: 0.03


### 6. Predictive Model

In [18]:
predictor = RandomForestRegressor(max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
predictor.fit(X_train, y_train)
predictive_input = [[129, 1322, 1109, 14, 12, 7, 2024]]
predictive_input_array = np.asarray(predictive_input).reshape(1, -1)
prediction = predictor.predict(predictive_input_array)
print('Prediction: {:.2f}'.format(prediction[0]))

Prediction: 0.85


In [19]:
filename = 'predictor_model.sav'
pickle.dump(predictor, open(filename, 'wb'))