In [2]:
import json 
import numpy as np
import pandas as pd 
import joblib

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the avocado CSV, then split it into a feature frame and the
# 'Total Volume' target over the same positional window of rows.
df = pd.read_csv('data/avocado.csv')

# NOTE(review): the window 1:1000 is positional, so it keeps rows 1..999 and
# skips row 0 — confirm that dropping the first row is intended.
row_window = slice(1, 1000)
non_feature_columns = ['Unnamed: 0', 'Date', 'Total Volume', 'year']

X = df.drop(columns=non_feature_columns).iloc[row_window]
y = df['Total Volume'].iloc[row_window]

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split stable
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [5]:
# Take the first row of the per-column modes of the training features and use
# it to fill any missing values in the training frame.
column_modes = X_train.mode()
train_mode = dict(column_modes.iloc[0])
X_train = X_train.fillna(value=train_mode)

# Display the imputation values that will be persisted later.
train_mode

{'AveragePrice': 1.1,
 '4046': 509.94,
 '4225': 2985.73,
 '4770': 21.5,
 'Total Bags': 3047.17,
 'Small Bags': 1119.18,
 'Large Bags': 0.0,
 'XLarge Bags': 0.0,
 'type': 'conventional',
 'region': 'Columbus'}

In [6]:
# Fit one LabelEncoder per categorical feature, keep it in `encoders` keyed by
# column name, and replace the column with its integer codes.
encoders = {}

for column in ('type', 'region'):
    categorical_convert = LabelEncoder()
    # fit() returns the encoder itself, so the fitted object is what gets stored.
    encoders[column] = categorical_convert.fit(X_train[column])
    X_train[column] = categorical_convert.transform(X_train[column])

In [7]:
# Encode the target with its own LabelEncoder. After the feature loop,
# `categorical_convert` is the very object stored as encoders['region'];
# calling fit_transform on it here would refit that encoder on the target and
# overwrite its fitted region classes before `encoders` is saved.
# NOTE(review): 'Total Volume' appears to be a continuous quantity, so label
# encoding yields roughly one class per distinct value — confirm that a
# classifier (rather than a regressor) is what is intended.
target_encoder = LabelEncoder()
training_score_encoder = target_encoder.fit_transform(y_train)

In [9]:
# Fit an extra-trees ensemble of 100 trees on the encoded training data.
# The fixed random_state (same seed as the train/test split) makes the fitted
# model reproducible across kernel restarts.
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et = et.fit(X_train, training_score_encoder)

In [10]:
# Fit a random forest of 100 trees on the encoded training data.
# The fixed random_state (same seed as the train/test split) makes the fitted
# model reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=100, random_state=1234)
rf = rf.fit(X_train, training_score_encoder)

In [11]:
# Persist the preprocessing objects (imputation values and the per-column
# label encoders) together with both trained models (random forest and
# extra trees) as compressed joblib files in the current working directory.
# joblib.dump returns the list of written filenames; the last call is the
# cell's trailing expression, so its return value is what the cell displays.
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']