In [6]:
# Regression and xgboost models
from sklearn.linear_model import LinearRegression
import xgboost as xgb
# Model slection
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [7]:
import numpy as np
import pandas as pd

# Preprocessing

Import data from Github - (train & test data)

In [8]:
# import the data
df_train = pd.read_csv('https://raw.githubusercontent.com/ShaniPillay/regression-apples-predict-api-template/main/utils/data/train_data.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/ShaniPillay/regression-apples-predict-api-template/main/utils/data/test_data.csv')


Convert `Date` to datetime datatype

In [9]:
df_train['Date']= pd.to_datetime(df_train['Date'],format='%Y-%m-%d')
df_test['Date']= pd.to_datetime(df_test['Date'],format='%Y-%m-%d')

In [10]:
df_train.head(1)

Unnamed: 0,Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg
0,CAPE,EC120,1L,12.0,APPLE GRANNY SMITH,2020-03-10,108.0,112.0,3236.0,29,348.0,0,9.3


In [11]:
df_test.head(1)

Unnamed: 0,Index,Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand
0,1,W.CAPE-BERGRIVER ETC,EC120,1M,12.0,APPLE GOLDEN DELICIOUS,2020-07-09,128.0,136.0,5008.0,38,456.0,0


We have to make sure that our train data has the same features as the test data. 
We drop index for the test data, train data doesnt have index

In [12]:
df_test = df_test.drop('Index', axis=1) # drop index for the test data, train data doesnt have index

We select the rows 'APPLE GOLDEN DELICIOUS' only from the `Commodities` column of the train data since we are only predicting the price for the apples. Note that the test data only has the 'APPLE GOLDEN DELICIOUS' commodity already.


In [13]:
train=df_train.loc[df_train['Commodities'] == 'APPLE GOLDEN DELICIOUS']
test=df_test

Drop the column `Commodities` because it's now redundant; all our data now refers to 'APPLE GOLDEN DELICIOUS'

In [14]:
train.drop('Commodities', axis=1, inplace=True) #df.drop(['B', 'C'], axis=1)
test.drop('Commodities', axis=1, inplace=True) #df.drop(['B', 'C'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Drop `Date` column because this model doesn't use the column. the inclusion of the column doesn't improve the model

In [15]:
train.drop('Date', axis=1, inplace=True) #df.drop(['B', 'C'], axis=1)
test.drop('Date', axis=1, inplace=True)

In [16]:
train.head(2)

Unnamed: 0,Province,Container,Size_Grade,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg
1,CAPE,M4183,1L,18.3,150.0,170.0,51710.0,332,6075.6,822,8.51
7,CAPE,JG110,2M,11.0,50.0,50.0,16000.0,320,3520.0,0,4.55


#### Dummy Variable Encoding

A dummy variable is a numerical variable used in regression analysis to represent subgroups of the sample in your study.
Dummy variables are useful because they enable us to use a single regression equation to represent multiple groups. This means that we don’t need to write out separate equation models for each subgroup. The dummy variables act like ‘switches’ that turn various parameters on and off in an equation.
Below is a function that takes a dataframe and returns dummy variables of its specified categorical features in our dataset. 

In [17]:
### START FUNCTION
def dummy_encode_titles(input_df):
    temp_df = input_df.copy()
    
    # Apply Dummy Encoding 
    temp_df = pd.get_dummies(temp_df, columns=['Province','Container','Size_Grade'], drop_first=True)
    
    return temp_df
### END FUNCTION

In [18]:
train_withDummy=dummy_encode_titles(train) #for train data
test_withDummy=dummy_encode_titles(test) #for test data

In [19]:
train_withDummy.head(2)

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg,Province_EASTERN CAPE,Province_NATAL,...,Container_M9125,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
1,18.3,150.0,170.0,51710.0,332,6075.6,822,8.51,0,0,...,0,0,0,0,0,0,0,0,0,0
7,11.0,50.0,50.0,16000.0,320,3520.0,0,4.55,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
test_withDummy.head(2)

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Province_EASTERN CAPE,Province_NATAL,Province_ORANGE FREE STATE,...,Container_M9125,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
0,12.0,128.0,136.0,5008.0,38,456.0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,18.3,220.0,220.0,1760.0,8,146.4,2,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Now the data is preprocessed and it's ready to be used in our two models i.e. Random Forest and XGBoost model

# MODEL 1: Random forest Model

### Train Random Forest Model

In [21]:
X = train_withDummy.drop('avg_price_per_kg',axis=1).values
#X = train_withDummy.drop(['avg_price_per_kg','Sales_Total'],axis=1).values  #df.drop(['B', 'C'], axis=1)
y = train_withDummy['avg_price_per_kg'].values

In [22]:
# Train test split
x_train, x_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=6)

In [23]:
# Our forest consists of 100 trees with a max depth of 10
model_RF = RandomForestRegressor(n_estimators=100, max_depth=10, random_state = 42)
# Train the model on training data
model_RF.fit(x_train,y_train)

RandomForestRegressor(max_depth=10, random_state=42)

# Saving our model

In [35]:
import pickle

model_save_path = "RF_model.pkl"

with open(model_save_path,'wb') as file:
    pickle.dump(model_RF,file)

### Test Random Forest Model

Testing and getting predictions

In [19]:
from sklearn.metrics import mean_squared_error

In [24]:
# Get predictions
y_pred = model_RF.predict(np.array(test_withDummy)) #NOTE that we have to convert the 'test_withDummy' dataframe to an array

In [25]:
y_pred

array([10.77960637, 11.97935   ,  9.99788796,  8.74484298,  8.26759692,
        4.46093542,  3.63911472,  7.23485683,  8.78253153,  9.34109791,
        4.86715816,  5.6756    ,  8.95576088,  4.49274642,  4.52983803,
        6.11261901,  7.47923413,  4.49294411, 10.47075048,  6.02748946,
        5.65305733,  4.69277988,  5.39410809,  7.9135481 ,  5.56      ,
        6.49220951,  7.64208162,  7.96365512,  7.19642161,  7.11243433,
        7.97973028,  5.58095785,  9.86745405,  7.66802829,  4.84167664,
        9.81949519, 10.95082298, 11.41756214,  6.32114156,  6.53980795,
        8.53279584,  8.13348678,  5.01443514,  6.15921034,  5.56      ,
        5.32242861,  5.        ,  5.5688    ,  6.10613661,  5.20665667,
        1.3984    ,  6.28439013,  6.15880372, 14.1549    ,  8.3800587 ,
        5.7345204 , 11.96698   ,  8.71736315,  7.45858089,  6.04198706,
        7.26437319, 11.1323269 ,  6.15815764,  9.12480654,  6.32014277,
        6.19924762,  6.90055358,  5.16781851,  5.4819    ,  7.00

We now create a dataframe to submit to Kaggle:

In [22]:
# create submission dataframe

submission = pd.DataFrame(
    {
     'avg_price_per_kg': y_pred
    })
submission.index += 1 # start index from 1
submission["Index"] = submission.index #name the index column 'Index'
submission= submission[['Index', 'avg_price_per_kg']] 

In [23]:
submission

Unnamed: 0,Index,avg_price_per_kg
1,1,10.779606
2,2,11.979350
3,3,9.997888
4,4,8.744843
5,5,8.267597
...,...,...
681,681,3.215009
682,682,8.650971
683,683,6.113589
684,684,7.189324


In [24]:
# save DataFrame to csv file for submission (Give it any name you want)
submission.to_csv("ZM2_RF_model.csv", index=False)

#### Evaluating the model

In [25]:
test_pred1 = model_RF.predict(x_test)
    
test_rmse = np.sqrt(mean_squared_error(test_pred1,y_test))

{'Test RMSE': test_rmse}
    

{'Test RMSE': 0.4747883514835536}

This is the end of the Random Forest Model. We move on to the XGBoost Model'

# Model 2: XGBoost Model

After trying various models of xgboost, the one with only numerical features produced the best model.

Hence, we drop all categorical variables

In [26]:
train.drop(['Province','Container','Size_Grade'], axis=1, inplace=True)#df.drop(['B', 'C'], axis=1)
test.drop(['Province','Container','Size_Grade'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [27]:
train.head(1)

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg
1,18.3,150.0,170.0,51710.0,332,6075.6,822,8.51


In [28]:
test.head(1)

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand
0,12.0,128.0,136.0,5008.0,38,456.0,0


### Train XGBoost Model

In [29]:
X = train.drop('avg_price_per_kg',axis=1).values  #df.drop(['B', 'C'], axis=1)
y = train['avg_price_per_kg']

In [30]:
# Train test split
x_train, x_test, y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=6)

In [31]:
# create an xgboost regression model
model_XGB = xgb.XGBRegressor()

In [32]:
# Train the model on training data
model_XGB.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [33]:
X.head(1)

AttributeError: 'numpy.ndarray' object has no attribute 'head'

# saving our model

In [None]:
import pickle

model_save_path = "XG_model.pkl"

with open(model_save_path,'wb') as file:
    pickle.dump(model_XGB,file)

### Test XGBoost Model

In [None]:
#Get predictions
y_pred2 = model_XGB.predict(np.array(test))

In [None]:
y_pred2

In [None]:
# create submission dataframe

submission = pd.DataFrame(
    {
     'avg_price_per_kg': y_pred2
    })
submission.index += 1
submission["Index"] = submission.index
submission= submission[['Index', 'avg_price_per_kg']]

In [None]:
submission

In [None]:
# save DataFrame to csv file for submission
submission.to_csv("ZM2_xgboost_model.csv", index=False)

#### Evaluating the model

In [None]:
test_pred2 = model_XGB.predict(x_test)
    
test_rmse = np.sqrt(mean_squared_error(test_pred2,y_test))

{'Test RMSE': test_rmse}

As we can see the XGBoost model has lower RMSE than the Random Forest model