In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np

In [6]:
import pickle

with open('df.pkl', 'rb') as file:
    transformed_df = pickle.load(file)

In [43]:
with open('pre_pipeline.pkl', 'rb') as file:
    pipeline = pickle.load(file)

In [7]:
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17053 entries, 0 to 17052
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   weather_cloudy           17053 non-null  float64
 1   weather_light snow/rain  17053 non-null  float64
 2   month_August             17053 non-null  float64
 3   month_December           17053 non-null  float64
 4   month_February           17053 non-null  float64
 5   month_January            17053 non-null  float64
 6   month_July               17053 non-null  float64
 7   month_June               17053 non-null  float64
 8   month_March              17053 non-null  float64
 9   month_May                17053 non-null  float64
 10  month_November           17053 non-null  float64
 11  month_October            17053 non-null  float64
 12  month_September          17053 non-null  float64
 13  dayofweek_Monday         17053 non-null  float64
 14  dayofweek_Saturday    

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train, test = train_test_split(transformed_df, test_size=.25)

In [10]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [11]:
X_train = train.drop('log_total_users', axis=1)
X_test = test.drop('log_total_users', axis=1)
y_train = train.log_total_users
y_test = test.log_total_users

In [12]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12789, 24), (12789,), (4264, 24), (4264,))

## Model Fitting:

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

### Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
rf = RandomForestRegressor(random_state=0)
params = {'n_estimators':np.arange(100,401,50), 'max_depth':np.arange(5,21,3),
           'max_features':['sqrt','log2','auto'], 'max_samples':[0.6,0.7], 'bootstrap':[True, False]
          }
grid = GridSearchCV(rf, param_grid=params, n_jobs=-1, verbose=1)
grid_model = grid.fit(X_train, y_train)

Fitting 5 folds for each of 504 candidates, totalling 2520 fits


In [21]:
rf_best = grid.best_estimator_
print('Best Hyperparameters:', grid.best_params_)

Best Hyperparameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'max_samples': 0.7, 'n_estimators': 350}


In [22]:
rfr = rf_best.fit(X_train, y_train)
rfr_train_pred = rfr.predict(X_train)
rfr_test_pred = rfr.predict(X_test)

In [23]:
rfr_train_mse = mean_squared_error(rfr_train_pred, y_train)
rfr_test_mse = mean_squared_error(rfr_test_pred, y_test)
rfr_train_mse, rfr_test_mse

(0.06807002640446871, 0.22399216302514977)

In [24]:
rfr_train_mape = mean_absolute_percentage_error(rfr_train_pred, y_train)
rfr_test_mape = mean_absolute_percentage_error(rfr_test_pred, y_test)
rfr_train_mape, rfr_test_mape

(0.030556600989400252, 0.05444688724039325)

### Gradient Boosting

In [25]:
from sklearn.ensemble import GradientBoostingRegressor

In [28]:
gb = GradientBoostingRegressor(random_state=0)
params1 = {'n_estimators':np.arange(100,301,50), 'max_depth':np.arange(1,16,3),
           'max_features':['sqrt','log2','auto'], 'learning_rate':[0.01,0.05,0.1,0.5]
          }
grid1 = GridSearchCV(gb, param_grid=params1, n_jobs=-1, verbose=1)
grid_model1 = grid1.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [29]:
gb_best = grid1.best_estimator_
print('Best Hyperparameters:', grid1.best_params_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 300}


In [30]:
gbr = gb_best.fit(X_train, y_train)
gbr_train_pred = gbr.predict(X_train)
gbr_test_pred = gbr.predict(X_test)

In [31]:
gbr_train_mse = mean_squared_error(gbr_train_pred, y_train)
gbr_test_mse = mean_squared_error(gbr_test_pred, y_test)
gbr_train_mse, gbr_test_mse

(0.04338660145999777, 0.12222105503493534)

In [32]:
gbr_train_mape = mean_absolute_percentage_error(gbr_train_pred, y_train)
gbr_test_mape = mean_absolute_percentage_error(gbr_test_pred, y_test)
gbr_train_mape, gbr_test_mape

(0.02293345667920015, 0.03775889360216541)

### XGBoost

In [38]:
import xgboost

In [19]:
xg = xgboost.XGBRegressor(random_state=0)
params2 = {'eta':[0.01,0.1,0.3,0.5,1], 'max_delta_step':[1,3,5,7],
           'max_features':['sqrt','log2'], 'subsample':[0.5,0.6,0.7,1]
          }
grid2 = GridSearchCV(xg, param_grid=params2, n_jobs=-1, verbose=1, error_score='raise')
grid_model2 = grid2.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are

In [20]:
xgb_best = grid2.best_estimator_
print('Best Hyperparameters:', grid2.best_params_)

Best Hyperparameters: {'eta': 0.3, 'max_delta_step': 1, 'max_features': 'sqrt', 'subsample': 1}


In [21]:
xgb = xgb_best.fit(X_train, y_train)
xgb_train_pred = xgb.predict(X_train)
xgb_test_pred = xgb.predict(X_test)

In [22]:
xgb_train_mse = mean_squared_error(xgb_train_pred, y_train)
xgb_test_mse = mean_squared_error(xgb_test_pred, y_test)
xgb_train_mse, xgb_test_mse

(0.04579695714310215, 0.1077975481370881)

In [23]:
xgb_train_mape = mean_absolute_percentage_error(xgb_train_pred, y_train)
xgb_test_mape = mean_absolute_percentage_error(xgb_test_pred, y_test)
xgb_train_mape, xgb_test_mape

(0.02325990417735471, 0.03468962679074967)

### Pickle

In [33]:
rf = RandomForestRegressor(bootstrap=True, max_depth=20, max_features='sqrt', max_samples=0.7, n_estimators=350, random_state=0)

In [34]:
rf_model = rf.fit(X_train, y_train)

In [36]:
gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=7, max_features='sqrt', n_estimators=300, random_state=0)

In [37]:
gb_model = gb.fit(X_train, y_train)

In [39]:
xg = xgboost.XGBRegressor(eta=0.3, max_delta_step=1, max_features='sqrt', subsample=1, random_state=0)

In [40]:
xg_model = xg.fit(X_train, y_train)

In [46]:
models = {'Random_Forest':rf_model, 'Gradient_Boosting':gb_model, 'XGBoost':xg_model}

In [47]:
with open('models.pkl', 'wb') as file:
    pickle.dump(models, file)