# Tuning a Random Forest Regression Model

Custom built expanding window validation functions will be used 

In [47]:
import pandas as pd
import matplotlib.pyplot as plt

# Import your helper functions
from timeseries_model_utils import (
    expanding_window_splits,
    run_expanding_cv,
    expanding_window_grid_search
)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error


In [49]:
df = pd.read_csv("New_csv.csv")   # replace with your file
df = df.sort_values("Year").reset_index(drop=True)
df.head()


Unnamed: 0,Year,Homicide,Assault,Sexual offences,Harm or endanger persons,"Robbery, blackmail, and extortion",Burglary,Theft,Fraud and related offences,Drug offences,...,"Public order, health, and safety offences",Traffic and vehicle offences,Offences against justice procedures and orders,Offences against government,Environmental offences,Miscellaneous offences,Total,Population,Crime_Index,Crime_Index_Lagged
0,1980,353,7208,1401,2176,426,11893,29103,19794,7839,...,28539,173948,27664,1376,3521,9651,331751,3112900,10.657297,
1,1981,366,7933,1092,2299,494,12974,29654,25007,8340,...,22637,142990,17981,2027,3360,9836,294409,3124900,9.421389,10.657297
2,1982,329,7760,1298,2231,504,14387,32682,26131,9518,...,18268,104739,13957,1203,4025,7878,252119,3156100,7.988308,9.421389
3,1983,407,8741,1449,2693,657,13851,32304,28481,10346,...,16067,105649,15332,736,2252,7774,254658,3199300,7.959804,7.988308
4,1984,406,9349,1682,2651,729,13905,33492,28069,12819,...,17456,116414,16389,1300,2235,8552,274022,3227100,8.491277,7.959804


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 22 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Year                                            45 non-null     int64  
 1   Homicide                                        45 non-null     int64  
 2   Assault                                         45 non-null     int64  
 3   Sexual offences                                 45 non-null     int64  
 4   Harm or endanger persons                        45 non-null     int64  
 5   Robbery, blackmail, and extortion               45 non-null     int64  
 6   Burglary                                        45 non-null     int64  
 7   Theft                                           45 non-null     int64  
 8   Fraud and related offences                      45 non-null     int64  
 9   Drug offences                                

In [51]:
target_col = "Crime_Index"  

feature_cols = [c for c in df.columns if c not in [ target_col]]

X = df[feature_cols]
y = df[target_col]


In [52]:
param_grid = {
    "n_estimators": [100, 200, 400],
    "max_depth": [3, 5, None]
}

best_params, best_score, results_df = expanding_window_grid_search(
    RandomForestRegressor,
    param_grid,
    X, y,
    initial_train_size=10,
    horizon=1,
    step=1
)

print("\nBest Parameters:", best_params)
print("Best Score:", best_score)

results_df



[GridSearch] Params {'n_estimators': 100, 'max_depth': 3} → Score = 0.4469
[GridSearch] Params {'n_estimators': 100, 'max_depth': 5} → Score = 0.4104
[GridSearch] Params {'n_estimators': 100, 'max_depth': None} → Score = 0.4130
[GridSearch] Params {'n_estimators': 200, 'max_depth': 3} → Score = 0.4414
[GridSearch] Params {'n_estimators': 200, 'max_depth': 5} → Score = 0.4144
[GridSearch] Params {'n_estimators': 200, 'max_depth': None} → Score = 0.4045
[GridSearch] Params {'n_estimators': 400, 'max_depth': 3} → Score = 0.4330
[GridSearch] Params {'n_estimators': 400, 'max_depth': 5} → Score = 0.3991
[GridSearch] Params {'n_estimators': 400, 'max_depth': None} → Score = 0.4057

Best Parameters: {'n_estimators': np.float64(400.0), 'max_depth': np.float64(5.0)}
Best Score: 0.3991015210797483


Unnamed: 0,n_estimators,max_depth,score
0,100,3.0,0.446937
1,100,5.0,0.410434
2,100,,0.413027
3,200,3.0,0.441433
4,200,5.0,0.414386
5,200,,0.404482
6,400,3.0,0.432994
7,400,5.0,0.399102
8,400,,0.405693


In [53]:

param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 50, 100],
    "max_iter": [1000, 5000, 10000]
}


In [54]:

X = X.ffill().bfill()


In [55]:
from sklearn.linear_model import Lasso

param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 50, 100],
    "max_iter": [1000, 5000, 10000],
}

best_params, best_score, results_df = expanding_window_grid_search(
    Lasso,
    param_grid,
    X,
    y,
    initial_train_size=10,
    horizon=1,
    step=1
)

print("\nBest Parameters (Lasso):", best_params)
print("Best Score:", best_score)
results_df


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[GridSearch] Params {'alpha': 0.001, 'max_iter': 1000} → Score = 0.0963


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[GridSearch] Params {'alpha': 0.001, 'max_iter': 5000} → Score = 0.1045
[GridSearch] Params {'alpha': 0.001, 'max_iter': 10000} → Score = 0.1046


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[GridSearch] Params {'alpha': 0.01, 'max_iter': 1000} → Score = 0.1046
[GridSearch] Params {'alpha': 0.01, 'max_iter': 5000} → Score = 0.1235
[GridSearch] Params {'alpha': 0.01, 'max_iter': 10000} → Score = 0.1235
[GridSearch] Params {'alpha': 0.1, 'max_iter': 1000} → Score = 0.1068


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

[GridSearch] Params {'alpha': 0.1, 'max_iter': 5000} → Score = 0.0876
[GridSearch] Params {'alpha': 0.1, 'max_iter': 10000} → Score = 0.0801


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

[GridSearch] Params {'alpha': 1, 'max_iter': 1000} → Score = 0.0680
[GridSearch] Params {'alpha': 1, 'max_iter': 5000} → Score = 0.0559
[GridSearch] Params {'alpha': 1, 'max_iter': 10000} → Score = 0.0559
[GridSearch] Params {'alpha': 10, 'max_iter': 1000} → Score = 0.0627
[GridSearch] Params {'alpha': 10, 'max_iter': 5000} → Score = 0.0628
[GridSearch] Params {'alpha': 10, 'max_iter': 10000} → Score = 0.0628
[GridSearch] Params {'alpha': 50, 'max_iter': 1000} → Score = 0.0743
[GridSearch] Params {'alpha': 50, 'max_iter': 5000} → Score = 0.0743
[GridSearch] Params {'alpha': 50, 'max_iter': 10000} → Score = 0.0743
[GridSearch] Params {'alpha': 100, 'max_iter': 1000} → Score = 0.0818
[GridSearch] Params {'alpha': 100, 'max_iter': 5000} → Score = 0.0818
[GridSearch] Params {'alpha': 100, 'max_iter': 10000} → Score = 0.0818

Best Parameters (Lasso): {'alpha': np.float64(1.0), 'max_iter': np.float64(5000.0)}
Best Score: 0.055914448365105264


Unnamed: 0,alpha,max_iter,score
0,0.001,1000,0.096327
1,0.001,5000,0.104543
2,0.001,10000,0.104626
3,0.01,1000,0.104579
4,0.01,5000,0.123476
5,0.01,10000,0.12354
6,0.1,1000,0.106785
7,0.1,5000,0.087618
8,0.1,10000,0.080142
9,1.0,1000,0.068039


In [56]:
X

Unnamed: 0,Year,Homicide,Assault,Sexual offences,Harm or endanger persons,"Robbery, blackmail, and extortion",Burglary,Theft,Fraud and related offences,Drug offences,...,Property damage,"Public order, health, and safety offences",Traffic and vehicle offences,Offences against justice procedures and orders,Offences against government,Environmental offences,Miscellaneous offences,Total,Population,Crime_Index_Lagged
0,1980,353,7208,1401,2176,426,11893,29103,19794,7839,...,4290,28539,173948,27664,1376,3521,9651,331751,3112900,10.657297
1,1981,366,7933,1092,2299,494,12974,29654,25007,8340,...,4938,22637,142990,17981,2027,3360,9836,294409,3124900,10.657297
2,1982,329,7760,1298,2231,504,14387,32682,26131,9518,...,4914,18268,104739,13957,1203,4025,7878,252119,3156100,9.421389
3,1983,407,8741,1449,2693,657,13851,32304,28481,10346,...,5323,16067,105649,15332,736,2252,7774,254658,3199300,7.988308
4,1984,406,9349,1682,2651,729,13905,33492,28069,12819,...,5727,17456,116414,16389,1300,2235,8552,274022,3227100,7.959804
5,1985,447,9671,1816,2901,997,13685,36226,28449,13519,...,5870,18512,109164,16894,895,2497,7561,272657,3247100,8.491277
6,1986,464,10244,1917,3143,970,12914,36251,26475,15461,...,6543,18856,130905,18638,1059,2839,7553,298086,3246300,8.396939
7,1987,485,11476,2180,3204,864,12892,36111,27397,15045,...,5878,15250,126879,20656,475,2789,6921,292500,3274400,9.182331
8,1988,529,12469,2440,3346,933,12233,34175,29323,15344,...,6545,15706,136190,25120,406,2523,7362,308817,3283400,8.932934
9,1989,603,11567,2442,3265,812,11081,30824,26463,13399,...,5481,11661,112891,26350,246,2627,7561,270585,3299200,9.405403


# train a lasso with alpha 0.001 and max iter 1000 and save it

In [80]:
from sklearn.linear_model import Lasso

# Initialize model
lasso = Lasso(alpha=0.001, max_iter=1000)

# Train model
lasso.fit(X, y)


  model = cd_fast.enet_coordinate_descent(


In [82]:
import joblib

# Save model
joblib.dump(lasso, "lasso_alpha_0_001_maxiter_1000.pkl")


['lasso_alpha_0_001_maxiter_1000.pkl']

In [84]:
from sklearn.linear_model import Lasso
import joblib

# Train Lasso model
lasso = Lasso(alpha=0.001, max_iter=1000)
lasso.fit(X, y)

# Save as .joblib file
joblib.dump(lasso, "lasso_alpha_0_001_maxiter_1000.joblib")


  model = cd_fast.enet_coordinate_descent(


['lasso_alpha_0_001_maxiter_1000.joblib']