In [1]:
import pandas as pd

df = pd.read_csv('../final_data.csv')
df.head()

Unnamed: 0,Company name,TSR,Time,Engineered_PRASM,Engineered_RASM,Engineered_CASM,Engineered_Load_factor,Engineered_Gross_profit_margin,Engineered_Quick_ratio,Engineered_D/E,...,Engineered_Gross_profit_margin_lag2,Engineered_Quick_ratio_lag1,Engineered_Quick_ratio_lag2,Engineered_D/E_lag1,Engineered_D/E_lag2,Engineered_ROA_lag1,Engineered_ROA_lag2,Engineered_EPS_lag1,Engineered_EPS_lag2,TSR_rolling_mean
0,0,0.161,2013.25,23152.375,28205.6325,227.49725,1723.342,479.9588,2234.7075,1409.275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.156,2013.5,24162.0,29316.56,229.539,1711.475,527.9397,2214.85,1365.153,...,0.0,2234.7075,0.0,1409.275,0.0,114.151275,0.0,1248.215,0.0,0.0
2,0,0.096,2013.75,25171.875,30407.625,251.517375,1719.7425,617.214375,2295.675,1389.4875,...,479.9588,2214.85,2234.7075,1365.153,1409.275,124.837,114.151275,1510.125,1248.215,0.0
3,0,0.165,2014.0,26182.0,30210.0,262.2228,1723.984,472.283,2175.12,1490.36,...,527.9397,2295.675,2214.85,1389.4875,1365.153,165.1275,124.837,4168.4625,1510.125,0.1445
4,0,0.181,2014.25,27836.935,29206.625,271.92375,1641.61375,154.895825,2155.2475,1490.545,...,617.214375,2175.12,2295.675,1490.36,1389.4875,176.4264,165.1275,2235.54,4168.4625,0.1495


In [2]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Time'] <= year_split]
    test_data = df[df['Time'] > year_split]
    
    return train_data, test_data

In [3]:
train_data, test_data = split_data(df)
print(train_data.shape)
print(test_data.shape)

(324, 33)
(72, 33)


In [4]:
import joblib

def scale_data(data, scaler_path):
    scaler = joblib.load(scaler_path)
    
    scaled_col = data.drop(['Company name', 'TSR', 'Time'], axis=1)
    
    data_scaled = scaler.transform(scaled_col)
    scaled_df = pd.DataFrame(data_scaled, columns=scaled_col.columns, index=data.index)
    final_data_scaled = data[['Company name', 'TSR', 'Time']].join(scaled_df)
    
    return final_data_scaled


# models

In [6]:
scaler_path = './scaler_folder/minmax_scaler.joblib'
train_data_scaled = scale_data(train_data, scaler_path)
train_data_scaled

Unnamed: 0,Company name,TSR,Time,Engineered_PRASM,Engineered_RASM,Engineered_CASM,Engineered_Load_factor,Engineered_Gross_profit_margin,Engineered_Quick_ratio,Engineered_D/E,...,Engineered_Gross_profit_margin_lag2,Engineered_Quick_ratio_lag1,Engineered_Quick_ratio_lag2,Engineered_D/E_lag1,Engineered_D/E_lag2,Engineered_ROA_lag1,Engineered_ROA_lag2,Engineered_EPS_lag1,Engineered_EPS_lag2,TSR_rolling_mean
0,0,0.1610,2013.25,0.322983,0.235652,0.000452,0.008172,0.859217,0.191851,0.569805,...,0.754579,0.000000,0.000000,0.530865,0.530865,0.478699,0.478699,0.529480,0.529480,0.856352
1,0,0.1560,2013.50,0.337168,0.245119,0.000483,0.008098,0.869678,0.190146,0.568586,...,0.754579,0.191851,0.000000,0.569805,0.530865,0.697455,0.478699,0.564614,0.529480,0.856352
2,0,0.0960,2013.75,0.351356,0.254418,0.000816,0.008149,0.889141,0.197085,0.569258,...,0.859217,0.190146,0.191851,0.568586,0.569805,0.717933,0.697455,0.571986,0.564614,0.856352
3,0,0.1650,2014.00,0.365548,0.252733,0.000978,0.008176,0.857544,0.186735,0.572045,...,0.869678,0.197085,0.190146,0.569258,0.568586,0.795144,0.717933,0.646812,0.571986,0.866079
4,0,0.1810,2014.25,0.388800,0.244182,0.001125,0.007665,0.788349,0.185029,0.572050,...,0.889141,0.186735,0.197085,0.572045,0.569258,0.816797,0.795144,0.592405,0.646812,0.866415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,8,0.0667,2021.00,0.241608,0.176123,0.538815,0.004457,0.576616,0.189119,0.763728,...,0.000000,0.170013,0.095404,0.749182,0.655363,0.161155,0.357505,0.065346,0.000000,0.847861
384,8,0.2553,2021.25,0.298718,0.310505,0.431977,0.005009,0.824160,0.194349,0.719636,...,0.406936,0.189119,0.170013,0.763728,0.749182,0.000000,0.161155,0.165977,0.065346,0.858469
385,8,0.0690,2021.50,0.182309,0.195118,0.447349,0.005636,0.890805,0.258585,0.719101,...,0.576616,0.194349,0.189119,0.719636,0.763728,0.393483,0.000000,0.245013,0.165977,0.865520
386,8,-0.1613,2021.75,0.273228,0.245109,0.432085,0.009644,0.895847,0.244731,0.716331,...,0.824160,0.258585,0.194349,0.719101,0.719636,0.452357,0.393483,0.441853,0.245013,0.860217


In [7]:
scaler_path = './scaler_folder/minmax_scaler.joblib'
test_data_scaled = scale_data(test_data, scaler_path)
test_data_scaled.head()

Unnamed: 0,Company name,TSR,Time,Engineered_PRASM,Engineered_RASM,Engineered_CASM,Engineered_Load_factor,Engineered_Gross_profit_margin,Engineered_Quick_ratio,Engineered_D/E,...,Engineered_Gross_profit_margin_lag2,Engineered_Quick_ratio_lag1,Engineered_Quick_ratio_lag2,Engineered_D/E_lag1,Engineered_D/E_lag2,Engineered_ROA_lag1,Engineered_ROA_lag2,Engineered_EPS_lag1,Engineered_EPS_lag2,TSR_rolling_mean
36,0,0.0909,2022.25,0.381261,0.250341,0.462839,0.007142,0.906154,0.239584,0.567185,...,0.911979,0.253441,0.255146,0.567181,0.566617,0.573247,0.567811,0.779903,0.757109,0.860404
37,0,-0.0413,2022.5,0.401199,0.264162,0.453702,0.007832,0.910186,0.237877,0.567748,...,0.916231,0.239584,0.253441,0.567185,0.567181,0.581009,0.573247,0.802702,0.779903,0.858179
38,0,0.0598,2022.75,0.423984,0.277986,0.441496,0.008147,0.913821,0.23617,0.568312,...,0.906154,0.237877,0.239584,0.567748,0.567185,0.587223,0.581009,0.825508,0.802702,0.857784
39,0,0.04,2023.0,0.418353,0.272849,0.447683,0.00796,0.91706,0.234462,0.568317,...,0.910186,0.23617,0.237877,0.568312,0.567748,0.593051,0.587223,0.848318,0.825508,0.858866
40,0,0.1,2023.25,0.395664,0.260813,0.456937,0.007523,0.919992,0.232754,0.56888,...,0.913821,0.234462,0.23617,0.568317,0.568312,0.598493,0.593051,0.871135,0.848318,0.859019


In [8]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

def gradient_boosting_train(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
  param_grid = {
    'learning_rate': [0.05, 0.2, 0.1],
    'min_samples_split':[2, 3, 5],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200, 300]
  }
  X_train = train_data[feature_columns]
  y_train = train_data[target_column]
  gbr = GradientBoostingRegressor(random_state=42)
  tscv = TimeSeriesSplit(n_splits=n_splits)
  
  grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
  grid_search.fit(X_train, y_train)
  
  # get best params + estimators
  best_params = grid_search.best_params_
  best_model = grid_search.best_estimator_
  
  # calculate r2
  y_train_pred = best_model.predict(X_train)
  r2_train = r2_score(y_train, y_train_pred)
  
  return best_model, best_params, r2_train


In [16]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

def train_random_forest(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    
    # Define the hyperparameter grid for Random Forest inside the function
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    rf = RandomForestRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Calculate R² on the training data
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

In [19]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost.sklearn import XGBRegressor 
from sklearn.metrics import r2_score

def train_xgb(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    param_grid = {
        'learning_rate': [0.3, 0.5, 0.7], 
        "gamma":[ 0.0, 0.1, 0.2],
        "max_depth": [3, 5, 6, 10],
        "min_child_weight": [ 1, 3, 5, 7],
        "colsample_bytree":[ 0.3, 0.4, 0.7],
        'n_estimators': [100, 200, 300]
    }
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    xgb = XGBRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Calculate R² on the training data
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

# evaluate

In [9]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
  X_test = test_data[feature_columns]
  y_test = test_data[target_column]
  y_pred = model.predict(X_test)
  # Calculate RMSE
  rmse = mean_squared_error(y_test, y_pred, squared=False)
  # Calculate R² score\n",
  # r2 = r2_score(y_test, y_pred)\n",
  return rmse

In [18]:
import joblib
def save_file(scaler, filename):
  joblib.dump(scaler, filename) 

In [10]:
feature_columns = [col for col in train_data_scaled.columns if col != "TSR"]

### gradient boosting

In [21]:
best_model, best_params, r2_train = gradient_boosting_train(train_data_scaled, target_column="TSR", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data_scaled, target_column="TSR", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 729 candidates, totalling 3645 fits

 Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.6}

 R2 on Train-set: 0.9999900162859513

 Root Mean Squared Error (RMSE) on Test Set: 0.07124991802932935




In [23]:
gbr_path = "./model_folder/gbr.joblib"
save_file(best_model, gbr_path)

### random forest

In [17]:
best_model, best_params, r2_train = train_random_forest(train_data_scaled, target_column="TSR", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data_scaled, target_column="TSR", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits

 Best Hyperparameters: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}

 R2 on Train-set: 0.4132775851563426

 Root Mean Squared Error (RMSE) on Test Set: 0.1949319786871062




In [20]:
rf_path = "./model_folder/random_forest.joblib"
save_file(best_model, rf_path)

### xgb

In [None]:
best_model, best_params, r2_train = train_random_forest(train_data_scaled, target_column="TSR", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data_scaled, target_column="TSR", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

In [None]:
xgb_path = "./model_folder/xgb.joblib"
save_file(best_model, xgb_path)