### Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Getting basic information

In [2]:
print(f"List all the file: {os.listdir()}")

List all the file: ['exploratory_data_analytics.ipynb', "[DSS-GROUP 1] Team-management - List of companies' data to be collected.csv"]


In [3]:
filepath = f"./[DSS-GROUP 1] Team-management - List of companies' data to be collected.csv"
with open(filepath, 'r') as opened_file:
    df = pd.read_csv(opened_file)

In [4]:
print("Original data's first 5 rows: ")
df.head()

Original data's first 5 rows: 


Unnamed: 0,Company name,Year,Quarter,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,Quick_ratio,D/E,ROA,EPS
0,US_American Airlines,2013,1,0.04,13.7,13.52,14.5,0.728,0.26,0.73,2.23,0.001025,0.9
1,US_American Airlines,2013,2,0.1154,13.2,14.21,14.89,0.847,0.113,0.71,4.43,0.0093,0.44
2,US_American Airlines,2013,3,0.0345,10.8,15.97,12.94,0.855,0.1621,0.47,4.1,0.012,0.76
3,US_American Airlines,2013,4,0.0667,13.46,16.14,15.93,0.844,0.18,0.52,3.82,0.0089,0.5
4,US_American Airlines,2014,1,0.0938,13.67,58.53,13.5,0.848,0.0731,1.01,0.35,0.011,0.54


### Exploratory data analysis

In [5]:
print("Data' overall information:")
df.info()

Data' overall information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company name         396 non-null    object 
 1   Year                 396 non-null    int64  
 2   Quarter              396 non-null    int64  
 3   TSR                  396 non-null    float64
 4   PRASM                396 non-null    float64
 5   RASM                 396 non-null    float64
 6   CASM                 396 non-null    float64
 7   Load_factor          396 non-null    float64
 8   Gross_profit_margin  396 non-null    float64
 9   Quick_ratio          396 non-null    float64
 10  D/E                  396 non-null    float64
 11  ROA                  396 non-null    float64
 12  EPS                  396 non-null    float64
dtypes: float64(10), int64(2), object(1)
memory usage: 40.3+ KB


1. Company name: Dimension of Location (Indvidual entity)
2. Year: Dimension of Time
3. Quarter: Dimension of Time

Sort data by the increasing dimension of time

In [6]:
df = df.sort_values(by=["Company name", "Year", "Quarter"])

### Feature engineering

In [7]:
# define the list of column of lagging:
lagged_features = ['TSR', 'PRASM', 'RASM', 'CASM', 'Load_factor', 'Gross_profit_margin', 'Quick_ratio', 'D/E', 'ROA', 'EPS']

for feature in lagged_features:
    df[f'{feature}_lag1'] = df.groupby('Company name')[feature].shift(1)  # Lag1: Previous quarter for the same company
    df[f'{feature}_lag2'] = df.groupby('Company name')[feature].shift(2)  # Optional: Lag2: Two quarters ago for the same company


In [8]:
# rolling mean for each company, past 4 quarters
df['TSR_rolling_mean'] = df.groupby('Company name')['TSR'].rolling(window=4).mean().reset_index(0, drop=True)

In [9]:
# fill all the nulls value with the 0 value
df.fillna(0, inplace=True)

In [10]:
# create a time feature: Year + (Quarter / 4) as a continuous value
df['time'] = df['Year'] + (df['Quarter'] / 4)

In [11]:
df.head()

Unnamed: 0,Company name,Year,Quarter,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,Quick_ratio,D/E,ROA,EPS,TSR_lag1,TSR_lag2,PRASM_lag1,PRASM_lag2,RASM_lag1,RASM_lag2,CASM_lag1,CASM_lag2,Load_factor_lag1,Load_factor_lag2,Gross_profit_margin_lag1,Gross_profit_margin_lag2,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
176,US_Alaska,2013,1,0.161,11.5,14.01,0.113,0.856,0.2384,1.11,0.7,0.0567,0.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.25
177,US_Alaska,2013,2,0.156,12.0,14.56,0.114,0.85,0.2622,1.1,0.678,0.062,0.75,0.161,0.0,11.5,0.0,14.01,0.0,0.113,0.0,0.856,0.0,0.2384,0.0,1.11,0.0,0.7,0.0,0.0567,0.0,0.62,0.0,0.0,2013.5
178,US_Alaska,2013,3,0.096,12.5,15.1,0.1249,0.854,0.3065,1.14,0.69,0.082,2.07,0.156,0.161,12.0,11.5,14.56,14.01,0.114,0.113,0.85,0.856,0.2622,0.2384,1.1,1.11,0.678,0.7,0.062,0.0567,0.75,0.62,0.0,2013.75
179,US_Alaska,2013,4,0.165,13.0,15.0,0.1302,0.856,0.2345,1.08,0.74,0.0876,1.11,0.096,0.156,12.5,12.0,15.1,14.56,0.1249,0.114,0.854,0.85,0.3065,0.2622,1.14,1.1,0.69,0.678,0.082,0.062,2.07,0.75,0.1445,2014.0
180,US_Alaska,2014,1,0.181,13.82,14.5,0.135,0.815,0.0769,1.07,0.74,0.0958,1.28,0.165,0.096,13.0,12.5,15.0,15.1,0.1302,0.1249,0.856,0.854,0.2345,0.3065,1.08,1.14,0.74,0.69,0.0876,0.082,1.11,2.07,0.1495,2014.25


In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df):
    """
    Preprocess the DataFrame by encoding categorical variables before splitting into training and test sets.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    
    Returns:
    - df_encoded: The DataFrame with categorical columns encoded.
    """
    
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Label Encoding for ordinal categorical variables or One-Hot Encoding for nominal ones
    df_encoded = df.copy()
    
    for col in categorical_columns:
        # If the column is a categorical feature, apply Label Encoding or One-Hot Encoding
        df_encoded[col] = df_encoded[col].astype(str)  # Ensure string type for encoding
        
        # Apply Label Encoding (you can use OneHotEncoder if needed for non-ordinal data)
        label_encoder = LabelEncoder()
        df_encoded[col] = label_encoder.fit_transform(df_encoded[col])
    
    return df_encoded

In [22]:
df_encoded = label_encode(df)

### Train-test-split, data modelling

In [25]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Year'] <= year_split]
    test_data = df[df['Year'] > year_split]
    
    return train_data, test_data

In [36]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

def train_random_forest(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    """
    Train a Random Forest model using GridSearchCV with time-series cross-validation.
    
    Parameters:
    - train_data: DataFrame containing the training data.
    - target_column: The name of the target variable column (dependent variable).
    - feature_columns: List of column names to be used as features (independent variables).
    - n_splits: Number of splits for time-series cross-validation (default is 5).
    - scoring: The scoring metric for GridSearchCV (default is RMSE).
    
    Returns:
    - best_model: The best trained model after hyperparameter tuning.
    - best_params: The best set of hyperparameters found.
    - r2_train: R² score on the training set.
    """
    
    # Define the hyperparameter grid for Random Forest inside the function
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    rf = RandomForestRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Calculate R² on the training data
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    """
    Evaluate the model using RMSE and R² on the test data.
    
    Parameters:
    - model: The trained model to be evaluated.
    - test_data: DataFrame containing the test data.
    - target_column: The name of the target variable column.
    - feature_columns: List of column names to be used as features.
    
    Returns:
    - rmse: Root Mean Squared Error (RMSE) on the test set.
    - r2: R² score on the test set.
    """
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Calculate R² score
    # r2 = r2_score(y_test, y_pred)
    
    return rmse

In [39]:
# Step 1: Split the data into the training data, and testing data:
train_data, test_data = split_data(df_encoded)

# Step 2: 
feature_columns = [col for col in df.columns if col != "TSR"]

# Step 3: Train the model with GridSearchCV
best_model, best_params, r2_train = train_random_forest(train_data, target_column="TSR", feature_columns=feature_columns)

# Step 4: Evaluate the model
rmse = evaluate_model(best_model, test_data, target_column="TSR", feature_columns=feature_columns)

# Output the results
print(f"\n Best Hyperparameters: {best_params}")
print(f"\n R2 on Train-set: {r2_train}")
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None,



In [41]:
import joblib
def dump_model(best_model, model_path):
   joblib.dump(best_model, model_path)

In [42]:
model_path = "./model_folder/random_forest_model.joblib"
dump_model(best_model, model_path)