In [1]:
import pandas as pd

df = pd.read_csv('processed_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Company name,Year,Quarter,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,...,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
0,176,US_Alaska,2013,1,0.161,11.5,14.01,0.113,0.856,0.2384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.25
1,177,US_Alaska,2013,2,0.156,12.0,14.56,0.114,0.85,0.2622,...,1.11,0.0,0.7,0.0,0.0567,0.0,0.62,0.0,0.0,2013.5
2,178,US_Alaska,2013,3,0.096,12.5,15.1,0.1249,0.854,0.3065,...,1.1,1.11,0.678,0.7,0.062,0.0567,0.75,0.62,0.0,2013.75
3,179,US_Alaska,2013,4,0.165,13.0,15.0,0.1302,0.856,0.2345,...,1.14,1.1,0.69,0.678,0.082,0.062,2.07,0.75,0.1445,2014.0
4,180,US_Alaska,2014,1,0.181,13.82,14.5,0.135,0.815,0.0769,...,1.08,1.14,0.74,0.69,0.0876,0.082,1.11,2.07,0.1495,2014.25


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                396 non-null    int64  
 1   Company name              396 non-null    object 
 2   Year                      396 non-null    int64  
 3   Quarter                   396 non-null    int64  
 4   TSR                       396 non-null    float64
 5   PRASM                     396 non-null    float64
 6   RASM                      396 non-null    float64
 7   CASM                      396 non-null    float64
 8   Load_factor               396 non-null    float64
 9   Gross_profit_margin       396 non-null    float64
 10  Quick_ratio               396 non-null    float64
 11  D/E                       396 non-null    float64
 12  ROA                       396 non-null    float64
 13  EPS                       396 non-null    float64
 14  TSR_lag1  

In [4]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df):
    """
    Preprocess the DataFrame by encoding categorical variables before splitting into training and test sets.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    
    Returns:
    - df_encoded: The DataFrame with categorical columns encoded.
    """
    
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Label Encoding for ordinal categorical variables or One-Hot Encoding for nominal ones
    df_encoded = df.copy()
    
    for col in categorical_columns:
        # If the column is a categorical feature, apply Label Encoding or One-Hot Encoding
        df_encoded[col] = df_encoded[col].astype(str)  # Ensure string type for encoding
        
        # Apply Label Encoding (you can use OneHotEncoder if needed for non-ordinal data)
        label_encoder = LabelEncoder()
        df_encoded[col] = label_encoder.fit_transform(df_encoded[col])
    
    return df_encoded

In [5]:
df_encoded = label_encode(df)
df_encoded.head()

Unnamed: 0.1,Unnamed: 0,Company name,Year,Quarter,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,...,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
0,176,0,2013,1,0.161,11.5,14.01,0.113,0.856,0.2384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.25
1,177,0,2013,2,0.156,12.0,14.56,0.114,0.85,0.2622,...,1.11,0.0,0.7,0.0,0.0567,0.0,0.62,0.0,0.0,2013.5
2,178,0,2013,3,0.096,12.5,15.1,0.1249,0.854,0.3065,...,1.1,1.11,0.678,0.7,0.062,0.0567,0.75,0.62,0.0,2013.75
3,179,0,2013,4,0.165,13.0,15.0,0.1302,0.856,0.2345,...,1.14,1.1,0.69,0.678,0.082,0.062,2.07,0.75,0.1445,2014.0
4,180,0,2014,1,0.181,13.82,14.5,0.135,0.815,0.0769,...,1.08,1.14,0.74,0.69,0.0876,0.082,1.11,2.07,0.1495,2014.25


In [6]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Year'] <= year_split]
    test_data = df[df['Year'] > year_split]
    
    return train_data, test_data

In [12]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.svm import SVR
from sklearn.metrics import r2_score

def train_svr(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    """
    Train a Random Forest model using GridSearchCV with time-series cross-validation.
    
    Parameters:
    - train_data: DataFrame containing the training data.
    - target_column: The name of the target variable column (dependent variable).
    - feature_columns: List of column names to be used as features (independent variables).
    - n_splits: Number of splits for time-series cross-validation (default is 5).
    - scoring: The scoring metric for GridSearchCV (default is RMSE).
    
    Returns:
    - best_model: The best trained model after hyperparameter tuning.
    - best_params: The best set of hyperparameters found.
    - r2_train: R² score on the training set.
    """
    param_grid = {
    'kernel': ['rbf', 'linear'],  # Start with RBF and Linear
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1]
    }
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    svr = SVR(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Calculate R² on the training data
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

In [8]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    """
    Evaluate the model using RMSE and R² on the test data.
    
    Parameters:
    - model: The trained model to be evaluated.
    - test_data: DataFrame containing the test data.
    - target_column: The name of the target variable column.
    - feature_columns: List of column names to be used as features.
    
    Returns:
    - rmse: Root Mean Squared Error (RMSE) on the test set.
    - r2: R² score on the test set.
    """
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Calculate R² score
    # r2 = r2_score(y_test, y_pred)
    
    return rmse

In [9]:
train_data, test_data = split_data(df_encoded)
test_data

Unnamed: 0.1,Unnamed: 0,Company name,Year,Quarter,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,...,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
40,216,0,2023,1,0.1,14.0,15.4,15.0,0.8,0.375,...,1.35,1.36,0.67,0.67,0.0309,0.0295,6.0,5.6,0.039625,2023.25
41,217,0,2023,2,-0.045,15.5,16.8,14.2,0.86,0.381,...,1.34,1.35,0.68,0.67,0.0323,0.0309,6.4,6.0,0.0387,2023.5
42,218,0,2023,3,0.0654,16.0,17.3,14.0,0.875,0.3864,...,1.33,1.34,0.68,0.68,0.0337,0.0323,6.8,6.4,0.0401,2023.75
43,219,0,2023,4,0.0435,15.8,17.0,14.3,0.855,0.3913,...,1.33,1.33,0.69,0.68,0.0349,0.0337,7.2,6.8,0.040975,2024.0
84,392,1,2023,1,0.0408,8.8,13.25,11.25,0.8575,0.0408,...,1.33,1.35,1.12,1.15,0.012,0.013,0.8,0.85,0.04285,2023.25
85,393,1,2023,2,0.0393,8.85,13.3,11.3,0.8592,0.0393,...,1.3,1.33,1.1,1.12,0.011,0.012,0.75,0.8,0.04125,2023.5
86,394,1,2023,3,0.0379,8.83,13.28,11.28,0.8584,0.0379,...,1.28,1.3,1.08,1.1,0.01,0.011,0.7,0.75,0.03985,2023.75
87,395,1,2023,4,0.0366,8.9,13.35,11.35,0.8578,3.66,...,1.25,1.28,1.05,1.08,0.009,0.01,0.65,0.7,0.03865,2024.0
128,40,2,2023,1,0.079,14.2,15.4,16.7,0.818,0.4,...,1.75,1.28,1.5,1.4,-0.02,0.05,-2.0,6.0,0.124475,2023.25
129,41,2,2023,2,0.0553,14.0,16.3,16.57,0.8,0.2403,...,0.42,1.75,-8.07,1.5,0.0275,-0.02,-1.43,-2.0,0.1008,2023.5


In [13]:
# Step 2: 
feature_columns = [col for col in df.columns if col != "TSR"]

# Step 3: Train the model with GridSearchCV
best_model, best_params, r2_train = train_svr(train_data, target_column="TSR", feature_columns=feature_columns)

# Step 4: Evaluate the model
rmse = evaluate_model(best_model, test_data, target_column="TSR", feature_columns=feature_columns)

# Output the results
print(f"\n Best Hyperparameters: {best_params}")
print(f"\n R2 on Train-set: {r2_train}")
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 729 candidates, totalling 3645 fits

 Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 300, 'subsample': 1.0}

 R2 on Train-set: 0.9999989338282202

 Root Mean Squared Error (RMSE) on Test Set: 0.059915422398156906




In [14]:
import joblib
def dump_model(best_model, model_path):
   joblib.dump(best_model, model_path)

model_path = "../model_folder/gbr.joblib"
dump_model(best_model, model_path)