In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
file_path = "./final_data_to_train_dss.csv"
with open(file_path, "r") as data:
    df = pd.read_csv(data)
    df = df.drop("Unnamed: 0", axis="columns")
df.shape

(396, 33)

In [3]:
df = df.sort_values(by=["Company name", "time"])
df.head()

Unnamed: 0,Company name,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,Quick_ratio,D/E,ROA,...,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
0,0,0.161,11.5,14.01,0.113,0.856,0.2384,1.11,0.7,0.0567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.25
1,0,0.156,12.0,14.56,0.114,0.85,0.2622,1.1,0.678,0.062,...,1.11,0.0,0.7,0.0,0.0567,0.0,0.62,0.0,0.0,2013.5
2,0,0.096,12.5,15.1,0.1249,0.854,0.3065,1.14,0.69,0.082,...,1.1,1.11,0.678,0.7,0.062,0.0567,0.75,0.62,0.0,2013.75
3,0,0.165,13.0,15.0,0.1302,0.856,0.2345,1.08,0.74,0.0876,...,1.14,1.1,0.69,0.678,0.082,0.062,2.07,0.75,0.1445,2014.0
4,0,0.181,13.82,14.5,0.135,0.815,0.0769,1.07,0.74,0.0958,...,1.08,1.14,0.74,0.69,0.0876,0.082,1.11,2.07,0.1495,2014.25


In [24]:
df.columns

Index(['Company name', 'TSR', 'PRASM', 'RASM', 'CASM', 'Load_factor',
       'Gross_profit_margin', 'Quick_ratio', 'D/E', 'ROA', 'EPS', 'TSR_lag1',
       'TSR_lag2', 'PRASM_lag1', 'PRASM_lag2', 'RASM_lag1', 'RASM_lag2',
       'CASM_lag1', 'CASM_lag2', 'Load_factor_lag1', 'Load_factor_lag2',
       'Gross_profit_margin_lag1', 'Gross_profit_margin_lag2',
       'Quick_ratio_lag1', 'Quick_ratio_lag2', 'D/E_lag1', 'D/E_lag2',
       'ROA_lag1', 'ROA_lag2', 'EPS_lag1', 'EPS_lag2', 'TSR_rolling_mean',
       'time'],
      dtype='object')

In [13]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['time'] <= year_split]
    test_data = df[df['time'] > year_split]
    
    return train_data, test_data

In [14]:
train_data, test_data = split_data(df, year_split=2022)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# using minmax_scaler since to keep the most of the pattern in the dataframe
def scaling_data(df):
    scaler=MinMaxScaler()
    scaler_object=scaler.fit(df)
    return scaler_object.transform(df), scaler_object

In [16]:
minmax_scaled, scaler_object = scaling_data(df)

In [17]:
minmax_scaled

array([[0.        , 0.86359191, 0.27770617, ..., 0.52867689, 0.85635166,
        0.        ],
       [0.        , 0.86350759, 0.289866  , ..., 0.52867689, 0.85635166,
        0.02325581],
       [0.        , 0.86249578, 0.30202583, ..., 0.56388416, 0.85635166,
        0.04651163],
       ...,
       [1.        , 0.8608769 , 0.87596975, ..., 0.8733674 , 0.85527629,
        0.95348837],
       [1.        , 0.8608769 , 0.91731316, ..., 0.49176604, 0.85799416,
        0.97674419],
       [1.        , 0.8608769 , 1.        , ..., 0.73367405, 0.85635166,
        1.        ]])

In [19]:
scaled_df = pd.DataFrame(minmax_scaled, columns=df.columns)

In [20]:
scaled_df.head()

Unnamed: 0,Company name,TSR,PRASM,RASM,CASM,Load_factor,Gross_profit_margin,Quick_ratio,D/E,ROA,...,Quick_ratio_lag1,Quick_ratio_lag2,D/E_lag1,D/E_lag2,ROA_lag1,ROA_lag2,EPS_lag1,EPS_lag2,TSR_rolling_mean,time
0,0.0,0.863592,0.277706,0.237569,0.000459,0.00819,0.363186,0.192042,0.570073,0.697485,...,0.0,0.0,0.530988,0.530988,0.478143,0.478143,0.528677,0.528677,0.856352,0.0
1,0.0,0.863508,0.289866,0.246988,0.00049,0.008115,0.367616,0.190311,0.568844,0.717988,...,0.192042,0.0,0.570073,0.530988,0.697485,0.478143,0.563884,0.528677,0.856352,0.023256
2,0.0,0.862496,0.302026,0.256236,0.000823,0.008165,0.375861,0.197232,0.569514,0.795358,...,0.190311,0.192042,0.568844,0.570073,0.717988,0.697485,0.571266,0.563884,0.856352,0.046512
3,0.0,0.863659,0.314186,0.254524,0.000985,0.00819,0.36246,0.186851,0.572306,0.817021,...,0.197232,0.190311,0.569514,0.568844,0.795358,0.717988,0.646224,0.571266,0.866079,0.069767
4,0.0,0.863929,0.334128,0.245961,0.001132,0.007678,0.333129,0.185121,0.572306,0.848743,...,0.186851,0.197232,0.572306,0.569514,0.817021,0.795358,0.591709,0.646224,0.866415,0.093023
