In [None]:
import pandas as pd
import os

In [2]:
os.chdir("..") # go to the parent folder
file_path = "./final_data"
df = pd.read_csv("./final_data.csv")

In [4]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Time'] <= year_split]
    test_data = df[df['Time'] > year_split]
    
    return train_data, test_data

In [5]:
train_data, test_data = split_data(df)
print(f"This is the train data shape: {train_data.shape}")
print(f"This is the test data shape: {test_data.shape}")

This is the train data shape: (324, 33)
This is the test data shape: (72, 33)


In [6]:
scaler_df = train_data.drop(['Company name', 'TSR', 'Time'], axis=1)

In [11]:
from sklearn.preprocessing import MinMaxScaler

def minmax_scaler(df):
    scaler = MinMaxScaler()
    scaled_object = scaler.fit(df)
    scaled_array = scaled_object.transform(df)
    columns_name = df.columns
    scaled_df = pd.DataFrame(scaled_array, columns=columns_name)
    return scaled_df, scaled_object

In [12]:
from sklearn.preprocessing import StandardScaler

def standard_scaler(df):
    scaler = StandardScaler()
    scaled_object = scaler.fit(df)
    scaled_array = scaled_object.transform(df)
    columns_name = df.columns
    scaled_df = pd.DataFrame(scaled_array, columns=columns_name)
    return scaled_df, scaled_object

In [14]:
import joblib
def save_file(scaler, filename):
  joblib.dump(scaler, filename)
  return f"Saving {filename} successfully executed"

In [15]:
# save minmax scaler:
minmax_scaled_data, minmax_scaler_object = minmax_scaler(scaler_df)
min_max_path = "./model_building/scaler_folder/minmax_scaler.joblib"

# dumping file:
save_file(minmax_scaler_object, min_max_path)

'Saving ./model_building/scaler_folder/minmax_scaler.joblib successfully executed'

In [16]:
minmax_scaled_data

Unnamed: 0,Engineered_PRASM,Engineered_RASM,Engineered_CASM,Engineered_Load_factor,Engineered_Gross_profit_margin,Engineered_Quick_ratio,Engineered_D/E,Engineered_ROA,Engineered_EPS,TSR_lag1,...,Engineered_Gross_profit_margin_lag2,Engineered_Quick_ratio_lag1,Engineered_Quick_ratio_lag2,Engineered_D/E_lag1,Engineered_D/E_lag2,Engineered_ROA_lag1,Engineered_ROA_lag2,Engineered_EPS_lag1,Engineered_EPS_lag2,TSR_rolling_mean
0,0.322983,0.235652,0.000452,0.008172,0.859217,0.191851,0.569805,0.697455,0.564614,0.860877,...,0.754579,0.000000,0.000000,0.530865,0.530865,0.478699,0.478699,0.529480,0.529480,0.856352
1,0.337168,0.245119,0.000483,0.008098,0.869678,0.190146,0.568586,0.717933,0.571986,0.863592,...,0.754579,0.191851,0.000000,0.569805,0.530865,0.697455,0.478699,0.564614,0.529480,0.856352
2,0.351356,0.254418,0.000816,0.008149,0.889141,0.197085,0.569258,0.795144,0.646812,0.863508,...,0.859217,0.190146,0.191851,0.568586,0.569805,0.717933,0.697455,0.571986,0.564614,0.856352
3,0.365548,0.252733,0.000978,0.008176,0.857544,0.186735,0.572045,0.816797,0.592405,0.862496,...,0.869678,0.197085,0.190146,0.569258,0.568586,0.795144,0.717933,0.646812,0.571986,0.866079
4,0.388800,0.244182,0.001125,0.007665,0.788349,0.185029,0.572050,0.848491,0.602051,0.863659,...,0.889141,0.186735,0.197085,0.572045,0.569258,0.816797,0.795144,0.592405,0.646812,0.866415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,0.241608,0.176123,0.538815,0.004457,0.576616,0.189119,0.763728,0.000000,0.165977,0.863470,...,0.000000,0.170013,0.095404,0.749182,0.655363,0.161155,0.357505,0.065346,0.000000,0.847861
320,0.298718,0.310505,0.431977,0.005009,0.824160,0.194349,0.719636,0.393483,0.245013,0.862002,...,0.406936,0.189119,0.170013,0.763728,0.749182,0.000000,0.161155,0.165977,0.065346,0.858469
321,0.182309,0.195118,0.447349,0.005636,0.890805,0.258585,0.719101,0.452357,0.441853,0.865182,...,0.576616,0.194349,0.189119,0.719636,0.763728,0.393483,0.000000,0.245013,0.165977,0.865520
322,0.273228,0.245109,0.432085,0.009644,0.895847,0.244731,0.716331,0.369828,0.163566,0.862040,...,0.824160,0.258585,0.194349,0.719101,0.719636,0.452357,0.393483,0.441853,0.245013,0.860217


In [17]:
# save minmax scaler:
standard_scaled_data, standard_scaler_object = minmax_scaler(scaler_df)
standard_path = "./model_building/scaler_folder/minmax_scaler.joblib"

# dumping file:
save_file(standard_scaler_object, standard_path)

'Saving ./model_building/scaler_folder/minmax_scaler.joblib successfully executed'