In [None]:
import pandas as pd
import os

In [None]:
file_path = "../final_data"
df = pd.read_csv(file_path)

In [4]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Time'] <= year_split]
    test_data = df[df['Time'] > year_split]
    
    return train_data, test_data

In [5]:
train_data, test_data = split_data(df)
print(f"This is the train data shape: {train_data.shape}")
print(f"This is the test data shape: {test_data.shape}")

This is the train data shape: (324, 33)
This is the test data shape: (72, 33)


In [6]:
scaler_df = train_data.drop(['Company name', 'TSR', 'Time'], axis=1)

In [11]:
from sklearn.preprocessing import MinMaxScaler

def minmax_scaler(df):
    scaler = MinMaxScaler()
    scaled_object = scaler.fit(df)
    scaled_array = scaled_object.transform(df)
    columns_name = df.columns
    scaled_df = pd.DataFrame(scaled_array, columns=columns_name)
    return scaled_df, scaled_object

In [12]:
from sklearn.preprocessing import StandardScaler

def standard_scaler(df):
    scaler = StandardScaler()
    scaled_object = scaler.fit(df)
    scaled_array = scaled_object.transform(df)
    columns_name = df.columns
    scaled_df = pd.DataFrame(scaled_array, columns=columns_name)
    return scaled_df, scaled_object

In [14]:
import joblib
def save_file(scaler, filename):
  joblib.dump(scaler, filename)
  return f"Saving {filename} successfully executed"

In [15]:
# save minmax scaler:
minmax_scaled_data, minmax_scaler_object = minmax_scaler(scaler_df)
min_max_path = "./model_building/scaler_folder/minmax_scaler.joblib"

# dumping file:
save_file(minmax_scaler_object, min_max_path)

'Saving ./model_building/scaler_folder/minmax_scaler.joblib successfully executed'

In [17]:
# save minmax scaler:
standard_scaled_data, standard_scaler_object = minmax_scaler(scaler_df)
standard_path = "./model_building/scaler_folder/minmax_scaler.joblib"

# dumping file:
save_file(standard_scaler_object, standard_path)

'Saving ./model_building/scaler_folder/minmax_scaler.joblib successfully executed'

In [19]:
print(f"minmax_scaled_data's shape: {minmax_scaled_data.shape}")
print(f"standard_scaled_data's shape: {standard_scaled_data.shape}")

minmax_scaled_data's shape: (324, 30)
standard_scaled_data's shape: (324, 30)
