In [None]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Path of the CSV to load. The "{filename}" part is a literal placeholder
# (this is a plain string, not an f-string).
# TO UPDATE after feature engineering is complete.
filepath: str = "Data/{filename}.csv"

# Column names from the feature engineered dataset (left empty until it exists)
response_col_name: str = ""      # response column
cat_col_names: list[str] = []    # categorical columns
num_col_names: list[str] = []    # numerical columns

In [None]:
def preprocess_dataframe(
        df : pd.DataFrame,
        response_col_name : str,
        cat_col_names : list[str] = [],
        num_col_names : list[str] = [],
        scale : bool = True,
        OHE : bool = True
        ) -> pd.DataFrame:
    '''Scale the numerical columns and one-hot encode the categorical columns.

    Parameters
    ----------
    df : source dataframe; it is not modified.
    response_col_name : name of the response column, copied through unchanged.
    cat_col_names : categorical columns to one-hot encode (may be empty).
    num_col_names : numerical columns to standardise (may be empty).
    scale : if True, fit a StandardScaler on the numerical columns and apply it.
    OHE : if True, fit a OneHotEncoder(drop='first') on the categorical columns
        and replace them with the encoded columns.

    Returns
    -------
    A new dataframe with a fresh 0..n-1 index whose columns are, in order:
    the numerical columns, the categorical columns (encoded when OHE is True,
    otherwise as they were), and finally the response column.

    Notes
    -----
    The scaler and encoder are fitted on `df` itself and are not returned.
    Columns that are not transformed keep their original dtypes.
    '''

    # Work on copies of the name lists so the shared mutable defaults are
    # never aliased, and so tuples / Index objects are accepted as well.
    num_cols = list(num_col_names)
    cat_cols = list(cat_col_names)

    # Positional index, so every piece lines up row-by-row when concatenated.
    num_part = df[num_cols].reset_index(drop = True)
    cat_part = df[cat_cols].reset_index(drop = True)

    # Skip the transformers when there is nothing to transform: scikit-learn
    # estimators reject arrays with zero feature columns.
    if scale and num_cols:
        # Scale numerical features
        scaler = StandardScaler()
        num_part = pd.DataFrame(
            scaler.fit_transform(num_part.values),
            columns = num_cols
        )

    if OHE and cat_cols:
        # One-hot encode categorical features.
        # drop = 'first' to avoid the dummy variable trap.
        try:
            encoder = OneHotEncoder(sparse_output = False, drop = 'first')
        except TypeError:
            # Older scikit-learn releases only know the previous keyword name.
            encoder = OneHotEncoder(sparse = False, drop = 'first')
        encoded = encoder.fit_transform(cat_part.values)
        cat_part = pd.DataFrame(
            encoded,
            columns = list(encoder.get_feature_names_out(cat_cols))
        )

    # Combine the numerical and categorical parts, then append the response.
    preprocessed_df = pd.concat([num_part, cat_part], axis = 1)
    preprocessed_df[response_col_name] = df[response_col_name].values

    return preprocessed_df
    

In [None]:
def principal_component_analysis(
        df : pd.DataFrame,
        response_col_name : str,
        cat_col_names : list[str] = [],
        num_col_names : list[str] = [],
        var : float = 0.95, logs : bool = False
        ) -> pd.DataFrame:
    '''Replace the numerical columns of a dataframe with their principal components.

    Parameters
    ----------
    df : source dataframe; it is not modified.
    response_col_name : name of the response column, copied through unchanged.
    cat_col_names : columns carried over as they are (no PCA applied).
    num_col_names : columns fed to PCA. If empty, PCA is skipped.
    var : passed to PCA as n_components; intended as the proportion of
        variance to keep (default 95%).
    logs : if True, print the number of components, the explained variance
        ratios and the head of the resulting dataframe.

    Returns
    -------
    A new dataframe with a fresh 0..n-1 index whose columns are, in order:
    'PC1', 'PC2', ... (one per retained component), the categorical columns
    with their original dtypes, and finally the response column.

    Notes
    -----
    No scaling is done here. NOTE(review): the numerical columns are presumably
    expected to be standardised already (e.g. by preprocess_dataframe) - confirm
    with the caller.
    '''

    # Copies, so the shared mutable defaults are never aliased.
    num_cols = list(num_col_names)
    cat_cols = list(cat_col_names)

    parts = []
    pca = None

    if num_cols:
        pca = PCA(n_components = var) # Keep 'var' proportion of the variance : default 95%
        X_pca = pca.fit_transform(df[num_cols].values)
        pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
        parts.append(pd.DataFrame(X_pca, columns = pca_columns))

    if cat_cols:
        # Positional index so the rows line up with the PCA output; slicing the
        # frame (rather than going through .values) keeps each column's dtype.
        parts.append(df[cat_cols].reset_index(drop = True))

    parts.append(pd.Series(df[response_col_name].values, name = response_col_name))

    df_modified = pd.concat(parts, axis = 1)

    if logs:
        if pca is None:
            print("No numerical columns supplied; PCA skipped.")
        else:
            print(f"Number of components selected: {pca.n_components_}")
            print("Explained variance ratio for each component:", pca.explained_variance_ratio_)
            print("Cumulative explained variance:", pca.explained_variance_ratio_.cumsum())
        print("Final DataFrame with PCA applied to numeric columns:")
        print(df_modified.head())

    return df_modified