In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from pandas.api.types import is_numeric_dtype

def one_hot_encode_columns(df, columns):
    """One-hot encode the given columns and report where the new columns are.

    Each column listed in ``columns`` is expanded with ``pd.get_dummies``
    (using the column name as prefix), the dummy columns are appended on the
    right of the frame, and the source column is dropped. ``df`` is rebound
    to a new frame at every step, so the caller's DataFrame is not modified.

    Args:
        df: Input DataFrame.
        columns: Iterable of column labels in ``df`` to encode.

    Returns:
        A tuple ``(encoded_df, new_column_indices)``. ``new_column_indices``
        holds the positional indices, in ascending order, of every column in
        ``encoded_df`` whose label was not present in the input frame.
    """
    original_columns = set(df.columns)

    for column in columns:
        # Expand the current column into its dummy columns.
        encoded = pd.get_dummies(df[column], prefix=column)

        # Append the dummies and remove the column they were derived from.
        df = pd.concat([df, encoded], axis=1).drop(column, axis=1)

    # Walk the final column order instead of iterating a set difference, so
    # the returned indices are deterministic from run to run.
    new_column_indices = [
        position
        for position, label in enumerate(df.columns)
        if label not in original_columns
    ]

    return df, new_column_indices

def scale_columns(df, scaler, columns):
    """Fit ``scaler`` on the selected columns and overwrite them with the result.

    Args:
        df: DataFrame whose columns are scaled. It is modified in place.
        scaler: Object exposing ``fit_transform``; it is fitted on
            ``df[columns]`` by this call.
        columns: Column labels to scale. An empty selection is a no-op.

    Returns:
        The same ``df`` object, with ``columns`` replaced by their scaled values.
    """
    # Nothing to scale: skip the scaler entirely rather than handing it a
    # frame with zero features.
    if len(columns) == 0:
        return df

    df[columns] = scaler.fit_transform(df[columns])
    return df

def most_common_ratio(series):
    """Return the share of non-null entries taken by the most frequent value.

    Args:
        series: pandas Series to inspect.

    Returns:
        The relative frequency of the most common value among the non-null
        entries, or ``0.0`` when the series has no non-null entries (empty or
        all-NaN), since no value can be called "most common" then.
    """
    # value_counts sorts by frequency (descending) and ignores NaN by default.
    counts = series.value_counts(normalize=True)
    if counts.empty:
        return 0.0
    return counts.iloc[0]


def feature_engineering_C_part1(df, target_columns):
    """Prepare the "C part 1" feature table.

    The target columns are set aside untouched while the remaining columns
    are one-hot encoded, coerced to numeric, min-max scaled and filtered for
    zero variance; the targets are then appended back on the right.

    Args:
        df: Raw DataFrame containing both features and targets.
        target_columns: Labels of the target columns to carry through as-is.

    Returns:
        DataFrame of processed feature columns followed by the target columns.
    """
    categorical_columns = [
        "architectural_archetype", "stories", "soil_class", "seismic_zone",
        "connection_system", 'Story', 'Direction', 'Wall',
    ]
    numeric_columns_to_scale = ['L cm', 'xi cm', 'yi cm', 'D+0.25L', 'Story Area']

    # Split targets from features so the targets bypass every transformation.
    targets = df[target_columns]
    features = df.drop(columns=target_columns)

    # Expand the categorical columns; the returned indices are not needed here.
    features, _ = one_hot_encode_columns(features, categorical_columns)

    # Anything still non-numeric is coerced; unparseable entries become NaN.
    non_numeric = [
        name for name in features.columns if not is_numeric_dtype(features[name])
    ]
    for name in non_numeric:
        features[name] = pd.to_numeric(features[name], errors='coerce')

    # Min-max scale the continuous measurement columns.
    features = scale_columns(features, MinMaxScaler(), numeric_columns_to_scale)

    # Keep only columns whose variance is not exactly zero.
    variances = features.var()
    features = features.loc[:, variances != 0]

    return pd.concat([features, targets], axis=1)

def feature_engineering_C_part2(df, target_columns):
    """Prepare the "C part 2" feature table.

    Steps, applied to every column except the targets:
      1. One-hot encode the categorical columns.
      2. Add a ``<col>_present`` 0/1 indicator for each column containing NaN.
      3. Impute: columns with more than 40% NaN are filled with 0; other
         numeric columns are filled with their median.
      4. Coerce any remaining non-numeric column to numeric (failures -> NaN).
      5. Min-max scale all feature columns.
      6. Drop columns whose variance is exactly zero.
      7. Drop each column whose absolute correlation with any column to its
         left exceeds 0.90.
      8. Drop columns for which ``most_common_ratio`` exceeds 0.95.
    The untouched target columns are appended on the right at the end.

    Args:
        df: Raw DataFrame containing both features and targets.
        target_columns: Labels of the target columns to carry through as-is.

    Returns:
        DataFrame of processed feature columns followed by the target columns.
    """
    # NOTE(review): this pipeline duplicates feature_engineering_D; consider
    # sharing a single implementation.
    columns_to_one_hot_encode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]

    target_data = df[target_columns]
    df = df.drop(columns=target_columns)

    # One-hot encode specified columns (the returned indices are not needed).
    df, _ = one_hot_encode_columns(df, columns_to_one_hot_encode)

    # Indicator variables only for columns that actually contain NaN values.
    indicator_dict = {
        f"{col}_present": df[col].notna().astype(int)
        for col in df.columns
        if df[col].isna().any()
    }
    indicator_df = pd.DataFrame(indicator_dict)
    df = pd.concat([df, indicator_df], axis=1)

    # Custom imputation. Assign the filled column back explicitly:
    # `df[col].fillna(..., inplace=True)` operates on a temporary object and
    # does not update `df` under pandas Copy-on-Write.
    for col in df.columns:
        if df[col].isna().mean() > 0.40:
            df[col] = df[col].fillna(0)
        elif is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())

    # Ensure all columns are numeric after imputation.
    for col in df.columns:
        if not is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Scale every remaining column; the original categorical labels are
    # excluded by name (they were already replaced by their dummy columns).
    non_scale_columns = set(columns_to_one_hot_encode)
    columns_to_scale = [col for col in df.columns if col not in non_scale_columns]

    scaler = MinMaxScaler()
    df = scale_columns(df, scaler, columns_to_scale)

    # Remove columns with zero variance.
    df = df.loc[:, df.var() != 0]

    # Remove highly correlated columns: keep only the strict upper triangle so
    # each pair is inspected once and the later column of a pair is dropped.
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]
    df = df.drop(columns=to_drop)

    # Drop columns dominated by a single value.
    threshold = 0.95
    cols_to_drop = [col for col in df.columns if most_common_ratio(df[col]) > threshold]
    df_dropped = df.drop(columns=cols_to_drop)

    df = pd.concat([df_dropped, target_data], axis=1)

    return df

def feature_engineering_D(df, target_columns):
    """Prepare the "D" feature table.

    Steps, applied to every column except the targets:
      1. One-hot encode the categorical columns.
      2. Add a ``<col>_present`` 0/1 indicator for each column containing NaN.
      3. Impute: columns with more than 40% NaN are filled with 0; other
         numeric columns are filled with their median.
      4. Coerce any remaining non-numeric column to numeric (failures -> NaN).
      5. Min-max scale all feature columns.
      6. Drop columns whose variance is exactly zero.
      7. Drop each column whose absolute correlation with any column to its
         left exceeds 0.90.
      8. Drop columns for which ``most_common_ratio`` exceeds 0.95.
    The untouched target columns are appended on the right at the end.

    Args:
        df: Raw DataFrame containing both features and targets.
        target_columns: Labels of the target columns to carry through as-is.

    Returns:
        DataFrame of processed feature columns followed by the target columns.
    """
    # NOTE(review): this pipeline duplicates feature_engineering_C_part2;
    # consider sharing a single implementation.
    columns_to_one_hot_encode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]

    target_data = df[target_columns]
    df = df.drop(columns=target_columns)

    # One-hot encode specified columns (the returned indices are not needed).
    df, _ = one_hot_encode_columns(df, columns_to_one_hot_encode)

    # Indicator variables only for columns that actually contain NaN values.
    indicator_dict = {
        f"{col}_present": df[col].notna().astype(int)
        for col in df.columns
        if df[col].isna().any()
    }
    indicator_df = pd.DataFrame(indicator_dict)
    df = pd.concat([df, indicator_df], axis=1)

    # Custom imputation. Assign the filled column back explicitly:
    # `df[col].fillna(..., inplace=True)` operates on a temporary object and
    # does not update `df` under pandas Copy-on-Write.
    for col in df.columns:
        if df[col].isna().mean() > 0.40:
            df[col] = df[col].fillna(0)
        elif is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())

    # Ensure all columns are numeric after imputation.
    for col in df.columns:
        if not is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Scale every remaining column; the original categorical labels are
    # excluded by name (they were already replaced by their dummy columns).
    non_scale_columns = set(columns_to_one_hot_encode)
    columns_to_scale = [col for col in df.columns if col not in non_scale_columns]

    scaler = MinMaxScaler()
    df = scale_columns(df, scaler, columns_to_scale)

    # Remove columns with zero variance.
    df = df.loc[:, df.var() != 0]

    # Remove highly correlated columns: keep only the strict upper triangle so
    # each pair is inspected once and the later column of a pair is dropped.
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]
    df = df.drop(columns=to_drop)

    # Drop columns dominated by a single value.
    threshold = 0.95
    cols_to_drop = [col for col in df.columns if most_common_ratio(df[col]) > threshold]
    df_dropped = df.drop(columns=cols_to_drop)

    df = pd.concat([df_dropped, target_data], axis=1)

    return df

# --- Feature engineering for dataset C, part 1 ---
# Input CSVs are read from `path`; processed CSVs are written under `path_FE`.
# NOTE(review): nothing in this script creates the output directory —
# presumably it must already exist before running; confirm.
path ='./Files/Before_Feature_Engineering'
path_FE ='Files/After_Feature_Engineering/'
file_path_C_part1 = path + '/data_C_part1.csv'
df_C_part1 = pd.read_csv(file_path_C_part1, low_memory=False)
# Target columns are passed through feature engineering unchanged.
target_column_C = ["Nail spacing [cm]", "Number sheathing panels", "Number end studs", "Total number studs","HoldDown Model / ATS"]
df_C_part1 = feature_engineering_C_part1(df_C_part1, target_column_C) 

# Save the processed part-1 table without the index column.
prepared_file_path_C_part1 = path_FE + 'data_C_part1_FE.csv'
df_C_part1.to_csv(prepared_file_path_C_part1, index=False)


# --- Feature engineering for dataset C, part 2 ---
file_path_C_part2 = path +'/data_C_part2.csv'
df_C_part2 = pd.read_csv(file_path_C_part2, low_memory=False)
# Target columns are passed through feature engineering unchanged.
target_column_C_2 = ['Tx(s)', 'Ty(s)']
df_C_part2 = feature_engineering_C_part2(df_C_part2, target_column_C_2) 

# Save the processed part-2 table without the index column.
prepared_file_path_C_part2 = path_FE + 'data_C_part2_FE.csv'
df_C_part2.to_csv(prepared_file_path_C_part2, index=False)

# --- Feature engineering for dataset D ---
file_path_D = path +'/data_D.csv'
df_D = pd.read_csv(file_path_D, low_memory=False)
# Target columns are passed through feature engineering unchanged.
target_column_D = ['Ωx', 'Ωy', 'µx', 'µy', 'CMR', 'SSF', 'ACMR', 'IO-ln θ','IO-β','LS-ln θ','LS-β', 'CP-ln θ','CP-β']
df_D = feature_engineering_D(df_D, target_column_D) 

# Save the processed D table without the index column.
prepared_file_path_D = path_FE + 'data_D_FE.csv'
df_D.to_csv(prepared_file_path_D, index=False)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [None]:
# Notebook cell: evaluate the processed C part 1 DataFrame to display it.
df_C_part1

In [None]:
# Notebook cell: evaluate the processed C part 2 DataFrame to display it.
df_C_part2

In [None]:
# Notebook cell: evaluate the processed D DataFrame to display it.
df_D