In [25]:
import os

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

def one_hot_encode_columns(df, columns):
    """One-hot encode the given columns and report where the new columns landed.

    Each listed column is expanded with ``pd.get_dummies`` (prefixed with the
    source column label), the dummy columns are appended to the right of the
    frame in the order the source columns were listed, and the source columns
    are removed.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        columns: Iterable of column labels to encode. A label that appears
            more than once is encoded only once.

    Returns:
        tuple: ``(encoded_df, new_column_indices)``. ``new_column_indices`` is
        the list of integer positions in ``encoded_df`` of every column whose
        label was not present in ``df``, in ascending order.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
    """
    # De-duplicate while keeping the caller's order.
    columns = list(dict.fromkeys(columns))
    original_columns = set(df.columns)

    encoded_frames = [pd.get_dummies(df[column], prefix=column) for column in columns]

    # One concat for all dummy frames instead of re-copying the frame per column.
    df = pd.concat([df.drop(columns=columns), *encoded_frames], axis=1)

    # Walk the columns positionally so the result is ordered and reproducible
    # (iterating a set difference would yield an arbitrary order).
    new_column_indices = [
        position
        for position, label in enumerate(df.columns)
        if label not in original_columns
    ]

    return df, new_column_indices

def scale_columns(df, scaler, columns):
    """Fit ``scaler`` on the selected columns and write the result back.

    Args:
        df: DataFrame holding the columns to scale. It is updated in place.
        scaler: Object exposing ``fit_transform``; it is fitted on
            ``df[columns]``.
        columns: Labels of the columns to scale.

    Returns:
        The same ``df`` object, with ``columns`` replaced by the scaled values.
    """
    scaled_values = scaler.fit_transform(df[columns])
    df[columns] = scaled_values
    return df

def most_common_ratio(series):
    """Return the fraction of non-missing values taken by the most common value.

    Args:
        series: pandas Series to inspect. Missing values are ignored, because
            ``value_counts`` drops them by default.

    Returns:
        The relative frequency of the most frequent value, or ``0.0`` when the
        series has no non-missing values (empty or all-NaN).
    """
    ratios = series.value_counts(normalize=True)
    # value_counts yields nothing for an empty / all-NaN series; indexing
    # position 0 would raise IndexError in that case.
    if ratios.empty:
        return 0.0
    # value_counts sorts by frequency, descending, so the first entry is the mode.
    return ratios.iloc[0]


def feature_engineering_C_part1(df, target_columns):
    """Prepare the C part 1 table: encode, coerce, scale and prune the features.

    The target columns are set aside untouched. The remaining columns are
    one-hot encoded (fixed list of categorical columns), every non-numeric
    column is coerced to numeric (unparseable values become NaN), a fixed list
    of columns is min-max scaled, and columns whose variance equals zero are
    removed. The targets are then appended back on the right.

    Args:
        df: Input DataFrame containing both the features and the targets.
        target_columns: Labels of the target columns to pass through unchanged.

    Returns:
        DataFrame with the processed feature columns followed by the targets.
    """
    categorical_columns = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system", 'Story', 'Direction', 'Wall']
    numeric_columns = ['L cm', 'xi cm', 'yi cm', 'D+0.25L', 'Story Area']

    # Keep the targets out of every transformation below.
    targets = df[target_columns]
    features = df.drop(columns=target_columns)

    # The positions of the dummy columns are not needed here.
    features, _ = one_hot_encode_columns(features, categorical_columns)

    # Coerce whatever is still non-numeric; values that cannot be parsed become NaN.
    non_numeric = [name for name in features.columns if not is_numeric_dtype(features[name])]
    for name in non_numeric:
        features[name] = pd.to_numeric(features[name], errors='coerce')

    features = scale_columns(features, MinMaxScaler(), numeric_columns)

    # Keep only the columns whose variance is not exactly zero.
    variances = features.var()
    features = features.loc[:, variances != 0]

    return pd.concat([features, targets], axis=1)

def _engineer_wide_features(df, target_columns, columns_to_one_hot_encode, threshold=0.90):
    """Shared feature pipeline behind ``feature_engineering_C_part2`` and ``feature_engineering_D``.

    Steps, in order:
      1. Set the target columns aside (they are returned untouched).
      2. One-hot encode ``columns_to_one_hot_encode``.
      3. Coerce non-numeric columns to numeric (unparseable values become NaN).
      4. Drop columns that contain no values at all.
      5. Replace the remaining NaN with 0.
      6. Min-max scale every column except the one-hot columns.
      7. Drop columns whose variance equals zero.
      8. Drop columns whose most common value covers more than ``threshold``
         of the rows.
      9. Append the targets back on the right.

    Args:
        df: Input DataFrame containing both the features and the targets.
        target_columns: Labels of the target columns.
        columns_to_one_hot_encode: Labels of the categorical columns.
        threshold: Maximum tolerated share of the most common value.

    Returns:
        DataFrame with the processed features followed by the targets, on the
        same index as ``df``.
    """
    target_data = df[target_columns]
    df = df.drop(columns=target_columns)

    # One-hot encode specified columns
    df, new_column_indices = one_hot_encode_columns(df, columns_to_one_hot_encode)
    # Remember the dummy columns by label: positions shift as soon as columns
    # are dropped below, and the pre-encoding labels no longer exist.
    one_hot_columns = {df.columns[position] for position in new_column_indices}

    # Coerce before imputing so that values which fail to parse are zero-filled
    # as well instead of resurfacing as NaN afterwards.
    for col in df.columns:
        if not is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop columns without a single value. Doing this after coercion keeps
    # all-NaN columns away from the imputer, whose output width must match the
    # column labels reused below.
    df = df.loc[:, df.notna().any()]

    # Hand the imputer a homogeneous float frame (bool/int dummies become 0.0/1.0).
    df = df.astype(float)

    # Replace missing values (NaN) with zero
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    imputed = imputer.fit_transform(df)
    # Reuse the original index so the final concat lines up with the targets.
    df = pd.DataFrame(imputed, columns=df.columns, index=df.index)

    # Scale everything except the dummy columns, in column order.
    columns_to_scale = [col for col in df.columns if col not in one_hot_columns]
    if columns_to_scale:
        scaler = MinMaxScaler()
        df = scale_columns(df, scaler, columns_to_scale)

    # Removes columns with zero variance
    df = df.loc[:, df.var() != 0]

    # Drop columns dominated by a single value
    cols_to_drop = [col for col in df.columns if most_common_ratio(df[col]) > threshold]
    df_dropped = df.drop(columns=cols_to_drop)

    return pd.concat([df_dropped, target_data], axis=1)


def feature_engineering_C_part2(df, target_columns):
    """Prepare the C part 2 table.

    One-hot encodes the categorical columns listed below, zero-fills missing
    values, min-max scales the non-dummy columns, and removes columns that are
    empty, have zero variance, or are dominated (> 90 %) by a single value.

    Args:
        df: Input DataFrame containing both the features and the targets.
        target_columns: Labels of the target columns to pass through unchanged.

    Returns:
        DataFrame with the processed features followed by the targets.
    """
    columns_to_one_hot_encode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]
    return _engineer_wide_features(df, target_columns, columns_to_one_hot_encode)


def feature_engineering_D(df, target_columns):
    """Prepare the D table.

    One-hot encodes the categorical columns listed below, zero-fills missing
    values, min-max scales the non-dummy columns, and removes columns that are
    empty, have zero variance, or are dominated (> 90 %) by a single value.

    Args:
        df: Input DataFrame containing both the features and the targets.
        target_columns: Labels of the target columns to pass through unchanged.

    Returns:
        DataFrame with the processed features followed by the targets.
    """
    columns_to_one_hot_encode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]
    return _engineer_wide_features(df, target_columns, columns_to_one_hot_encode)

# Input and output locations
path ='./Files/Before_Feature_Engineering'
path_FE ='Files/After_Feature_Engineering/'
# to_csv does not create missing directories; make sure the output folder
# exists so the cell also runs on a fresh checkout.
os.makedirs(path_FE, exist_ok=True)

# Feature engineering for C part 1
file_path_C_part1 = path + '/data_C_part1.csv'
df_C_part1 = pd.read_csv(file_path_C_part1, low_memory=False)
target_column_C = ["Nail spacing [cm]", "Number sheathing panels", "Number end studs", "Total number studs","HoldDown Model / ATS"]
df_C_part1 = feature_engineering_C_part1(df_C_part1, target_column_C) 

prepared_file_path_C_part1 = path_FE + 'data_C_part1_FE.csv'
df_C_part1.to_csv(prepared_file_path_C_part1, index=False)


# Feature engineering for C part 2
file_path_C_part2 = path +'/data_C_part2.csv'
df_C_part2 = pd.read_csv(file_path_C_part2, low_memory=False)
target_column_C_2 = ['Tx(s)', 'Ty(s)']
df_C_part2 = feature_engineering_C_part2(df_C_part2, target_column_C_2) 

prepared_file_path_C_part2 = path_FE + 'data_C_part2_FE.csv'
df_C_part2.to_csv(prepared_file_path_C_part2, index=False)

# Feature engineering for D
file_path_D = path +'/data_D.csv'
df_D = pd.read_csv(file_path_D, low_memory=False)
target_column_D = ['Ωx', 'Ωy', 'µx', 'µy', 'CMR', 'SSF', 'ACMR', 'IO-ln θ','IO-β','LS-ln θ','LS-β', 'CP-ln θ','CP-β']
df_D = feature_engineering_D(df_D, target_column_D) 

prepared_file_path_D = path_FE + 'data_D_FE.csv'
df_D.to_csv(prepared_file_path_D, index=False)

In [26]:
# Show the feature-engineered C part 1 frame (a bare last expression is rendered as the cell output).
df_C_part1

Unnamed: 0,L cm,xi cm,yi cm,D+0.25L,Story Area,architectural_archetype_C,architectural_archetype_D,architectural_archetype_P,architectural_archetype_Q,stories_3,...,Wall_M.2,Wall_N.1,Wall_N.2,Wall_O.1,Wall_O.2,Nail spacing [cm],Number sheathing panels,Number end studs,Total number studs,HoldDown Model / ATS
0,0.220913,0.098854,0.001517,0.798875,0.861679,0,0,1,0,0,...,0,0,0,0,0,5,2,3,16,2.8575
1,0.220913,0.342271,0.001517,0.798875,0.861679,0,0,1,0,0,...,0,0,0,0,0,5,2,3,16,2.8575
2,0.042710,0.201116,0.047037,0.798875,0.861679,0,0,1,0,0,...,0,0,0,0,0,5,2,2,10,3.1750
3,0.042710,0.240237,0.047037,0.798875,0.861679,0,0,1,0,0,...,0,0,0,0,0,5,2,2,10,3.1750
4,0.624448,0.060648,0.207283,0.798875,0.861679,0,0,1,0,0,...,0,0,0,0,0,5,2,3,23,2.8575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62258,0.341679,0.397634,0.070723,0.504677,0.662594,0,0,0,1,0,...,0,0,0,0,0,15,1,2,12,1.0000
62259,0.245950,0.397634,0.215055,0.504677,0.662594,0,0,0,1,0,...,0,0,0,0,0,15,1,2,10,1.0000
62260,0.110457,0.397634,0.329781,0.504677,0.662594,0,0,0,1,0,...,0,0,0,0,0,15,1,2,8,1.0000
62261,0.110457,0.397634,0.587728,0.504677,0.662594,0,0,0,1,0,...,0,0,0,0,0,15,1,2,8,1.0000


In [27]:
# Show the feature-engineered C part 2 frame (a bare last expression is rendered as the cell output).
df_C_part2

Unnamed: 0,L cm_1_X_1.1,xi cm_1_X_1.1,yi cm_1_X_1.1,L cm_1_X_1.2,xi cm_1_X_1.2,yi cm_1_X_1.2,L cm_1_X_2.1,xi cm_1_X_2.1,yi cm_1_X_2.1,L cm_1_X_2.2,...,soil_class_A,soil_class_B,soil_class_C,soil_class_D,seismic_zone_1,seismic_zone_3,connection_system_ATS,connection_system_HD,Tx(s),Ty(s)
0,1.000000,1.000000,1.0,1.000000,0.724683,1.0,0.96129,0.987473,1.000000,0.532143,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.447566,0.454873
1,1.000000,1.000000,1.0,1.000000,0.724683,1.0,0.96129,0.987473,1.000000,0.532143,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.576592,0.627639
2,1.000000,1.000000,1.0,1.000000,0.724683,1.0,0.96129,0.987473,1.000000,0.532143,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.615841,0.664243
3,1.000000,1.000000,1.0,1.000000,0.724683,1.0,0.96129,0.987473,1.000000,0.532143,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.724320,0.728796
4,1.000000,1.000000,1.0,1.000000,0.724683,1.0,0.96129,0.987473,1.000000,0.532143,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.520351,0.522703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.444444,0.505695,1.0,0.633333,0.226614,1.0,1.00000,0.970755,0.992537,0.553571,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.876501,0.853845
196,0.444444,0.505695,1.0,0.633333,0.226614,1.0,1.00000,0.970755,0.992537,0.553571,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.836157,0.849928
197,0.444444,0.505695,1.0,0.633333,0.226614,1.0,1.00000,0.970755,0.992537,0.553571,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.881494,0.862129
198,0.444444,0.505695,1.0,0.633333,0.226614,1.0,1.00000,0.970755,0.992537,0.553571,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.862624,0.855580


In [28]:
# Show the feature-engineered D frame (a bare last expression is rendered as the cell output).
df_D

Unnamed: 0,L cm_1_X_1.1,xi cm_1_X_1.1,yi cm_1_X_1.1,Nail spacing [cm]_1_X_1.1,Number sheathing panels_1_X_1.1,Number end studs_1_X_1.1,Total number studs_1_X_1.1,HoldDown Model / ATS _1_X_1.1,L cm_1_X_1.2,xi cm_1_X_1.2,...,µy,CMR,SSF,ACMR,IO-ln θ,IO-β,LS-ln θ,LS-β,CP-ln θ,CP-β
0,1.000000,1.000000,1.0,0.333333,1.0,0.500000,0.888889,0.109904,1.000000,0.724683,...,2.82,2.03,1.23,3.00,-0.078,0.379,0.533,0.453,0.887,0.487
1,1.000000,1.000000,1.0,0.333333,0.5,0.333333,0.611111,0.097692,1.000000,0.724683,...,4.23,1.78,1.30,2.77,-0.698,0.250,-0.064,0.270,0.274,0.288
2,1.000000,1.000000,1.0,0.333333,1.0,0.333333,0.611111,0.085481,1.000000,0.724683,...,4.32,3.02,1.32,4.77,-0.664,0.262,-0.025,0.275,0.311,0.297
3,1.000000,1.000000,1.0,0.333333,0.5,0.333333,0.611111,0.073269,1.000000,0.724683,...,4.16,2.53,1.37,4.14,-1.016,0.352,-0.397,0.346,-0.046,0.363
4,1.000000,1.000000,1.0,0.333333,1.0,0.500000,0.833333,0.146538,1.000000,0.724683,...,4.35,2.30,1.13,3.13,-0.370,0.318,0.290,0.340,0.647,0.369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.444444,0.505695,1.0,1.000000,0.5,0.666667,0.555556,0.500000,0.633333,0.226614,...,4.27,1.74,1.26,2.63,-1.389,0.370,-0.808,0.362,-0.487,0.379
196,0.444444,0.505695,1.0,0.333333,1.0,0.666667,0.555556,0.692308,0.633333,0.226614,...,4.77,3.19,1.27,4.88,-1.248,0.317,-0.673,0.324,-0.363,0.358
197,0.444444,0.505695,1.0,1.000000,0.5,0.666667,0.555556,0.500000,0.633333,0.226614,...,4.28,2.85,1.26,4.31,-1.433,0.401,-0.838,0.396,-0.525,0.389
198,0.444444,0.505695,1.0,0.333333,1.0,0.666667,0.555556,0.500000,0.633333,0.226614,...,4.26,3.89,1.26,5.87,-1.371,0.356,-0.793,0.342,-0.473,0.353
