In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the prepared dataset. low_memory=False tells pandas to parse the file
# in a single pass instead of in chunks, which avoids mixed-type inference
# within a column.
# NOTE(review): the preview of the processed frame shows an 'Unnamed: 0'
# column, which suggests the CSV carries a saved index -- confirm whether it
# should be kept as a feature or read with index_col=0.
file_path = './prepared_data_all.csv'
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# One-hot encode the categorical columns. Expected levels, per the original notes:
#   architectural_archetype : C, D, P, Q
#   stories                 : 3, 4, 5, 6 (number of stories/floors)
#   soil_class              : A, B, C, D (riskier)
#   seismic_zone            : 1, 2, 3
#   connection_system       : HD (hold down), ATS (anchor tiedown system)
#   Story                   : the story where the wall is
#   Direction               : X, Y
#   Wall                    : levels not documented here -- TODO describe
column_to_oneHotEncode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system", 'Story', 'Direction', 'Wall']

# Remember the incoming column order so the indicator columns can be put back
# where their source column used to be.
original_columns = df.columns.tolist()

# One indicator frame per categorical column. dtype is pinned explicitly
# because the get_dummies default changed from uint8 to bool in pandas 2.0;
# without it the exported CSV would contain True/False instead of 0/1.
temp_dfs = [
    pd.get_dummies(df[column], prefix=column, dtype=np.uint8)
    for column in column_to_oneHotEncode
]

# Swap the categorical columns for their indicator columns in a single step.
df = pd.concat([df.drop(columns=column_to_oneHotEncode)] + temp_dfs, axis=1)

# Map each encoded column to exactly the indicator columns generated for it.
# Matching on a "<col>_" name prefix instead would also capture any unrelated
# column that happens to share that prefix.
dummy_columns = {
    column: one_hot.columns.tolist()
    for column, one_hot in zip(column_to_oneHotEncode, temp_dfs)
}

# Rebuild the original ordering, expanding each encoded column in place.
new_order = []
for col in original_columns:
    new_order.extend(dummy_columns.get(col, [col]))

df = df[new_order]

# Continuous features.

# Convert 'xi [cm]' to numeric; entries that cannot be parsed become NaN so
# they can be imputed below.
df['xi [cm]'] = pd.to_numeric(df['xi [cm]'], errors='coerce')

# Impute missing values with the column mean (mean() skips NaN by default).
if df['xi [cm]'].isnull().any():
    print("NaN values found. Filling with the mean.")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['xi [cm]']: the in-place call acts on an intermediate object, which
    # raises a FutureWarning on pandas 2.x and leaves df unchanged under
    # Copy-on-Write.
    df['xi [cm]'] = df['xi [cm]'].fillna(df['xi [cm]'].mean())


# Continuous columns to rescale with MinMaxScaler (default range [0, 1]).
columns_to_scale = ['L [cm]', 'xi [cm]', 'D+0.25L', 'Story Area']

# Fit the scaler on these columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens downstream, confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])
df[columns_to_scale] = scaler.transform(df[columns_to_scale])

# Write the processed frame to disk without the row index.
prepared_file_path = 'data_processed.csv'
df.to_csv(prepared_file_path, index=False)

In [5]:
# Preview the first five rows of the processed frame.
df.head(n=5)

Unnamed: 0,architectural_archetype_C,architectural_archetype_D,architectural_archetype_P,architectural_archetype_Q,stories_3,stories_4,stories_5,stories_6,soil_class_A,soil_class_B,...,L [cm],xi [cm],D+0.25L,Story Area,Nail spacing [cm],Number sheathing panels,Number end studs,Total number studs,Tx(s),Ty(s)
0,0,0,1,0,0,0,1,0,0,0,...,0.220913,0.098854,0.78725,0.861679,5,2,3,16,0.447566,0.454873
1,0,0,1,0,0,0,1,0,0,0,...,0.220913,0.342271,0.78725,0.861679,5,2,3,16,0.447566,0.454873
2,0,0,1,0,0,0,1,0,0,0,...,0.04271,0.201116,0.78725,0.861679,5,2,2,10,0.447566,0.454873
3,0,0,1,0,0,0,1,0,0,0,...,0.04271,0.240237,0.78725,0.861679,5,2,2,10,0.447566,0.454873
4,0,0,1,0,0,0,1,0,0,0,...,0.624448,0.060648,0.78725,0.861679,5,2,3,23,0.447566,0.454873
