In [153]:
import pandas as pd
import numpy as np

# Function to parse the header and extract relevant information
def parse_header(header):
    parts = header.split('_')
    extracted_info = {
        "architectural_archetype": parts[0],
        "stories": int(parts[1]),
        "soil_class": parts[4],
        "seismic_zone": int(parts[6]),
        "connection_system": parts[8]
    }
    return extracted_info

# Function to fill values in a column based on the last non-NaN value in another column
def fill_values_based_on_key(data, key_column_index, value_row_index, finishing_row_informationB):
    last_valid_key = None
    for i in range(value_row_index, finishing_row_informationB):
        key_value = data.iat[i, key_column_index]
        if pd.notna(key_value):
            last_valid_key = key_value
        if pd.isna(key_value):
            data.iat[i, key_column_index] = last_valid_key


# Function to prepare data from an Excel file and return a DataFrame
def prepare_data_from_excel(file_path, sheet_name, performance_data):
    data = pd.read_excel(file_path, sheet_name=sheet_name)
    nbr_building = len([col for col in data.columns if not 'Unnamed' in str(col)])
    nbr_story = 5
    starting_row_informationB = 14
    size_columns_informationA = 5
    row_Tx_Ty_values = 12
    parsed_data = []

    for i in range(1, nbr_building + 1):
        header = data[i][1]
        parsed_data.append([parse_header(header), header])

    columns = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]
    new_table = []

    for item in parsed_data:
        row = [item[0][col] for col in columns]
        new_table.append(row + [item[1]])

    df = pd.DataFrame(new_table, columns=columns + ["header"])
    data2 = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    finishing_row_informationB = data.dropna(how='all').index[-1] + 1

    if (file_path == './Design_C_ATS.xlsx' and sheet_name == 0):
        finishing_row_informationB = 284

    story_index = 3
    direction_index = 4
    fill_values_based_on_key(data2, story_index, starting_row_informationB, finishing_row_informationB)
    fill_values_based_on_key(data2, direction_index, starting_row_informationB, finishing_row_informationB)

    repetitions = finishing_row_informationB - starting_row_informationB
    d_all = []
    columns_all = []

    for i in range(nbr_building):
        d_all_bis = []

        for j in range(starting_row_informationB, finishing_row_informationB + 1):
            d_all_bis += [data2.iat[j, 6], data2.iat[j, 7], data2.iat[j, 8]]

            if i == 0:
                name_plus = '_' + str(data2.iat[j, 3]) + '_' + str(data2.iat[j, 4]) + '_' + str(data2.iat[j, 5])
                columns_all += [
                    'L cm' + name_plus,
                    'xi cm' + name_plus,
                    'yi cm' + name_plus
                ]

        d_all.append(d_all_bis)

    df_all = pd.DataFrame(d_all, columns=columns_all)
    unique_values = data2.iloc[:, 3].unique()[2:]
    d_plus_quarter_l_values = np.zeros((nbr_building, len(unique_values)))
    story_area_values = np.zeros((nbr_building, len(unique_values)))
    Tx_values = []
    Ty_values = []

    for i in range(0, nbr_building):

        for j, value in enumerate(unique_values):
            story = int(value)
            d_plus_quarter_l = data2.iat[4 + story, 11 + size_columns_informationA * i]
            d_plus_quarter_l_values[i, j] = d_plus_quarter_l
            story_area = data2.iat[4 + story, 13 + size_columns_informationA * i]
            story_area_values[i, j] = story_area

        Tx_values.append(data2.iat[row_Tx_Ty_values, 9 + size_columns_informationA * i])
        Ty_values.append(data2.iat[row_Tx_Ty_values, 10 + size_columns_informationA * i])

    df = df.drop('header', axis=1)
    resultFinal = pd.concat([df, df_all], axis=1, ignore_index=False)
    df_d_plus_quarter_l = pd.DataFrame(d_plus_quarter_l_values)
    df_d_plus_quarter_l.columns = [f'D+0.25L {i + 1}' for i in range(len(unique_values))]
    story_area_values = pd.DataFrame(story_area_values)
    story_area_values.columns = [f'Story Area {i + 1}' for i in range(len(unique_values))]
    Tx_values, Ty_values = pd.DataFrame(Tx_values), pd.DataFrame(Ty_values)
    Tx_values.columns, Ty_values.columns = ['Tx(s)'], ['Ty(s)']
    resultFinal = pd.concat([resultFinal, df_d_plus_quarter_l, story_area_values, Tx_values, Ty_values], axis=1)

    return resultFinal

# List of Excel files
files = [
    './Design_P_ATS.xlsx', './Design_P_HD.xlsx',
    './Design_D_ATS.xlsx', './Design_D_HD.xlsx',
    './Design_C_ATS.xlsx', './Design_C_HD.xlsx',
    './Design_Q_ATS.xlsx', './Design_Q_HD.xlsx'
]

# Read performance data from a separate Excel file
performance_data = pd.read_excel('./PerformanceResults.xlsx', header=None)

# Initialize an empty list to store file and sheet index information
Files_informations = []

# For each file, get all sheet names and add to Files_informations
for file in files:
    xls = pd.ExcelFile(file)
    sheet_names = xls.sheet_names
    for sheet_index, sheet_name in enumerate(sheet_names):
        Files_informations.append([file, sheet_index])

resultsFinal2 = []

# Loop through your DataFrames, reset the index, and append
for i in range(len(Files_informations)):
    resultFinal2 = prepare_data_from_excel(Files_informations[i][0], Files_informations[i][1], performance_data)
    resultsFinal2.append(resultFinal2)

# Combine DataFrames into a list
dataframes_to_merge = resultsFinal2

# Initialize the merged DataFrame with the first DataFrame
merged_df = dataframes_to_merge[0]

# Merge each DataFrame in the list with the merged DataFrame
for df in dataframes_to_merge[1:]:
    merged_df = pd.concat([merged_df, df], axis=0, ignore_index=True)

# Get a list of column names excluding 'Tx' and 'Ty'
other_columns = [col for col in merged_df.columns if col not in ['Tx(s)', 'Ty(s)']]

# Reorder the columns with 'Tx' and 'Ty' as the last two columns
new_order = other_columns + ['Tx(s)', 'Ty(s)']
merged_df = merged_df[new_order]

prepared_file_path = 'prepared_data_allTxTy.csv'
merged_df.to_csv(prepared_file_path, index=False)

In [154]:
merged_df

Unnamed: 0,architectural_archetype,stories,soil_class,seismic_zone,connection_system,L cm_1_X_1.1,xi cm_1_X_1.1,yi cm_1_X_1.1,L cm_1_X_1.2,xi cm_1_X_1.2,...,xi cm_5_Y_13.4,yi cm_5_Y_13.4,L cm_5_Y_H.4,xi cm_5_Y_H.4,yi cm_5_Y_H.4,L cm_nan_nan_J.6,xi cm_nan_nan_J.6,yi cm_nan_nan_J.6,Tx(s),Ty(s)
0,P,5,C,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.447566,0.454873
1,P,5,B,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.576592,0.627639
2,P,5,A,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.615841,0.664243
3,P,5,A,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.724320,0.728796
4,P,5,D,1,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.520351,0.522703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Q,5,C,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.876501,0.853845
196,Q,5,B,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.836157,0.849928
197,Q,5,B,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.881494,0.862129
198,Q,5,A,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.862624,0.855580


In [155]:
data = merged_df
Tx_index = data.columns.get_loc("Tx(s)")
last_index = Tx_index 

# Split the data into X and Y
X = data.iloc[:, :last_index]  # Features from the beginning up to "Story Area"
Y = data.iloc[:, last_index:]  # 


In [156]:
column_to_oneHotEncode = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]

df = X
temp_dfs = []  # List to hold temporary DataFrames
original_columns = df.columns.tolist()  # Store the original order of columns

for column in column_to_oneHotEncode:
    # Get one-hot encoded DataFrame for the current column
    one_hot = pd.get_dummies(df[column], prefix=column)
    temp_dfs.append(one_hot)

    # Drop the original column
    df = df.drop(column, axis=1)

# Concatenate all one-hot encoded DataFrames with the original DataFrame
df = pd.concat([df] + temp_dfs, axis=1)

# Reordering columns to maintain original order with one-hot encoded columns in place
new_order = []
for col in original_columns:
    if col in column_to_oneHotEncode:
        new_order.extend([c for c in df.columns if c.startswith(f"{col}_")])
    else:
        new_order.append(col)

df = df[new_order]


In [157]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df, Y, test_size=0.2, random_state=42)

In [158]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Replace missing values (NaN) with the median of each feature
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create and fit the model with the imputed data
model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
model.fit(X_train_imputed, Y_train)
Y_pred = model.predict(X_test_imputed)

# Calculate the mean squared error (MSE) for each output
mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
print("Mean Squared Error for Each Output:")
print(mse)

# Calculate the overall MSE (you can choose a different aggregation method)
overall_mse = np.mean(mse)
print("Overall Mean Squared Error:")
print(overall_mse)

Mean Squared Error for Each Output:
[0.00182171 0.0014414 ]
Overall Mean Squared Error:
0.0016315542515949193
