In [11]:
import pandas as pd
import numpy as np

# Function to parse the header and extract relevant information
def parse_header(header):
    """
    Split an underscore-separated building header into its design attributes.

    Only the tokens at positions 0, 1, 4, 6 and 8 are used; the tokens at
    positions 1 and 6 are converted to ``int``, the others are kept as text.

    :param header: Header string whose fields are separated by ``'_'``.
    :return: dict with the keys ``architectural_archetype``, ``stories``,
             ``soil_class``, ``seismic_zone`` and ``connection_system``.
    """
    tokens = header.split('_')
    # (output key, token position, converter) listed in the order the keys are emitted;
    # a converter of None means the token is stored unchanged.
    field_spec = (
        ("architectural_archetype", 0, None),
        ("stories", 1, int),
        ("soil_class", 4, None),
        ("seismic_zone", 6, int),
        ("connection_system", 8, None),
    )
    return {
        key: tokens[position] if convert is None else convert(tokens[position])
        for key, position, convert in field_spec
    }

# Function to forward-fill the NaN cells of a column with the last non-NaN value seen in that same column
def fill_values_based_on_key(data, key_column_index, value_row_index, finishing_row_informationB):
    """
    Forward-fill the missing cells of one DataFrame column over a range of rows.

    Rows ``value_row_index`` to ``finishing_row_informationB`` (positional, both
    inclusive) of column ``key_column_index`` are visited in order. Every NaN
    cell is overwritten with the most recent non-NaN value met earlier in that
    range. NaN cells located before the first non-NaN value of the range are
    assigned ``None``, because no previous value is available yet.

    :param data: pandas DataFrame; cells are read and written positionally through ``.iat``.
    :param key_column_index: Positional index of the column to fill.
    :param value_row_index: Positional index of the first row to process.
    :param finishing_row_informationB: Positional index of the last row to process (inclusive).
    :return: None; the operation modifies ``data`` in place.
    """
    last_valid_key = None
    for row in range(value_row_index, finishing_row_informationB + 1):
        key_value = data.iat[row, key_column_index]
        if pd.isna(key_value):
            # Missing cell: propagate the last value seen in this column.
            data.iat[row, key_column_index] = last_valid_key
        else:
            # Valid cell: remember it for the missing cells that follow.
            last_valid_key = key_value

# Function to prepare data for CSV export
# This function builds the data for the CSV file with information A and B as X and part of information C
def prepare_data_to_csv1(file_path, sheet_name, data_D):
    """
    Build the long-format table (one row per building and per wall) of one sheet.

    The sheet is read twice: once with its first row as column labels, to count
    the buildings and read their header strings, and once without header so that
    every cell can be addressed by position.

    The returned frame concatenates, column-wise:
      * type A information: the five fields returned by ``parse_header`` for
        each building, repeated once per wall row;
      * type B information: sheet columns 3 to 8 renamed to ``Story``,
        ``Direction``, ``Wall``, ``L cm``, ``xi cm``, ``yi cm``, plus the
        ``D+0.25L`` and ``Story Area`` values looked up per story;
      * type C information: the five-column block of each building, starting at
        sheet column 9.

    :param file_path: Path of the Excel workbook.
    :param sheet_name: Sheet selector forwarded to ``pd.read_excel``.
    :param data_D: Not used by this function; kept so existing callers keep working.
    :return: pandas DataFrame as described above.
    """
    # First read: the first row of the sheet becomes the column labels.
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # To find the number of buildings, count the columns whose label is not an
    # auto-generated 'Unnamed: N' one.
    nbr_building = len([col for col in data.columns if 'Unnamed' not in str(col)])
    starting_row_informationB = 14   # first positional row of the wall table
    size_columns_informationA = 5    # number of sheet columns occupied by each building

    # --- Type A information -------------------------------------------------
    # The header string of building i is read from the column labelled i, at
    # row label 1 of the header-parsed frame, and split into its fields.
    columns = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]
    type_a_rows = []
    for i in range(1, nbr_building + 1):
        header = data[i][1]
        info = parse_header(header)
        type_a_rows.append([info[col] for col in columns])
    df = pd.DataFrame(type_a_rows, columns=columns)

    # Second read: no header, so rows and columns are addressed by position.
    data2 = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

    # The table has no explicit end marker, so its end is taken from the last
    # row that has at least one non-NaN value. The "+ 1" converts the index of
    # the header-parsed frame (whose first sheet row was consumed as labels)
    # into a positional row of ``data2``.
    finishing_row_informationB = data.dropna(how='all').index[-1] + 1

    # Exception for Design_C_ATS: this file holds additional information that
    # is not needed, so the end row is fixed by hand.
    if file_path == './Files/Raw_Files/Design_C_ATS.xlsx' and sheet_name == 0:
        finishing_row_informationB = 284

    # Story and direction are only filled in on some rows of the sheet;
    # propagate them down so every wall row carries both values.
    story_index = 3
    direction_index = 4
    fill_values_based_on_key(data2, story_index, starting_row_informationB, finishing_row_informationB)
    fill_values_based_on_key(data2, direction_index, starting_row_informationB, finishing_row_informationB)

    # NOTE(review): the slices below stop *before* finishing_row_informationB,
    # whereas fill_values_based_on_key and prepare_data_from_excel treat that
    # row as included. If finishing_row_informationB is the last wall row, this
    # function drops it - confirm which bound is intended.
    repetitions = finishing_row_informationB - starting_row_informationB

    # Add type A information: each building's row repeated once per wall row.
    repeated_df = pd.DataFrame(np.repeat(df.values, repetitions, axis=0), columns=df.columns)

    # Add type B information: the wall description is shared by all buildings,
    # so the same block is stacked once per building.
    wall_block = (
        data2.iloc[starting_row_informationB:finishing_row_informationB, 3:9]
        .rename(columns={data2.columns[3]: "Story",
                         data2.columns[4]: "Direction",
                         data2.columns[5]: "Wall",
                         data2.columns[6]: "L cm",
                         data2.columns[7]: "xi cm",
                         data2.columns[8]: "yi cm"})
    )
    result2 = pd.concat([wall_block] * nbr_building, ignore_index=True)

    # Add type C information: one five-column block per building, the first
    # block starting at sheet column 9.
    type_c_names = ["Nail spacing [cm]",
                    "Number sheathing panels",
                    "Number end studs",
                    "Total number studs",
                    "HoldDown Model / ATS"]
    dfs = []
    for i in range(nbr_building):
        first_col = 9 + size_columns_informationA * i
        block = data2.iloc[starting_row_informationB:finishing_row_informationB,
                           first_col:first_col + len(type_c_names)]
        dfs.append(block.rename(columns={data2.columns[first_col + k]: name
                                         for k, name in enumerate(type_c_names)}))
    result3 = pd.concat(dfs, ignore_index=True)

    # Per-story values: for story s they are read from sheet row 4 + s, in
    # columns 11 (D+0.25L) and 13 (story area) of the building's block.
    # The story of each wall row is the same for every building, so it is
    # resolved once.
    stories = [int(result2.iat[j, 0]) for j in range(repetitions)]
    d_plus_quarter_l_values = []
    story_area_values = []
    for i in range(nbr_building):
        offset = size_columns_informationA * i
        for story in stories:
            d_plus_quarter_l_values.append(data2.iat[4 + story, 11 + offset])
            story_area_values.append(data2.iat[4 + story, 13 + offset])

    result2['D+0.25L'] = d_plus_quarter_l_values
    result2['Story Area'] = story_area_values

    resultFinal = pd.concat([repeated_df, result2, result3], axis=1, ignore_index=False)

    return resultFinal

# This function builds the data for the CSV file with information A and B as X and the rest of information C as Y, specifically only Tx and Ty
# Function to prepare data from an Excel file and return a DataFrame
def prepare_data_from_excel(file_path, sheet_name, performance_data):
    """
    Build the wide-format table (one row per building) of one sheet.

    The sheet is read twice: once with its first row as column labels, to count
    the buildings and read their header strings, and once without header so that
    every cell can be addressed by position.

    Each returned row contains, in order:
      * the five fields returned by ``parse_header`` for the building;
      * for every wall row, three columns named ``L cm_<story>_<direction>_<wall>``,
        ``xi cm_...`` and ``yi cm_...`` (read from sheet columns 6, 7 and 8);
      * ``D+0.25L k`` and ``Story Area k`` columns, numbered by position;
      * ``Tx(s)`` and ``Ty(s)``.

    :param file_path: Path of the Excel workbook.
    :param sheet_name: Sheet selector forwarded to ``pd.read_excel``.
    :param performance_data: Not used by this function; kept so existing callers keep working.
    :return: pandas DataFrame as described above.
    """
    # First read: the first row of the sheet becomes the column labels.
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Columns whose label is not an auto-generated 'Unnamed: N' are counted as buildings.
    nbr_building = len([col for col in data.columns if 'Unnamed' not in str(col)])
    starting_row_informationB = 14   # first positional row of the wall table
    size_columns_informationA = 5    # number of sheet columns occupied by each building
    row_Tx_Ty_values = 12            # positional row holding Tx / Ty

    # Type A information: header string of building i is read from the column
    # labelled i, at row label 1 of the header-parsed frame.
    columns = ["architectural_archetype", "stories", "soil_class", "seismic_zone", "connection_system"]
    type_a_rows = []
    for i in range(1, nbr_building + 1):
        info = parse_header(data[i][1])
        type_a_rows.append([info[col] for col in columns])
    df = pd.DataFrame(type_a_rows, columns=columns)

    # Second read: no header, so rows and columns are addressed by position.
    data2 = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

    # Last row with at least one non-NaN value; "+ 1" converts the index of the
    # header-parsed frame into a positional row of ``data2``.
    finishing_row_informationB = data.dropna(how='all').index[-1] + 1

    # Exception for Design_C_ATS: the end row is fixed by hand for this file.
    if file_path == './Files/Raw_Files/Design_C_ATS.xlsx' and sheet_name == 0:
        finishing_row_informationB = 284

    # Propagate story and direction down the wall table so every wall row
    # carries both values.
    story_index = 3
    direction_index = 4
    fill_values_based_on_key(data2, story_index, starting_row_informationB, finishing_row_informationB)
    fill_values_based_on_key(data2, direction_index, starting_row_informationB, finishing_row_informationB)

    # Wall geometry (sheet columns 6, 7, 8) flattened into a single row.
    # These cells do not depend on the building, so they are read once and
    # copied for each building.
    # NOTE(review): this range includes finishing_row_informationB, while
    # prepare_data_to_csv1 slices up to but excluding it - confirm which bound
    # is intended.
    wall_values = []
    columns_all = []
    if nbr_building > 0:
        for j in range(starting_row_informationB, finishing_row_informationB + 1):
            wall_values += [data2.iat[j, 6], data2.iat[j, 7], data2.iat[j, 8]]
            name_plus = '_' + str(data2.iat[j, 3]) + '_' + str(data2.iat[j, 4]) + '_' + str(data2.iat[j, 5])
            columns_all += [
                'L cm' + name_plus,
                'xi cm' + name_plus,
                'yi cm' + name_plus
            ]
    d_all = [list(wall_values) for _ in range(nbr_building)]
    df_all = pd.DataFrame(d_all, columns=columns_all)

    # Distinct values of sheet column 3, skipping the first two.
    # NOTE(review): presumably the two skipped entries are NaN and the column
    # title, leaving only story numbers - verify against the sheet layout.
    unique_values = data2.iloc[:, 3].unique()[2:]
    d_plus_quarter_l_values = np.zeros((nbr_building, len(unique_values)))
    story_area_values = np.zeros((nbr_building, len(unique_values)))
    Tx_values = []
    Ty_values = []

    for i in range(nbr_building):
        offset = size_columns_informationA * i

        # Per-story values: for story s they are read from sheet row 4 + s, in
        # columns 11 (D+0.25L) and 13 (story area) of the building's block.
        for j, value in enumerate(unique_values):
            story = int(value)
            d_plus_quarter_l_values[i, j] = data2.iat[4 + story, 11 + offset]
            story_area_values[i, j] = data2.iat[4 + story, 13 + offset]

        # Tx and Ty are the first two cells of the building's block on the Tx/Ty row.
        Tx_values.append(data2.iat[row_Tx_Ty_values, 9 + offset])
        Ty_values.append(data2.iat[row_Tx_Ty_values, 10 + offset])

    # Column suffixes follow the position in ``unique_values`` (1-based), not
    # the story value itself.
    story_positions = range(1, len(unique_values) + 1)
    df_d_plus_quarter_l = pd.DataFrame(
        d_plus_quarter_l_values,
        columns=[f'D+0.25L {k}' for k in story_positions])
    df_story_area = pd.DataFrame(
        story_area_values,
        columns=[f'Story Area {k}' for k in story_positions])
    df_tx = pd.DataFrame({'Tx(s)': Tx_values})
    df_ty = pd.DataFrame({'Ty(s)': Ty_values})

    resultFinal = pd.concat(
        [df, df_all, df_d_plus_quarter_l, df_story_area, df_tx, df_ty], axis=1)

    return resultFinal


# Input folder. The file paths are built by plain string concatenation on
# purpose: prepare_data_to_csv1 / prepare_data_from_excel compare the path
# against the literal './Files/Raw_Files/Design_C_ATS.xlsx'.
path = './Files/Raw_Files'
# List of Excel files
files = ['/Design_P_ATS.xlsx', '/Design_P_HD.xlsx',
         '/Design_D_ATS.xlsx', '/Design_D_HD.xlsx',
         '/Design_C_ATS.xlsx', '/Design_C_HD.xlsx',
         '/Design_Q_ATS.xlsx', '/Design_Q_HD.xlsx']
data_D = pd.read_excel(path + '/PerformanceResults.xlsx', header=None)

Files_informations = []

# For each file, register one [file path, sheet index] entry per sheet.
for file in files:
    # The context manager closes the workbook once the sheet names are read.
    with pd.ExcelFile(path + file) as xls:
        sheet_names = xls.sheet_names
    for sheet_index in range(len(sheet_names)):
        Files_informations.append([path + file, sheet_index])

resultsFinal = []
resultsFinal2 = []

# Build both tables (long format and wide format) for every sheet.
for file_path, sheet_index in Files_informations:
    resultsFinal.append(prepare_data_to_csv1(file_path, sheet_index, data_D))
    resultsFinal2.append(prepare_data_from_excel(file_path, sheet_index, data_D))

# Part 1: stack the long-format tables of all sheets and export them.
resultFinal = pd.concat(resultsFinal, axis=0, ignore_index=True)
prepared_file_path = 'Files/Before_Feature_Engineering/data_C_part1.csv'
resultFinal.to_csv(prepared_file_path, index=False)


# Part 2: stack the wide-format tables of all sheets in a single concat
# (columns missing from a sheet are filled with NaN).
dataframes_to_merge = resultsFinal2
merged_df = pd.concat(dataframes_to_merge, axis=0, ignore_index=True)

# Get a list of column names excluding 'Tx' and 'Ty'
other_columns = [col for col in merged_df.columns if col not in ['Tx(s)', 'Ty(s)']]

# Reorder the columns with 'Tx' and 'Ty' as the last two columns
new_order = other_columns + ['Tx(s)', 'Ty(s)']
merged_df = merged_df[new_order]

prepared_file_path2 = 'Files/Before_Feature_Engineering/data_C_part2.csv'
merged_df.to_csv(prepared_file_path2, index=False)

In [12]:
resultFinal

Unnamed: 0,architectural_archetype,stories,soil_class,seismic_zone,connection_system,Story,Direction,Wall,L cm,xi cm,yi cm,D+0.25L,Story Area,Nail spacing [cm],Number sheathing panels,Number end studs,Total number studs,HoldDown Model / ATS
0,P,5,C,3,ATS,1,X,1.1,270,439.0,11,162306.884491,4.918760e+06,5,2,3,16,2.8575
1,P,5,C,3,ATS,1,X,1.2,270,1503,11,162306.884491,4.918760e+06,5,2,3,16,2.8575
2,P,5,C,3,ATS,1,X,2.1,149,886,134,162306.884491,4.918760e+06,5,2,2,10,3.175
3,P,5,C,3,ATS,1,X,2.2,149,1057,134,162306.884491,4.918760e+06,5,2,2,10,3.175
4,P,5,C,3,ATS,1,X,3.1,544,272,567,162306.884491,4.918760e+06,5,2,3,23,2.8575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62258,Q,5,A,1,HD,5,Y,J.1,352,1745,198,126413.242325,4.366071e+06,15,1,2,12,1
62259,Q,5,A,1,HD,5,Y,J.2,287,1745,588,126413.242325,4.366071e+06,15,1,2,10,1
62260,Q,5,A,1,HD,5,Y,J.3,195,1745,898,126413.242325,4.366071e+06,15,1,2,8,1
62261,Q,5,A,1,HD,5,Y,J.4,195,1745,1595,126413.242325,4.366071e+06,15,1,2,8,1


In [13]:
merged_df

Unnamed: 0,architectural_archetype,stories,soil_class,seismic_zone,connection_system,L cm_1_X_1.1,xi cm_1_X_1.1,yi cm_1_X_1.1,L cm_1_X_1.2,xi cm_1_X_1.2,...,xi cm_5_X_13.4,yi cm_5_X_13.4,L cm_5_Y_H.4,xi cm_5_Y_H.4,yi cm_5_Y_H.4,L cm_5_Y_J.6,xi cm_5_Y_J.6,yi cm_5_Y_J.6,Tx(s),Ty(s)
0,P,5,C,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.447566,0.454873
1,P,5,B,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.576592,0.627639
2,P,5,A,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.615841,0.664243
3,P,5,A,3,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.724320,0.728796
4,P,5,D,1,ATS,270.0,439.0,11.0,270.0,1503.0,...,,,,,,,,,0.520351,0.522703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Q,5,C,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.876501,0.853845
196,Q,5,B,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.836157,0.849928
197,Q,5,B,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.881494,0.862129
198,Q,5,A,1,HD,120.0,222.0,11.0,171.0,470.0,...,1696.0,2483.0,494.0,1291.0,2247.0,352.0,1745.0,2296.0,0.862624,0.855580
