In [7]:
import os

import random
import numpy as np
import pandas as pd
from pathlib import Path

from faker import Faker

from openpyxl import load_workbook


In [8]:
# Worksheet that read_and_parse_template() opens in every workbook.
sheet_name = 'DRNG - TEST'

# File extensions that openpyxl's load_workbook can open; anything else in the
# folder would only produce a load error later on.
EXCEL_SUFFIXES = {'.xlsx', '.xlsm', '.xltx', '.xltm'}

files_folder = Path.cwd() / 'Source' / 'DataLake'

# Keep only real workbook files: skip sub-folders, non-Excel files and the
# '~$...' lock files Excel creates while a workbook is open. Sorting makes the
# processing order (and therefore the row order of the combined output)
# reproducible, which a raw directory listing does not guarantee.
files_paths = sorted(
    path
    for path in files_folder.iterdir()
    if path.is_file()
    and path.suffix.lower() in EXCEL_SUFFIXES
    and not path.name.startswith('~$')
)

print(f"Files found: {len(files_paths)}")


Files found: 100


In [11]:
def read_and_parse_template(file_path):
    """Parse one bid workbook into a long-format DataFrame.

    The worksheet named by the module-level ``sheet_name`` is read using a
    fixed layout:

    * Row 28 is the table header. Every header cell equal to ``'BID'`` is
      relabelled with the value directly above it in row 27 (used as the
      contractor name).
    * Table rows are read from row 28 downwards, columns 1 through the
      right-most ``'BID'`` column, stopping at the first row whose first
      column is empty.
    * The two columns immediately to the right of the last ``'BID'`` column
      are read as label/value pairs from row 21 until a row labelled
      ``'LOW'`` is reached.

    Args:
        file_path: Location of the workbook; passed to ``load_workbook``.

    Returns:
        A DataFrame with the columns ``DESCRIPTION``, ``UNIT``,
        ``CONTRACTOR`` and ``BID_VALUE`` (one row per table row and remaining
        header column), plus one constant column per project label. Labels
        have their colons removed and ``'BIDS RECEIVED:'`` is left out.

    Raises:
        ValueError: If row 28 holds no ``'BID'`` header, or if the first
            column of row 28 is empty so there is no table header to read.
    """
    wb = load_workbook(file_path)
    ws = wb[sheet_name]

    # Identify the bid columns and put the contractor name into the header
    # cell so the table header read below carries it instead of 'BID'.
    bid_col_indexes = []
    for row in ws.iter_rows(28, 28):
        for cell in row:
            if cell.value == 'BID':
                contractor = ws[f'{cell.column_letter}27'].value
                ws.cell(row=cell.row, column=cell.column, value=contractor)
                bid_col_indexes.append(cell.column)

    # Fail with a readable message instead of "max() arg is an empty sequence".
    if not bid_col_indexes:
        raise ValueError(
            f"No 'BID' header found in row 28 of sheet {sheet_name!r}"
        )

    # Right-most bid column: the table ends here and the project block starts
    # in the column after it.
    bids_max_col_idx = max(bid_col_indexes)

    # Read every row from the header down to the first empty first column,
    # limited to the columns up to the latest bid column.
    data = []
    for row in ws.iter_rows(28, None, min_col=1, max_col=bids_max_col_idx):
        row_data = [cell.value for cell in row]
        if row_data[0] is None:
            break
        data.append(row_data)

    # Fail with a readable message instead of "list index out of range".
    if not data:
        raise ValueError(
            f"Header row 28 of sheet {sheet_name!r} has an empty first column"
        )

    # Read the project label/value pairs located right of the bid columns.
    project_data = {}
    for row in ws.iter_rows(
        21, None, min_col=bids_max_col_idx + 1, max_col=bids_max_col_idx + 2
    ):
        label, value = (cell.value for cell in row)
        if label == "LOW":
            break
        # Blank or non-text labels cannot become column names; skipping them
        # keeps one stray empty row from discarding the whole file.
        if not isinstance(label, str) or label == 'BIDS RECEIVED:':
            continue
        project_data[label.replace(':', '')] = value

    # Transform to a DataFrame: first collected row is the header.
    header, *records = data
    df = pd.DataFrame(records, columns=header)
    df = df.drop(columns=['LOW', 'AVERAGE', 'COUNT', np.nan])
    df = df.melt(id_vars=['DESCRIPTION', 'UNIT'], var_name='CONTRACTOR', value_name='BID_VALUE')

    # Attach the project attributes as constant columns on every row.
    return df.assign(**project_data)

In [12]:
# Parse every discovered workbook. This is best-effort: a file that cannot be
# parsed is reported and skipped so the remaining files are still processed.
data_frames_list = []
for file_path in files_paths:
    try:
        df = read_and_parse_template(file_path)
    except Exception as e:
        print(f'Error reading file {file_path}: {e}')
    else:
        data_frames_list.append(df)


In [13]:
# Stack the per-file frames. ignore_index=True gives the combined frame one
# continuous 0..n-1 index instead of repeating every file's own row labels.
df = pd.concat(data_frames_list, ignore_index=True)
df.shape

(149256, 10)

In [14]:
output_file_path = Path.cwd() / 'Source' / 'Result' / 'output.xlsx'

# Make sure the destination folder exists; to_excel does not create it.
output_file_path.parent.mkdir(parents=True, exist_ok=True)

# Write the combined long-format table without the DataFrame index.
df.to_excel(output_file_path, sheet_name='melt_data', index=False)