# Data Processing
This notebook converts every `*_test_points.xlsx` workbook into a CSV file under the 'processed_data' folder, and then stores all of the processed data in a single database named 'calibration_data.db' in the 'db' folder.

## Initial variables and Packages

### 1.1 Import packages

In [22]:
import os
import glob
import pandas as pd
import sqlite3

Get the current working directory.

In [23]:
# Capture and display the working directory the kernel is currently using.
current_dir = os.getcwd()
print(current_dir)

/home/ubuntu/PycharmProjects


In [24]:
# Switch the process working directory to the project folder and confirm it.
# os.chdir raises if the folder does not exist.
# NOTE(review): this absolute path is specific to one machine — confirm or
# adjust it before running the notebook anywhere else.
target_dir = "/home/ubuntu/PycharmProjects/aaron_accredited_labs_assessment"
os.chdir(target_dir)
print("Now in:", os.getcwd())

Now in: /home/ubuntu/PycharmProjects/aaron_accredited_labs_assessment


### 1.2 City Variables

In [25]:
# City folder names that the processing loops iterate over, in this order.
cities = [
    "Chicago",
    "Houston",
    "Los_Angeles",
    "New_York",
    "Phoenix",
]

### 1.3 Path Variables

In [26]:
# Roots of the raw input tree and the processed output tree, both relative
# to the current working directory.
root_data_dir = "./raw_data"
root_processed_dir = "./processed_data"

# Sub-folders inside the processed output tree.
centralized_dir = os.path.join(root_processed_dir, "centralized")
final_dir = os.path.join(root_processed_dir, "final")

# Database folder and file. Note the leading "..": this resolves one level
# above the current working directory.
db_dir = "../db"
db_path = os.path.join(db_dir, "calibration_data.db")

# Only the database folder and the "final" folder are created here;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(db_dir, exist_ok=True)
os.makedirs(final_dir, exist_ok=True)

## Processing

### 2.1 Merging test_points.xlsx into merged_test_points.csv

In [36]:
def _read_completed_date(xls):
    """Return the text following "Completed:" in the metadata sheet.

    Looks only at the first column of the optional "metadata" sheet and uses
    the first cell whose stripped text starts with "Completed:". Returns None
    when the sheet is missing, is empty, or has no such cell.
    """
    if "metadata" not in xls.sheet_names:
        return None

    meta_df = xls.parse("metadata", header=None)
    if meta_df.empty:
        return None

    first_col = meta_df.iloc[:, 0].dropna().astype(str).str.strip()
    completed_matches = first_col[first_col.str.startswith("Completed:")]
    if completed_matches.empty:
        return None

    # Split on the first colon only, so a value that itself contains ":"
    # (for example a time of day) is kept whole.
    return completed_matches.iloc[0].split(":", 1)[1].strip()


def _count_test_points(xls):
    """Return how many first-row cells of the "test_points" sheet look like TP<digits>.

    Returns 0 when the sheet is missing or empty.
    """
    if "test_points" not in xls.sheet_names:
        return 0

    tp_df = xls.parse("test_points", header=None)
    if tp_df.empty:
        return 0

    # Horizontal layout: row 0 is expected to hold labels such as "TP1", "TP2".
    header_row = tp_df.iloc[0, :].dropna().astype(str)
    return int(header_row.str.match(r"^TP\d+$").sum())


# For every city, summarise each "<equipment_id>_test_points.xlsx" workbook as
# one row (equipment_id, completed_date, testpoint_count) and write the rows to
# processed_data/<city>/test_points/merged_test_points.csv.
for city in cities:
    in_dir = os.path.join(root_data_dir, city, "test_points")
    if not os.path.exists(in_dir):
        # Cities without a test_points folder are skipped; no CSV is written.
        continue

    records = []

    # glob returns files in arbitrary order; sorting keeps the CSV row order
    # reproducible from run to run.
    for fp in sorted(glob.glob(os.path.join(in_dir, "*_test_points.xlsx"))):
        equipment_id = os.path.basename(fp).replace("_test_points.xlsx", "")

        # The context manager closes the workbook handle as soon as both
        # sheets have been read, instead of leaking one handle per file.
        with pd.ExcelFile(fp) as xls:
            completed_date = _read_completed_date(xls)
            testpoint_count = _count_test_points(xls)

        records.append([equipment_id, completed_date, testpoint_count])

    df = pd.DataFrame(records, columns=["equipment_id", "completed_date", "testpoint_count"])

    # Write the CSV under processed_data/<city>/test_points
    out_dir = os.path.join(root_processed_dir, city, "test_points")
    os.makedirs(out_dir, exist_ok=True)

    out_fp = os.path.join(out_dir, "merged_test_points.csv")
    df.to_csv(out_fp, index=False)
    print(f"Saved {out_fp}")

Saved ./processed_data/Chicago/test_points/merged_test_points.csv
Saved ./processed_data/Phoenix/test_points/merged_test_points.csv
Saved ./processed_data/Los_Angeles/test_points/merged_test_points.csv
Saved ./processed_data/Houston/test_points/merged_test_points.csv
Saved ./processed_data/New_York/test_points/merged_test_points.csv
