In [1]:
import os
import glob
import pandas as pd
import sqlite3

In [1]:
ROOT_DATA_DIR = "../raw_data"
ROOT_PROCESSED_DIR = '../processed_data'
DB_DIR = "../db"
DB_PATH = os.path.join(DB_DIR, "calibration_data.db")

NameError: name 'os' is not defined

In [3]:
CENTRAL_DIR = os.path.join(ROOT_DATA_DIR, "centralized")

In [4]:
CITIES = ["Chicago", "Houston", "Los_Angeles", "New_York", "Phoenix"]

In [5]:
os.makedirs(DB_DIR, exist_ok=True)

In [6]:
print("Pipeline starting...")
print("Current directory:", os.getcwd())
print("DB path:", DB_PATH)

Pipeline starting...
Current directory: /home/ubuntu/PycharmProjects/aaron_accredited_labs_assessment/code
DB path: ../db/calibration_data.db


In [7]:
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

In [8]:
cur.execute("DROP TABLE IF EXISTS workorders;")
cur.execute("DROP TABLE IF EXISTS test_points;")
cur.execute("DROP TABLE IF EXISTS invoices;")
cur.execute("DROP TABLE IF EXISTS failed_qc;")
conn.commit()

In [9]:
# 2.1 Workorders table
cur.execute("""
CREATE TABLE workorders (
    workorder_id   TEXT,
    equipment_id   TEXT,
    equipment_type TEXT,
    branch         TEXT,
    customer_name  TEXT,
    accredited     TEXT,
    technician     TEXT,
    PRIMARY KEY (workorder_id, equipment_id)   -- composite PK
);
""")

<sqlite3.Cursor at 0x7e31101d47c0>

In [10]:
# 2.2 Test_points summary table (just storing aggregated testpoint_count)
cur.execute("""
CREATE TABLE IF NOT EXISTS test_points (
    equipment_id     TEXT,
    completed_date   TEXT,
    testpoint_count  INTEGER,
    PRIMARY KEY (equipment_id)          
);
""")

<sqlite3.Cursor at 0x7e31101d47c0>

In [11]:
# 2.3 Invoices table
cur.execute("""
CREATE TABLE IF NOT EXISTS invoices (
    invoice_id INTEGER PRIMARY KEY AUTOINCREMENT,
    work_completion_date TEXT,
    work_order_number TEXT,
    branch TEXT,
    accreditation_type TEXT,
    equipment_id TEXT,
    equipment_type TEXT,
    price REAL,
    customer_id TEXT,
    customer_name TEXT,
    customer_email TEXT
);
""")

<sqlite3.Cursor at 0x7e31101d47c0>

In [12]:
# 2.4 Failed QC table
cur.execute("""
CREATE TABLE IF NOT EXISTS failed_qc (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workorder_id TEXT,
    equipment_id TEXT,
    equipment_type TEXT,
    branch TEXT,
    customer_name TEXT,
    accredited TEXT,
    technician TEXT,
    failed_reasons TEXT
);
""")

<sqlite3.Cursor at 0x7e31101d47c0>

In [13]:
conn.commit()
print("Database tables created/verified.")

Database tables created/verified.


In [14]:
df_customers = pd.read_csv(os.path.join(CENTRAL_DIR, "customer_master.csv"))
df_pricing = pd.read_csv(os.path.join(CENTRAL_DIR, "equipment_pricing.csv"))
df_tech = pd.read_excel(os.path.join(CENTRAL_DIR, "technician_training_matrix.xlsx"))

certified_pairs = set(zip(df_tech["technician"], df_tech["equipment_type"]))

print("References loaded:")
print("  customers:", len(df_customers))
print("  pricing:", len(df_pricing))
print("  tech matrix:", len(df_tech))

References loaded:
  customers: 100
  pricing: 30
  tech matrix: 75


In [15]:
for city in CITIES:
    workorders_csv = os.path.join(ROOT_DATA_DIR, city, "workorders", "workorders.csv")
    if not os.path.exists(workorders_csv):
        print(f"[{city}] Missing workorders.csv; skipping insertion.")
        continue

    df_work = pd.read_csv(workorders_csv)
    # Columns: [workorder_id, equipment_id, equipment_type, branch, customer_name, accredited, technician]
    
    inserted_count = 0
    for _, row in df_work.iterrows():
        sql = """
        INSERT OR REPLACE INTO workorders
            (workorder_id, equipment_id, equipment_type, branch, customer_name, accredited, technician)
        VALUES (?, ?, ?, ?, ?, ?, ?);
        """
        cur.execute(sql, (
            row["workorder_id"],
            row["equipment_id"],
            row["equipment_type"],
            row["branch"],
            row["customer_name"],
            row["accredited"],
            row["technician"]
        ))
        inserted_count += 1

    conn.commit()
    print(f"Inserted/updated {inserted_count} workorders for city={city}.")

Inserted/updated 409 workorders for city=Chicago.
Inserted/updated 387 workorders for city=Houston.
Inserted/updated 397 workorders for city=Los_Angeles.
Inserted/updated 393 workorders for city=New_York.
Inserted/updated 398 workorders for city=Phoenix.


In [22]:
for city in CITIES:
    in_dir = os.path.join(ROOT_DATA_DIR, city, "test_points")
    if not os.path.exists(in_dir):
        print(f"[{city}] No test_points folder; skipping.")
        continue

    print(f"\n=== Processing test_points in {city} ===")
    records = []  

    for fp in glob.glob(os.path.join(in_dir, "*_test_points.xlsx")):
        # 1. Derive equipment_id from filename
        equipment_id = os.path.basename(fp).replace("_test_points.xlsx", "")

        # 2. Open Excel
        xls = pd.ExcelFile(fp)

        # --- Parse METADATA sheet to find a "Completed: YYYY-MM-DD" line
        completed_date = None
        if "metadata" in xls.sheet_names:
            meta_df = xls.parse("metadata", header=None)
            if not meta_df.empty:
                col0 = meta_df.iloc[:, 0].dropna().astype(str).str.strip()
                matches = col0[col0.str.startswith("Completed:")]
                if not matches.empty:
                    line = matches.iloc[0]

                    completed_date = line.split(":", 1)[1].strip()

        # --- Parse TEST_POINTS sheet to count columns named "TP\d+"
        testpoint_count = 0
        if "test_points" in xls.sheet_names:
            tp_df = xls.parse("test_points", header=None)
            if not tp_df.empty:
                header_row = tp_df.iloc[0, :].dropna().astype(str)
                testpoint_count = header_row.str.match(r"^TP\d+$").sum()

        # 3. Append to records as a definite integer
        records.append((equipment_id, completed_date, int(testpoint_count)))

    ########
    # 2) Optionally save to a CSV under processed_data/<city>/test_points
    ########
    if records:
        df = pd.DataFrame(records, columns=["equipment_id", "completed_date", "testpoint_count"])

        out_dir = os.path.join(ROOT_PROCESSED_DIR, city, "test_points")
        os.makedirs(out_dir, exist_ok=True)
        out_fp = os.path.join(out_dir, "merged_test_points.csv")
        df.to_csv(out_fp, index=False)

        print(f"  => Saved CSV: {out_fp}  (rows={len(df)})")

        ########
        # 3) Optionally insert into your DB (test_points table) as integers
        ########
        for eq_id, c_date, t_count in records:
            # Just to confirm in debug:
            print("  Insert:", eq_id, c_date, t_count, type(t_count))

            # Insert (make sure the table is declared with testpoint_count INTEGER)
            sql = """
                INSERT INTO test_points (equipment_id, completed_date, testpoint_count)
                VALUES (?, ?, ?);
            """
            cur.execute(sql, (eq_id, c_date, t_count))

        conn.commit()
        print(f"  => Inserted {len(records)} rows into 'test_points' table.")


=== Processing test_points in Chicago ===
  => Saved CSV: ../processed_data/Chicago/test_points/merged_test_points.csv  (rows=409)
  Insert: Ch-EQ-00166-0 2024-11-23 15 <class 'int'>
  Insert: Ch-EQ-00077-1 2025-03-28 15 <class 'int'>
  Insert: Ch-EQ-00189-2 2024-06-14 15 <class 'int'>
  Insert: Ch-EQ-00197-2 2024-12-10 10 <class 'int'>
  Insert: Ch-EQ-00053-0 2025-03-09 10 <class 'int'>
  Insert: Ch-EQ-00112-1 2024-10-02 10 <class 'int'>
  Insert: Ch-EQ-00169-0 2024-10-19 10 <class 'int'>
  Insert: Ch-EQ-00005-1 2024-11-23 15 <class 'int'>
  Insert: Ch-EQ-00056-1 2024-10-20 15 <class 'int'>
  Insert: Ch-EQ-00144-1 2024-09-02 15 <class 'int'>
  Insert: Ch-EQ-00142-1 2025-03-03 10 <class 'int'>
  Insert: Ch-EQ-00006-0 2025-02-06 15 <class 'int'>
  Insert: Ch-EQ-00176-0 2024-12-31 10 <class 'int'>
  Insert: Ch-EQ-00072-0 2024-08-25 8 <class 'int'>
  Insert: Ch-EQ-00003-1 2024-06-03 15 <class 'int'>
  Insert: Ch-EQ-00190-0 2024-10-26 10 <class 'int'>
  Insert: Ch-EQ-00114-1 2025-03-20 10

In [23]:
df_work_db = pd.read_sql_query("SELECT * FROM workorders", conn)
df_tp_db = pd.read_sql_query("SELECT * FROM test_points", conn)

In [24]:
merged = pd.merge(df_work_db, df_tp_db, on="equipment_id", how="left")

invoice_rows = []
failed_rows = []


In [25]:
print(merged)

     workorder_id   equipment_id  equipment_type   branch  \
0     Ch-WO-00000  Ch-EQ-00000-0         caliper  Chicago   
1     Ch-WO-00001  Ch-EQ-00001-0         caliper  Chicago   
2     Ch-WO-00001  Ch-EQ-00001-1          torque  Chicago   
3     Ch-WO-00001  Ch-EQ-00001-2          torque  Chicago   
4     Ch-WO-00002  Ch-EQ-00002-0  pressure_gauge  Chicago   
...           ...            ...             ...      ...   
1979  Ph-WO-00196  Ph-EQ-00196-1           scale  Phoenix   
1980  Ph-WO-00197  Ph-EQ-00197-0  pressure_gauge  Phoenix   
1981  Ph-WO-00198  Ph-EQ-00198-0  pressure_gauge  Phoenix   
1982  Ph-WO-00199  Ph-EQ-00199-0           scale  Phoenix   
1983  Ph-WO-00199  Ph-EQ-00199-1          torque  Phoenix   

                     customer_name accredited        technician  \
0                   Walker-Morales         no      Amanda Henry   
1                Rodriguez-Summers         no      Amanda Henry   
2                Rodriguez-Summers         no      Amanda Henry   

In [26]:
for i, row in merged.iterrows():
    fail_reasons = []

    workorder_id    = row["workorder_id"]
    equipment_id    = row["equipment_id"]
    equipment_type  = row["equipment_type"]
    branch          = row["branch"]
    customer_name   = row["customer_name"]
    accredited_flag = str(row["accredited"]).lower()  # yes/no
    technician      = row["technician"]
    completed_date  = row["completed_date"]
    testpoint_val   = row["testpoint_count"]  # should be int

    # 1) Customer check
    if not (df_customers["customer_name"] == customer_name).any():
        fail_reasons.append("customer_not_found")

    # 2) Tech certification
    if (technician, equipment_type) not in certified_pairs:
        fail_reasons.append("tech_not_certified")

    # 3) Completed date
    if pd.isna(completed_date) or str(completed_date).strip() == "":
        fail_reasons.append("missing_completed_date")

    # 4) Pricing
    price_subset = df_pricing[
        (df_pricing["branch"] == branch) &
        (df_pricing["equipment_type"] == equipment_type)
    ]
    price = None
    if len(price_subset) == 0:
        fail_reasons.append("pricing_not_found")
    else:
        pr = price_subset.iloc[0]
        if accredited_flag == "yes":
            price = pr["accredited_price"]
        else:
            price = pr["standard_price"]

    # 5) EXACT required test points
    required_points = 15 if accredited_flag == "yes" else 10
    if pd.isna(testpoint_val) or (testpoint_val != required_points):
        fail_reasons.append("incorrect_testpoint_count")

    # Decide pass/fail
    if fail_reasons:
        failed_rows.append({
            "workorder_id":   workorder_id,
            "equipment_id":   equipment_id,
            "equipment_type": equipment_type,
            "branch":         branch,
            "customer_name":  customer_name,
            "accredited":     accredited_flag,
            "technician":     technician,
            "failed_reasons": ";".join(fail_reasons)
        })
    else:
        # pass => build invoice
        cust_info = df_customers[df_customers["customer_name"] == customer_name]
        if len(cust_info) == 1:
            cid    = cust_info.iloc[0]["customer_id"]
            cemail = cust_info.iloc[0]["email"]
        else:
            cid, cemail = None, None

        invoice_rows.append({
            "work_completion_date": completed_date,
            "work_order_number":    workorder_id,
            "branch":               branch,
            "accreditation_type":   accredited_flag,
            "equipment_id":         equipment_id,
            "equipment_type":       equipment_type,
            "price":                price,
            "customer_id":          cid,
            "customer_name":        customer_name,
            "customer_email":       cemail
        })

print(f"QC complete. {len(invoice_rows)} invoices, {len(failed_rows)} fails.")

QC complete. 1532 invoices, 452 fails.


In [27]:
for inv in invoice_rows:
    sql = """
    INSERT INTO invoices
    (work_completion_date, work_order_number, branch, accreditation_type,
     equipment_id, equipment_type, price, customer_id, customer_name, customer_email)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
    """
    cur.execute(sql, (
        inv["work_completion_date"],
        inv["work_order_number"],
        inv["branch"],
        inv["accreditation_type"],
        inv["equipment_id"],
        inv["equipment_type"],
        inv["price"],
        inv["customer_id"],
        inv["customer_name"],
        inv["customer_email"]
    ))


In [28]:
for fail in failed_rows:
    sql = """
    INSERT INTO failed_qc
    (workorder_id, equipment_id, equipment_type, branch, customer_name,
     accredited, technician, failed_reasons)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?);
    """
    cur.execute(sql, (
        fail["workorder_id"],
        fail["equipment_id"],
        fail["equipment_type"],
        fail["branch"],
        fail["customer_name"],
        fail["accredited"],
        fail["technician"],
        fail["failed_reasons"]
    ))

conn.commit()

In [30]:
df_invoices_db = pd.read_sql_query("SELECT * FROM invoices", conn)
df_failed_db = pd.read_sql_query("SELECT * FROM failed_qc", conn)

In [31]:
print("INVOICES:")
display(df_invoices_db)

print("FAILED QC:")
display(df_failed_db)

conn.close()
print("Pipeline finished. All data is in ../db/calibration_data.db.")

INVOICES:


Unnamed: 0,invoice_id,work_completion_date,work_order_number,branch,accreditation_type,equipment_id,equipment_type,price,customer_id,customer_name,customer_email
0,1,2024-09-25,Ch-WO-00001,Chicago,no,Ch-EQ-00001-1,torque,b'\xad\x00\x00\x00\x00\x00\x00\x00',CUST0043,Rodriguez-Summers,paul64@thompson-gonzalez.org
1,2,2024-06-18,Ch-WO-00001,Chicago,no,Ch-EQ-00001-2,torque,b'\xad\x00\x00\x00\x00\x00\x00\x00',CUST0043,Rodriguez-Summers,paul64@thompson-gonzalez.org
2,3,2025-01-06,Ch-WO-00002,Chicago,yes,Ch-EQ-00002-1,torque,224.9,CUST0037,Kelley-Ramirez,deborahwright@smith.net
3,4,2024-11-05,Ch-WO-00002,Chicago,yes,Ch-EQ-00002-2,flow_meter,260.0,CUST0037,Kelley-Ramirez,deborahwright@smith.net
4,5,2024-05-21,Ch-WO-00003,Chicago,no,Ch-EQ-00003-0,torque,b'\xad\x00\x00\x00\x00\x00\x00\x00',CUST0067,"Wade, Cruz and White",nathaniel92@miller.com
...,...,...,...,...,...,...,...,...,...,...,...
1527,1528,2024-04-28,Ph-WO-00196,Phoenix,yes,Ph-EQ-00196-1,scale,188.5,CUST0052,Hammond-Guerrero,michael60@wright.com
1528,1529,2024-10-06,Ph-WO-00197,Phoenix,yes,Ph-EQ-00197-0,pressure_gauge,256.1,CUST0068,Riddle-Faulkner,larry94@stevens.net
1529,1530,2024-07-22,Ph-WO-00198,Phoenix,no,Ph-EQ-00198-0,pressure_gauge,b'\xc5\x00\x00\x00\x00\x00\x00\x00',CUST0020,"Copeland, Torres and Morales",suzanne41@green.com
1530,1531,2024-06-22,Ph-WO-00199,Phoenix,no,Ph-EQ-00199-0,scale,b'\x91\x00\x00\x00\x00\x00\x00\x00',CUST0028,Neal-Kelley,dalvarado@evans.com


FAILED QC:


Unnamed: 0,id,workorder_id,equipment_id,equipment_type,branch,customer_name,accredited,technician,failed_reasons
0,1,Ch-WO-00000,Ch-EQ-00000-0,caliper,Chicago,Walker-Morales,no,Amanda Henry,tech_not_certified
1,2,Ch-WO-00001,Ch-EQ-00001-0,caliper,Chicago,Rodriguez-Summers,no,Amanda Henry,tech_not_certified
2,3,Ch-WO-00002,Ch-EQ-00002-0,pressure_gauge,Chicago,Kelley-Ramirez,yes,Traci Larson,tech_not_certified
3,4,Ch-WO-00003,Ch-EQ-00003-1,pressure_gauge,Chicago,"Wade, Cruz and White",yes,Laura Owens,tech_not_certified
4,5,Ch-WO-00003,Ch-EQ-00003-2,pressure_gauge,Chicago,"Wade, Cruz and White",yes,Laura Owens,tech_not_certified
...,...,...,...,...,...,...,...,...,...
447,448,Ph-WO-00183,Ph-EQ-00183-1,scale,Phoenix,Kelley-Young,no,Carol Rasmussen,incorrect_testpoint_count
448,449,Ph-WO-00186,Ph-EQ-00186-1,thermometer,Phoenix,Mccann-Hoffman,no,Lori Mcgee,tech_not_certified
449,450,Ph-WO-00186,Ph-EQ-00186-2,flow_meter,Phoenix,Mccann-Hoffman,yes,Stephanie Cooper,tech_not_certified
450,451,Ph-WO-00193,Ph-EQ-00193-0,thermometer,Phoenix,Wells Inc,yes,Lori Mcgee,tech_not_certified


Pipeline finished. All data is in ../db/calibration_data.db.
