# Invoice & Failed QC Processor

In [2]:
import os
import pandas as pd
import glob

## 1. Load central references

In [4]:
os.chdir("..")
central_dir = "./processed_data/centralized" 

In [5]:
df_customers = pd.read_csv(os.path.join(central_dir, "customer_master.csv"))  
print(df_customers.sample(5))

   customer_id                customer_name          location  \
25    CUST0025               Atkinson Group        Connieview   
40    CUST0040  Humphrey, Mullins and Moore         Jasonstad   
57    CUST0057                  Clark-Greer      Kennethshire   
94    CUST0094               Walker-Morales      South Joseph   
47    CUST0047                 Freeman-Rice  South Andrewfort   

                                 email  
25  wrodriguez@richardson-caldwell.com  
40                   jake00@harris.biz  
57                  hoconnor@costa.com  
94             kellyjennifer@moore.org  
47               pricesandra@mckay.net  


In [6]:
df_pricing = pd.read_csv(os.path.join(central_dir, "equipment_pricing.csv"))  
print(df_pricing.sample(5))

         branch equipment_type  standard_price  accredited_price
12      Houston        caliper             134             174.2
27      Phoenix          scale             145             188.5
7   Los_Angeles         torque             231             300.3
16      Houston    thermometer             113             146.9
29      Phoenix     flow_meter             189             245.7


In [7]:
df_tech = pd.read_excel(os.path.join(central_dir, "technician_training_matrix.xlsx"))  
print(df_tech.sample(5))

            technician  equipment_type
62    Stephanie Cooper           scale
27  Jennifer Mccormick     thermometer
1         Austin Huynh      flow_meter
67          Lori Mcgee  pressure_gauge
49         Laura Owens         caliper


Create a set of (technician, equipment_type) pairs for easy checking

In [8]:
certified_pairs = set(zip(df_tech["technician"], df_tech["equipment_type"]))

## 2. Prepare output container

In [9]:
invoice_rows = []
failed_rows = []

In [10]:
cities = ["Chicago", "Houston", "Los_Angeles", "New_York", "Phoenix"]

In [11]:
for city in cities:
    city_dir = os.path.join("./processed_data", city)
    
    # paths
    workorders_path = os.path.join(city_dir, "workorders", "workorders.csv")
    testpoints_path = os.path.join(city_dir, "test_points", "merged_test_points.csv")
    
    # skip if missing
    if not (os.path.exists(workorders_path) and os.path.exists(testpoints_path)):
        print(f"Skipping {city} - missing files.")
        continue
    
    df_work = pd.read_csv(workorders_path)   # columns: [workorder_id, equipment_id, equipment_type, branch, customer_name, accredited, technician]
    df_test = pd.read_csv(testpoints_path)   # columns: [equipment_id, completed_date, testpoint_count]
    
    # Merge so each row from workorders has completed_date + testpoint_count
    merged = pd.merge(df_work, df_test, on="equipment_id", how="left")

    for idx, row in merged.iterrows():
        fail_reasons = []
        
        workorder_id = row["workorder_id"]
        equipment_id = row["equipment_id"]
        equipment_type = row["equipment_type"]
        branch = row["branch"]
        customer_name = row["customer_name"]
        accredited_flag = str(row["accredited"]).lower()  # "yes" or "no" or something else
        technician = row["technician"]
        
        completed_date = row["completed_date"]
        testpoint_count = row["testpoint_count"]
        
        # -- (1) Check customer matches
        # For simplicity, use exact match on "customer_name"
        if not (df_customers["customer_name"] == customer_name).any():
            fail_reasons.append("customer_not_found")
        
        # -- (2) Technician certification
        # must appear in (technician, equipment_type) pairs
        if (technician, equipment_type) not in certified_pairs:
            fail_reasons.append("tech_not_certified")
        
        # -- (3) Completed date present
        if pd.isna(completed_date) or str(completed_date).strip() == "":
            fail_reasons.append("missing_completed_date")
        
        # -- (4) Pricing found
        # We need to find a row in df_pricing with same branch & equip_type
        price_subset = df_pricing[
            (df_pricing["branch"] == branch) &
            (df_pricing["equipment_type"] == equipment_type)
        ]
        price = None
        if len(price_subset) == 0:
            fail_reasons.append("pricing_not_found")
        else:
            # If we found exactly 1 row, pick the right column
            # If multiple rows or 1 row is found, we'll handle that carefully
            pr = price_subset.iloc[0]
            if accredited_flag == "yes":
                price = pr["accredited_price"]
            else:
                price = pr["standard_price"]
        
        # -- (5) Testpoint count requirement
        # Suppose "yes" => needs 15 points, "no" => needs 10
        # Adjust if your rules differ
        if accredited_flag == "yes":
            required_points = 15
        else:
            required_points = 10
        
        # if testpoint_count is not numeric or < required_points => fail
        if pd.isna(testpoint_count) or (testpoint_count < required_points):
            fail_reasons.append("insufficient_test_points")
        
        # If we have any failure, push to failed
        if fail_reasons:
            failed_rows.append({
                "workorder_id": workorder_id,
                "equipment_id": equipment_id,
                "equipment_type": equipment_type,
                "branch": branch,
                "customer_name": customer_name,
                "accredited": accredited_flag,
                "technician": technician,
                "failed_reasons": ";".join(fail_reasons)
            })
        else:
            # Otherwise, pass QC
            invoice_rows.append({
                "workorder_id": workorder_id,
                "equipment_id": equipment_id,
                "equipment_type": equipment_type,
                "branch": branch,
                "customer_name": customer_name,
                "accredited": accredited_flag,
                "technician": technician,
                "completed_date": completed_date,
                "testpoint_count": testpoint_count,
                "price": price
            })



In [12]:
# 5. After loop, build final DataFrames
df_invoices = pd.DataFrame(invoice_rows)
df_failed = pd.DataFrame(failed_rows)

In [14]:
print(df_invoices)

     workorder_id   equipment_id  equipment_type   branch  \
0     Ch-WO-00001  Ch-EQ-00001-1          torque  Chicago   
1     Ch-WO-00001  Ch-EQ-00001-2          torque  Chicago   
2     Ch-WO-00002  Ch-EQ-00002-1          torque  Chicago   
3     Ch-WO-00002  Ch-EQ-00002-2      flow_meter  Chicago   
4     Ch-WO-00003  Ch-EQ-00003-0          torque  Chicago   
...           ...            ...             ...      ...   
1527  Ph-WO-00196  Ph-EQ-00196-1           scale  Phoenix   
1528  Ph-WO-00197  Ph-EQ-00197-0  pressure_gauge  Phoenix   
1529  Ph-WO-00198  Ph-EQ-00198-0  pressure_gauge  Phoenix   
1530  Ph-WO-00199  Ph-EQ-00199-0           scale  Phoenix   
1531  Ph-WO-00199  Ph-EQ-00199-1          torque  Phoenix   

                     customer_name accredited        technician  \
0                Rodriguez-Summers         no      Amanda Henry   
1                Rodriguez-Summers         no       Laura Owens   
2                   Kelley-Ramirez        yes       Laura Owens   

In [15]:
print(df_failed)

    workorder_id   equipment_id  equipment_type   branch  \
0    Ch-WO-00000  Ch-EQ-00000-0         caliper  Chicago   
1    Ch-WO-00001  Ch-EQ-00001-0         caliper  Chicago   
2    Ch-WO-00002  Ch-EQ-00002-0  pressure_gauge  Chicago   
3    Ch-WO-00003  Ch-EQ-00003-1  pressure_gauge  Chicago   
4    Ch-WO-00003  Ch-EQ-00003-2  pressure_gauge  Chicago   
..           ...            ...             ...      ...   
447  Ph-WO-00183  Ph-EQ-00183-1           scale  Phoenix   
448  Ph-WO-00186  Ph-EQ-00186-1     thermometer  Phoenix   
449  Ph-WO-00186  Ph-EQ-00186-2      flow_meter  Phoenix   
450  Ph-WO-00193  Ph-EQ-00193-0     thermometer  Phoenix   
451  Ph-WO-00196  Ph-EQ-00196-0      flow_meter  Phoenix   

            customer_name accredited        technician  \
0          Walker-Morales         no      Amanda Henry   
1       Rodriguez-Summers         no      Amanda Henry   
2          Kelley-Ramirez        yes      Traci Larson   
3    Wade, Cruz and White        yes       Laur

In [None]:

final_dir = "./processed_data/final"
os.makedirs(final_dir, exist_ok=True)
df_invoices.to_csv(os.path.join(final_dir, "invoices.csv"), index=False)
df_failed.to_csv(os.path.join(final_dir, "failed_qc.csv"), index=False)

print("Done! Created invoices.csv and failed_qc.csv with the new 5-step QC.")