In [2]:
import pandas as pd
import joblib
import os

In [3]:
import pandas as pd
import joblib
import os

# --- 1. CONFIGURATION ---
# File names used by the pipeline below. They are relative paths, so they are
# resolved against the current working directory; adjust them for your machine.
GOLD_FILE = "gold_data_CLEANED.csv"            # read in step 6 and tagged 'gold_verified'
SILVER_FILE = "pakwheels_silver_data.csv"      # read in step 3, cleaned and scored in steps 4-5
MODEL_FILE = r"inspector_model.pkl"       # raw string: backslashes in a Windows-style path stay literal
VECTORIZER_FILE = r"tfidf_vectorizer.pkl"      # loaded with joblib alongside the model in step 2
OUTPUT_FILE = "MASTER_CAR_DATASET.csv"         # merged, de-duplicated result written in step 6

# --- 2. LOAD THE AI BRAIN (This defines 'model' and 'vectorizer') ---
print("Loading AI Inspector...")
# Check both artifacts and report exactly which one(s) are missing, rather
# than always naming MODEL_FILE.
missing_artifacts = [
    path for path in (MODEL_FILE, VECTORIZER_FILE) if not os.path.exists(path)
]
if missing_artifacts:
    print(f"CRITICAL ERROR: Could not find model files at: {', '.join(missing_artifacts)}")
    print("Please check the file path.")
    # Stop here: the later prediction step needs both 'model' and 'vectorizer',
    # and carrying on would only fail later with a confusing NameError.
    raise FileNotFoundError(
        f"Missing model artifact(s): {', '.join(missing_artifacts)}"
    )

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that come from a trusted source.
model = joblib.load(MODEL_FILE)
vectorizer = joblib.load(VECTORIZER_FILE)
print("-> Model and Vectorizer loaded successfully.")

# --- 3. LOAD SILVER DATA ---
print("Loading Silver Data...")
# A missing silver file is reported with a message only; everything that
# depends on it (cleaning, scoring, merging) lives in the else branch and is
# therefore skipped in that case.
if not os.path.exists(SILVER_FILE):
    print(f"ERROR: Could not find {SILVER_FILE}")
else:
    # Read the raw silver listings; price, mileage and engine are cleaned below.
    df_silver = pd.read_csv(SILVER_FILE)
    
    # --- 4. CLEAN SILVER DATA (Using the fixed function) ---
    def clean_nums(x):
        """Convert a raw price / mileage / engine cell to a float.

        Strings are reduced to their first line, lower-cased, stripped of
        thousands separators and the tokens 'km', 'cc' and 'pkr', and then
        parsed. A 'crore' suffix multiplies the number by 10,000,000 and a
        'lac' / 'lacs' suffix by 100,000.

        Args:
            x: The raw cell value (usually a string; may already be numeric
                or missing).

        Returns:
            The parsed value as a float, or None when the value is missing or
            cannot be interpreted as a number.
        """
        # Cells that pandas already parsed as numbers pass straight through;
        # NaN (the only float not equal to itself) counts as missing.
        if isinstance(x, (int, float)) and not isinstance(x, bool):
            value = float(x)
            return None if value != value else value
        if not isinstance(x, str):
            return None

        # Keep only the first line to drop trailers such as "Managed by PakWheels".
        text = x.split("\n")[0].lower()
        for token in (',', 'km', 'cc', 'pkr'):
            text = text.replace(token, '')
        text = text.strip()

        # Unit words scale the number; 'lac' also matches 'lacs'.
        multiplier = 1
        has_unit = True
        if 'crore' in text:
            multiplier = 10000000
            text = text.replace('crore', '')
        elif 'lac' in text:
            multiplier = 100000
            text = text.replace('lacs', '').replace('lac', '')
        else:
            has_unit = False
        text = text.strip()

        # Plain values must look like a non-negative decimal number.
        if not has_unit and not text.replace('.', '', 1).isdigit():
            return None

        try:
            return float(text) * multiplier
        except ValueError:
            # e.g. "black" (contains 'lac') or a unit followed by extra words:
            # treat as unparseable instead of aborting the whole apply().
            return None

    # Run every numeric column through clean_nums; values it cannot parse
    # come back as missing.
    numeric_columns = ['price', 'mileage', 'engine']
    for column in numeric_columns:
        df_silver[column] = df_silver[column].apply(clean_nums)

    # Keep only listings where all three numeric fields survived cleaning,
    # and replace missing descriptions with empty text.
    df_silver = df_silver.dropna(subset=numeric_columns)
    df_silver['description'] = df_silver['description'].fillna('')

    print(f"-> Silver Data Ready: {len(df_silver)} cars.")

    # --- 5. PREDICT SCORES ---
    print("AI Inspector is grading the Silver cars...")
    # Uses the 'vectorizer' and 'model' objects loaded in step 2: the
    # descriptions are vectorized and the model predicts one score per row.
    X_silver = vectorizer.transform(df_silver['description'])
    predicted_scores = model.predict(X_silver)

    # Store the prediction rounded to one decimal and tag where the rows came from.
    df_silver['inspection_score'] = predicted_scores.round(1)
    df_silver['data_source'] = 'silver_predicted'

    # --- 6. MERGE WITH GOLD ---
    print("Loading Gold Data...")
    if os.path.exists(GOLD_FILE):
        df_gold = pd.read_csv(GOLD_FILE)
        df_gold['data_source'] = 'gold_verified'

        print("Merging Datasets...")
        # Gold rows go first so that, for a URL present in both sets, the
        # keep='first' rule below retains the gold row.
        df_master = pd.concat([df_gold, df_silver], ignore_index=True)

        # Deduplicate on URL. Missing URLs compare equal to each other in
        # pandas, so a plain drop_duplicates would collapse every row without
        # a URL into a single row; only rows that actually have a URL are
        # treated as potential duplicates.
        repeated_url = df_master['url'].notna() & df_master.duplicated(
            subset=['url'], keep='first'
        )
        df_master = df_master[~repeated_url]

        # Save
        df_master.to_csv(OUTPUT_FILE, index=False)
        print(f"\nSUCCESS! Master Dataset created: {OUTPUT_FILE}")
        print(f"Total Cars: {len(df_master)}")
    else:
        print(f"ERROR: Could not find {GOLD_FILE}")

Loading AI Inspector...
-> Model and Vectorizer loaded successfully.
Loading Silver Data...
-> Silver Data Ready: 3302 cars.
AI Inspector is grading the Silver cars...
Loading Gold Data...
Merging Datasets...

SUCCESS! Master Dataset created: MASTER_CAR_DATASET.csv
Total Cars: 2772
