In [4]:
"""
CNC Pricing Model
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

# ========================================
# LOAD AND CLEAN DATA
# ========================================

print("="*60)
print("LOADING UPDATED DATA")
print("="*60)

# Load CSV file with new data
df = pd.read_csv('data/cnc_historical_jobs.csv')

print(f"\n Raw data loaded: {len(df)} jobs")

# ========================================
# CLEAN THE DATA
# ========================================

print("\n--- Cleaning Data ---")

# Remove commas from price column and convert to float
if df['price'].dtype == 'object':
    print("Price column has text format (with commas)")
    df['price'] = df['price'].str.replace(',', '').str.replace('â‚¦', '').str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    print("Converted prices to numbers")

# Clean other numeric columns that might have commas
numeric_columns = ['thickness_mm', 'width_mm', 'height_mm', 'cutting_time_minutes']
for col in numeric_columns:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = df[col].astype(str).str.replace(',', '').str.strip()
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Remove any rows with missing/invalid data
print(f"\n--- Checking for missing values ---")
missing_before = len(df)
df = df.dropna()
missing_after = len(df)

if missing_before > missing_after:
    print(f"Removed {missing_before - missing_after} rows with missing/invalid data")
else:
    print("No missing values!")

print(f"\n Clean data: {len(df)} jobs")
print(f"Price range: â‚¦{df['price'].min():,.2f} - â‚¦{df['price'].max():,.2f}")
print(f"Average price: â‚¦{df['price'].mean():,.2f}")

# Show first few rows to verify
print("\n--- Sample Data (first 5 rows) ---")
print(df.head())

# ========================================
# PREPARE DATA
# ========================================

print("\n" + "="*60)
print("PREPARING DATA FOR TRAINING")
print("="*60)

# One-hot encode categorical variables
df_encoded = df.copy()
df_encoded = pd.get_dummies(df_encoded, columns=['material', 'cutting_type'], 
                            prefix=['mat', 'cut'])

# Separate features and target
X = df_encoded.drop('price', axis=1)
y = df_encoded['price']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\n Training samples: {len(X_train)}")
print(f" Testing samples: {len(X_test)}")

# ========================================
# TRAIN MODEL
# ========================================

print("\n" + "="*60)
print("TRAINING NEW MODEL")
print("="*60)

model = RandomForestRegressor(
    n_estimators=150,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("\nTraining in progress...")
model.fit(X_train, y_train)
print("Model trained successfully!")

# ========================================
# EVALUATE PERFORMANCE
# ========================================

print("\n" + "="*60)
print("MODEL PERFORMANCE")
print("="*60)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"\n Accuracy Metrics:")
print(f"   Mean Absolute Error: â‚¦{mae:,.2f}")
print(f"   RÂ² Score: {r2:.3f}")
print(f"   Average % Error: {mape:.1f}%")

print(f"\n Performance Assessment:")
if r2 > 0.85:
    print(f"   EXCELLENT! Model is highly accurate (RÂ²={r2:.3f})")
elif r2 > 0.70:
    print(f"   GOOD! Model has good accuracy (RÂ²={r2:.3f})")
elif r2 > 0.50:
    print(f"   MODERATE. Consider adding more data (RÂ²={r2:.3f})")
else:
    print(f"   POOR. Need more training data (RÂ²={r2:.3f})")

# ========================================
# FEATURE IMPORTANCE
# ========================================

print("\n" + "="*60)
print("TOP PRICING FACTORS")
print("="*60)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False).head(10)

print("\nTop 10 factors that affect price:")
for idx, row in feature_importance.iterrows():
    bar = 'â–ˆ' * int(row['Importance'] * 100)
    print(f"{row['Feature']:30s} {bar} {row['Importance']:.3f}")

# ========================================
# SAVE NEW MODEL
# ========================================

print("\n" + "="*60)
print("SAVING MODEL")
print("="*60)

# Save model with column information
model_data = {
    'model': model,
    'columns': X.columns,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_jobs': len(df),
    'r2_score': r2,
    'mae': mae
}

with open('data/cnc_laser_pricing_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model saved to: data/cnc_laser_pricing_model.pkl")
print(f"\n Model Info:")
print(f"   - Trained on: {len(df)} jobs")
print(f"   - Accuracy (RÂ²): {r2:.3f}")
print(f"   - Average Error: â‚¦{mae:,.2f}")
print(f"   - Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")

# ========================================
# TEST PREDICTION
# ========================================

print("\n" + "="*60)
print("TESTING NEW MODEL")
print("="*60)

# Test with a sample job
test_job = {
    'material': 'Acrylic',
    'thickness_mm': 6,
    'num_letters': 15,
    'num_shapes': 1,
    'complexity_score': 3,
    'has_intricate_details': 0,
    'width_mm': 500,
    'height_mm': 300,
    'cutting_type': 'Laser Cutting',
    'cutting_time_minutes': 30,
    'quantity': 1,
    'rush_job': 0
}

# Prepare for prediction
test_df = pd.DataFrame([test_job])
test_df = pd.get_dummies(test_df, columns=['material', 'cutting_type'], 
                        prefix=['mat', 'cut'])

for col in X.columns:
    if col not in test_df.columns:
        test_df[col] = 0

test_df = test_df[X.columns]
test_price = model.predict(test_df)[0]

print("\n Test Job:")
print("Acrylic (6mm), 500x300mm, 15 letters")
print(f"\n Predicted Price: â‚¦{test_price:,.2f}")

print("\n" + "="*60)
print("RETRAINING COMPLETE!")
print("="*60)
print("\n Next steps:")
print("1. Check the RÂ² score above")
print("2. If RÂ² > 0.7, your model is ready!")
print("3. Go back to VS Code and restart your web server")
print("4. Test the new pricing accuracy")
print("="*60)

LOADING UPDATED DATA

ðŸ“Š Raw data loaded: 50 jobs

--- Cleaning Data ---

--- Checking for missing values ---
No missing values!

 Clean data: 50 jobs
Price range: â‚¦2,500.00 - â‚¦20,000.00
Average price: â‚¦8,180.00

--- Sample Data (first 5 rows) ---
  material  thinkness_mm  num_letters  num_shapes  complexity_score  \
0  Acrylic             3           52           8                 3   
1  Acrylic             3            0          10                 1   
2  Acrylic             3           40           0                 2   
3  Acrylic             3          109          20                 4   
4  Acrylic             3           30          41                 3   

   has_intricate_details  width_mm  height_mm   cutting_type  \
0                      1     914.4      914.4  laser cutting   
1                      1     914.4      914.4  laser cutting   
2                      1     609.6      914.4  laser cutting   
3                      1    1219.2      914.4  laser cutting 