In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from collections import Counter


# Custom Transformer for Feature Hashing
class FeatureHasherTransformer(BaseEstimator, TransformerMixin):
    """Hash whitespace-separated tokens of text documents into a dense matrix.

    Each document is split on whitespace, its tokens are counted, and the
    counts are projected onto ``n_features`` columns with scikit-learn's
    ``FeatureHasher`` (``input_type='dict'``).

    Parameters
    ----------
    n_features : int, default=100
        Number of output columns produced by the hasher.
    """

    def __init__(self, n_features=100):
        # Store the hyper-parameter only. Building the hasher here would
        # freeze it at the constructor value, so a later
        # set_params(n_features=...) would silently keep the old width.
        self.n_features = n_features

    @property
    def hasher(self):
        """A ``FeatureHasher`` configured with the current ``n_features``."""
        return FeatureHasher(n_features=self.n_features, input_type='dict')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer holds no fitted state."""
        return self

    def transform(self, X):
        """Return the hashed token counts of ``X`` as a dense array.

        Parameters
        ----------
        X : iterable of str
            Documents to hash (a pandas Series of strings, a list, ...).

        Returns
        -------
        numpy.ndarray
            Dense array with one row per document and ``n_features`` columns.
        """
        # Plain iteration (instead of the Series-only ``.str`` accessor) lets
        # any iterable of strings be transformed.
        token_dicts = (Counter(text.split()) for text in X)
        return self.hasher.transform(token_dicts).toarray()


# Read the training data
df = pd.read_csv("../data/train/cleaned_data.csv")

# Keep only rows whose target is present and strictly positive
# (the target is log-transformed further down).
df = df.dropna(subset=['material_price'])
df = df.loc[df['material_price'].gt(0)]

# Text columns that feed the model; gaps become the literal token 'missing'
text_columns = [
    'material_name',
    'material_type',
    'material_subtype',
    'surgeon_name',
    'procedure_name',
]
df[text_columns] = df[text_columns].fillna('missing').astype(str)

# One space-separated string per row, built from all text columns
df['combined_features'] = df[text_columns].agg(' '.join, axis=1)

# The model is trained on the natural log of the price
y_log = np.log(df['material_price'].to_numpy())

# 80/20 hold-out split with a fixed seed
X_train_text, X_test_text, y_train_log, y_test_log = train_test_split(
    df['combined_features'],
    y_log,
    test_size=0.2,
    random_state=42,
)


# Build pipeline: hashed token counts -> gradient-boosted regressor
pipeline = Pipeline([
    ('hasher', FeatureHasherTransformer(n_features=100)),
    ('model', XGBRegressor(n_estimators=100, max_depth=6,
                           learning_rate=0.1, random_state=42))
])

# Fit on the log-price target
pipeline.fit(X_train_text, y_train_log)

# Predict on the hold-out set and map both predictions and truth back
# from log space to the price scale
y_pred_log = pipeline.predict(X_test_text)
y_pred = np.exp(y_pred_log)
y_test = np.exp(y_test_log)

# np.exp never returns a negative value, so this clip is only a safeguard
# and num_clipped is expected to stay 0; both are kept so the printed
# report keeps its shape.
y_pred_clipped = np.clip(y_pred, 0, None)
num_clipped = np.sum(y_pred < 0)

# Evaluation on the price scale.
# RMSE is derived from MSE directly: the ``squared=False`` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, y_pred_clipped)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_clipped)
r2 = r2_score(y_test, y_pred_clipped)

print("=== Evaluation Metrics ===")
print(f"R² Score: {r2:.2f}")
print(f"RMSE:     {rmse:.2f}")
print(f"MSE:      {mse:.2f}")
print(f"MAE:      {mae:.2f}")
print(f"Clipped Predictions: {num_clipped}")


# 5-fold shuffled cross-validation of the whole pipeline on every row.
# The R² reported here is computed on the log-price target (y_log), whereas
# the hold-out metrics are computed on the price scale.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    df['combined_features'],
    y_log,
    cv=cv,
    scoring='r2',
)
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print(f"Cross-validated R²: {cv_mean:.2f} ± {cv_std:.2f}")

# Hold-out truth next to the model's prediction, both on the price scale
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred_clipped,
})
# Optional export, currently disabled:
#results_df.to_csv('predicted_vs_actual_prices.csv', index=False)
#print("CSV file has been saved as 'predicted_vs_actual_prices.csv'.")


  df = pd.read_csv("../data/train/cleaned_data.csv")


=== Evaluation Metrics ===
R² Score: 0.85
RMSE:     41.40
MSE:      1713.76
MAE:      24.27
Clipped Predictions: 0
Cross-validated R²: 0.85 ± 0.00


In [7]:
# === Apply the trained pipeline to new_data.csv with optimization logic ===

# Load the new data and rebuild the text feature the same way as for training
test_data = pd.read_csv("../data/test/new_data.csv")
text_columns = ['material_name', 'material_type', 'material_subtype',
                'surgeon_name', 'procedure_name']
test_data[text_columns] = test_data[text_columns].fillna('missing').astype(str)
test_data['combined_features'] = test_data[text_columns].agg(' '.join, axis=1)
# Rows whose surgeon is the literal "Standardized" are flagged as defaults
test_data['is_default'] = (test_data['surgeon_name'] == "Standardized").astype(int)

# Predict log-prices with the trained pipeline and map back to the price scale
test_pred_log = pipeline.predict(test_data['combined_features'])
test_data['predicted_price'] = np.clip(np.exp(test_pred_log), 0, None)

# === Fixed Optimization Logic ===
# For every procedure: take the default materials plus the materials from
# non-default rows whose action is not 'default', price each at its lowest
# predicted price within the procedure, and sum these into the procedure's
# optimized cost.
results = []
# One pass over the frame; sort=False keeps procedures in order of first
# appearance, matching iteration over .unique().
for proc_id, proc_data in test_data.groupby('procedure_id', sort=False):
    default_rows = proc_data['is_default'] == 1
    default_materials = set(proc_data.loc[default_rows, 'material_name'])

    surgeon_rows = (proc_data['is_default'] == 0) & (
        proc_data['surgeon_specific_action'] != 'default'
    )
    surgeon_added = set(proc_data.loc[surgeon_rows, 'material_name']) - default_materials
    all_materials = default_materials.union(surgeon_added)

    # Sorted iteration makes the output row order reproducible; iterating a
    # set of strings directly varies between interpreter runs.
    optimized_materials = {
        mat: proc_data.loc[proc_data['material_name'] == mat, 'predicted_price'].min()
        for mat in sorted(all_materials)
    }

    optimized_cost = sum(optimized_materials.values())

    results.extend(
        {
            'procedure_id': proc_id,
            'material_name': mat,
            'predicted_cost': price,
            'optimized_cost': optimized_cost,
        }
        for mat, price in optimized_materials.items()
    )

# Save results; explicit columns keep the CSV header even when there are no rows
optimized_df = pd.DataFrame(
    results,
    columns=['procedure_id', 'material_name', 'predicted_cost', 'optimized_cost'],
)
optimized_df.to_csv("xgb_test_optimization_results.csv", index=False)
print("\nSaved: xgb_test_optimization_results.csv")



Saved: xgb_test_optimization_results.csv
