In [66]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [68]:
# Load your dataset (assuming cleaned_data is a pandas DataFrame)
cleaned_data = pd.read_csv('cleaned_data.csv', sep=',')
cleaned_data

  cleaned_data = pd.read_csv('cleaned_data.csv', sep=',')


Unnamed: 0,organization_id,OrganizationName,specialty,procedure_id,procedure_name,protocol_id,protocol_name,status,codes,material_id,...,material_subtype,manufacturer,hold,article_number,surgeon_specific_action,surgeon_name,surgeon_surname,Specialty,material_price,total_procedure_price
0,b23d4475-e7c9-4c57-a017-61f82113e265,Rochdale Infirmary,OBSTETRICS_GYNAECOLOGY,295,Diagnostic Laparoscopy +/- Treatment including...,7766,Diagnostic Laparoscopy +/- Treatment including...,Published,,85583,...,,,1,,ADDED,G Ahmad,(Ms Ahmad),,33.50,33.500000
1,b23d4475-e7c9-4c57-a017-61f82113e265,Rochdale Infirmary,OBSTETRICS_GYNAECOLOGY,295,Diagnostic Laparoscopy +/- Treatment including...,7766,Diagnostic Laparoscopy +/- Treatment including...,Published,,85585,...,,,1,,ADDED,G Ahmad,(Ms Ahmad),,201.59,201.591800
2,b23d4475-e7c9-4c57-a017-61f82113e265,Rochdale Infirmary,OBSTETRICS_GYNAECOLOGY,295,Diagnostic Laparoscopy +/- Treatment including...,7766,Diagnostic Laparoscopy +/- Treatment including...,Published,,85597,...,SUTURES,,1,,ADDED,G Ahmad,(Ms Ahmad),,92.74,92.742800
3,b23d4475-e7c9-4c57-a017-61f82113e265,Rochdale Infirmary,OBSTETRICS_GYNAECOLOGY,295,Diagnostic Laparoscopy +/- Treatment including...,7766,Diagnostic Laparoscopy +/- Treatment including...,Published,,85598,...,,,1,,ADDED,G Ahmad,(Ms Ahmad),,35.84,35.836000
4,b23d4475-e7c9-4c57-a017-61f82113e265,Rochdale Infirmary,OBSTETRICS_GYNAECOLOGY,295,Diagnostic Laparoscopy +/- Treatment including...,7766,Diagnostic Laparoscopy +/- Treatment including...,Published,,85602,...,,,1,,ADDED,G Ahmad,(Ms Ahmad),,37.84,37.836000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93605,cc213081-6d8b-4149-89e0-8099d678b243,IJsselland Ziekenhuis,UROLOGY,52435,Urethrastrictuur Opheffen - Sachse,9178,Ureterotomie vlgs. Sachse,Published,"36496-1,36496,36496-1\URO",1348,...,LIQUIDS_MEDICINES,,0,,DEFAULT,,,,,0.000000
93606,cc213081-6d8b-4149-89e0-8099d678b243,IJsselland Ziekenhuis,UROLOGY,52435,Urethrastrictuur Opheffen - Sachse,9178,Ureterotomie vlgs. Sachse,Published,"36496-1,36496,36496-1\URO",1715,...,LIQUIDS_MEDICINES,,0,,DEFAULT,,,,5.91,5.909000
93607,cc213081-6d8b-4149-89e0-8099d678b243,IJsselland Ziekenhuis,UROLOGY,52435,Urethrastrictuur Opheffen - Sachse,9178,Ureterotomie vlgs. Sachse,Published,"36496-1,36496,36496-1\URO",2815,...,,,0,,DEFAULT,,,,122.75,122.752644
93608,cc213081-6d8b-4149-89e0-8099d678b243,IJsselland Ziekenhuis,UROLOGY,52435,Urethrastrictuur Opheffen - Sachse,9178,Ureterotomie vlgs. Sachse,Published,"36496-1,36496,36496-1\URO",14456,...,DEVICE,,0,,DEFAULT,,,,442.89,442.888300


In [70]:
def prepare_data(cleaned_data):
    df = cleaned_data.copy()
    
    # Fill NaN with 'DEFAULT' for categorical columns
    categorical_cols = ['procedure_name', 'material_name', 'surgeon_specific_action', 'specialty']
    df[categorical_cols] = df[categorical_cols].fillna('DEFAULT')
    df['material_price'] = df['material_price'].fillna(0.0)
    
    # # Fill NaN with 0.0 for material_price (since it's numerical)
    # df['material_price'] = df['material_price'].fillna(0.0)    # Encode categorical variables
    le_procedure = LabelEncoder()
    le_material = LabelEncoder()
    le_action = LabelEncoder()
    le_specialty = LabelEncoder()
    
    df['procedure_encoded'] = le_procedure.fit_transform(df['procedure_name'])
    df['material_encoded'] = le_material.fit_transform(df['material_name'])
    df['action_encoded'] = le_action.fit_transform(df['surgeon_specific_action'])
    df['specialty_encoded'] = le_specialty.fit_transform(df['specialty'])

    
    return df, le_procedure, le_material, le_action, le_specialty

In [72]:
print('The data contains the following NaN values:' + str(cleaned_data.isna().sum().sum()))
print('The data contains the following Null values:' + str(cleaned_data.isnull().sum().sum()))

The data contains the following NaN values:436269
The data contains the following Null values:436269


In [74]:
def train_and_predict_all_materials(cleaned_data, procedure_name):
    # Prepare the data
    df, le_procedure, le_material, le_action, le_specialty = prepare_data(cleaned_data)
    
    # Since we want all materials, we'll use the actual data instead of prediction
    # Filter for the specific procedure
    procedure_encoded = le_procedure.transform([procedure_name])[0]
    procedure_data = df[df['procedure_encoded'] == procedure_encoded]
    
    if len(procedure_data) == 0:
        raise ValueError(f"No data found for procedure: {procedure_name}")
    
    # Create results list
    results = []
    for _, row in procedure_data.iterrows():
        result = {
            'procedure_name': procedure_name,
            'specialty': le_specialty.inverse_transform([row['specialty_encoded']])[0],
            'material_name': le_material.inverse_transform([row['material_encoded']])[0],
            'material_price': round(float(row['material_price']), 2),
            'surgeon_specific_action': le_action.inverse_transform([row['action_encoded']])[0]
        }
        results.append(result)
    
    return results

**regressor**

In [84]:
def train_predictive_model(cleaned_data):
    df, le_procedure, le_material, le_action, le_specialty = prepare_data(cleaned_data)
    
    # Features and targets
    y = df[['material_price']]
    X = df[['material_encoded', 'procedure_encoded', 'action_encoded', 'specialty_encoded']]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train single model
    model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    
    return model, X_test, y_test  # Returning 3 values

In [86]:
def get_model_performance(cleaned_data):
    model, X_test, y_test = train_predictive_model(cleaned_data)
    score = model.score(X_test, y_test)
    print(f"Model R² Score: {score}")
    return score

In [88]:
try:
    procedure_to_predict = "Laparoscopic Sterilisation / Salpingectomy for Sterilisation"
    predictions = train_and_predict_all_materials(cleaned_data, procedure_to_predict)
    print(f"Existing materials for {procedure_to_predict}:")
    for pred in predictions:
        print(f"Material: {pred['material_name']}")
        print(f"Price: ${pred['material_price']}")
        print(f"Surgeon Specific: {pred['surgeon_specific_action']}")
        print(f"Specialty: {pred['specialty']}")
        print("---")
except Exception as e:
    print(f"An error occurred: {str(e)}")

# Check model performance
performance = get_model_performance(cleaned_data)

Existing materials for Laparoscopic Sterilisation / Salpingectomy for Sterilisation:
Material: normal saline
Price: $16.35
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: suction and irrigation
Price: $206.59
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic handpiece
Price: $37.84
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic scalpel
Price: $201.59
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: verres needle
Price: $134.37
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: syringe 10ml
Price: $11.99
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: sterishot applicator
Price: $31.5
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: curved enseal 37cm
Price: $31.5
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: normal saline
Price: $13.35
Surgeon Specific: DEFAULT
Specia

  return fit_method(estimator, *args, **kwargs)


Model R² Score: 0.6508913367669306


**classifier**

In [90]:
def train_predictive_model(cleaned_data):
    df, le_procedure, le_material, le_action, le_specialty = prepare_data(cleaned_data)
    
    # Features and targets
    X = df[['procedure_encoded']]
    y = df[['material_encoded', 'material_price', 'action_encoded', 'specialty_encoded']]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train single model
    model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    
    return model, X_test, y_test  # Returning 3 values

In [91]:
def get_model_performance(cleaned_data):
    model, X_test, y_test = train_predictive_model(cleaned_data)
    score = model.score(X_test, y_test)
    print(f"Model R² Score: {score}")
    return score

In [92]:
try:
    procedure_to_predict = "Laparoscopic Sterilisation / Salpingectomy for Sterilisation"
    predictions = train_and_predict_all_materials(cleaned_data, procedure_to_predict)
    print(f"Existing materials for {procedure_to_predict}:")
    for pred in predictions:
        print(f"Material: {pred['material_name']}")
        print(f"Price: ${pred['material_price']}")
        print(f"Surgeon Specific: {pred['surgeon_specific_action']}")
        print(f"Specialty: {pred['specialty']}")
        print("---")
except Exception as e:
    print(f"An error occurred: {str(e)}")

# Check model performance
performance = get_model_performance(cleaned_data)

Existing materials for Laparoscopic Sterilisation / Salpingectomy for Sterilisation:
Material: normal saline
Price: $16.35
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: suction and irrigation
Price: $206.59
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic handpiece
Price: $37.84
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic scalpel
Price: $201.59
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: verres needle
Price: $134.37
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: syringe 10ml
Price: $11.99
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: sterishot applicator
Price: $31.5
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: curved enseal 37cm
Price: $31.5
Surgeon Specific: DEFAULT
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: normal saline
Price: $13.35
Surgeon Specific: DEFAULT
Specia

ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

**lasso**

In [134]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor

# Prepare the data
def prepare_data(cleaned_data):
    df = cleaned_data.copy()
    selected_cols = ['procedure_name', 'material_name', 'surgeon_specific_action', 'material_price']
    df = df[selected_cols]
    
    # Fill NaN values
    categorical_cols = ['procedure_name', 'material_name', 'surgeon_specific_action']
    df[categorical_cols] = df[categorical_cols].fillna('DEFAULT')
    df['material_price'] = df['material_price'].fillna(0.0)
    
    # Encode categorical variables
    le_procedure = LabelEncoder()
    le_material = LabelEncoder()
    le_action = LabelEncoder()
    
    df['procedure_encoded'] = le_procedure.fit_transform(df['procedure_name'])
    df['material_encoded'] = le_material.fit_transform(df['material_name'])
    df['action_encoded'] = le_action.fit_transform(df['surgeon_specific_action'])
    
    return df, le_procedure, le_material, le_action

# Retrieve existing materials
def train_and_predict_all_materials(cleaned_data, procedure_name):
    df, le_procedure, le_material, le_action = prepare_data(cleaned_data)
    procedure_encoded = le_procedure.transform([procedure_name])[0]
    procedure_data = df[df['procedure_encoded'] == procedure_encoded]
    
    if len(procedure_data) == 0:
        raise ValueError(f"No data found for procedure: {procedure_name}")
    
    results = []
    for _, row in procedure_data.iterrows():
        result = {
            'procedure_name': procedure_name,
            'material_name': le_material.inverse_transform([int(row['material_encoded'])])[0],
            'material_price': round(float(row['material_price']), 2),
            'surgeon_specific_action': le_action.inverse_transform([int(row['action_encoded'])])[0]
        }
        results.append(result)
    
    return results

# Predictive model with Lasso regression
def train_predictive_model(cleaned_data):
    df, le_procedure, le_material, le_action = prepare_data(cleaned_data)
    
    # Features and targets
    X = df[['procedure_encoded']]
    y = df[['material_encoded', 'material_price', 'action_encoded']]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train Lasso model wrapped in MultiOutputRegressor
    lasso = Lasso(alpha=1.0, random_state=42)  # alpha controls regularization strength
    model = MultiOutputRegressor(lasso)
    model.fit(X_train, y_train)
    
    return model, X_test, y_test

# Get model performance
def get_model_performance(cleaned_data):
    model, X_test, y_test = train_predictive_model(cleaned_data)
    score = model.score(X_test, y_test)
    print(f"Model R² Score with Lasso: {score}")
    return score

# Example usage for retrieval
try:
    procedure_to_predict = "Laparoscopic Sterilisation / Salpingectomy for Sterilisation"
    predictions = train_and_predict_all_materials(cleaned_data, procedure_to_predict)
    print(f"Existing materials for {procedure_to_predict}:")
    for pred in predictions:
        print(f"Material: {pred['material_name']}")
        print(f"Price: ${pred['material_price']}")
        print(f"Surgeon Specific: {pred['surgeon_specific_action']}")
        print("---")
except Exception as e:
    print(f"An error occurred: {str(e)}")

# Check model performance
performance = get_model_performance(cleaned_data)

Existing materials for Laparoscopic Sterilisation / Salpingectomy for Sterilisation:
Material: normal saline
Price: $16.35
Surgeon Specific: DEFAULT
---
Material: suction and irrigation
Price: $206.59
Surgeon Specific: DEFAULT
---
Material: harmonic handpiece
Price: $37.84
Surgeon Specific: DEFAULT
---
Material: harmonic scalpel
Price: $201.59
Surgeon Specific: DEFAULT
---
Material: verres needle
Price: $134.37
Surgeon Specific: DEFAULT
---
Material: syringe 10ml
Price: $11.99
Surgeon Specific: DEFAULT
---
Material: sterishot applicator
Price: $31.5
Surgeon Specific: DEFAULT
---
Material: curved enseal 37cm
Price: $31.5
Surgeon Specific: DEFAULT
---
Material: normal saline
Price: $13.35
Surgeon Specific: DEFAULT
---
Material: suction and irrigation
Price: $204.59
Surgeon Specific: DEFAULT
---
Material: harmonic handpiece
Price: $41.84
Surgeon Specific: DEFAULT
---
Material: harmonic scalpel
Price: $201.59
Surgeon Specific: DEFAULT
---
Material: verres needle
Price: $142.37
Surgeon Spec

**all procedures**

In [120]:
from sklearn.preprocessing import LabelEncoder

def prepare_data(cleaned_data):
    df = cleaned_data.copy()
    # Fill NaN with 'DEFAULT' for categorical columns
    categorical_cols = ['procedure_name', 'material_name', 'surgeon_specific_action', 'specialty']
    df[categorical_cols] = df[categorical_cols].fillna('DEFAULT')
    df['material_price'] = df['material_price'].fillna(0.0)
    
    # Encode categorical variables
    le_procedure = LabelEncoder()
    le_material = LabelEncoder()
    le_action = LabelEncoder()
    le_specialty = LabelEncoder()
    
    df['procedure_encoded'] = le_procedure.fit_transform(df['procedure_name'])
    df['material_encoded'] = le_material.fit_transform(df['material_name'])
    df['action_encoded'] = le_action.fit_transform(df['surgeon_specific_action'])
    df['specialty_encoded'] = le_specialty.fit_transform(df['specialty'])
    
    return df, le_procedure, le_material, le_action, le_specialty

In [122]:
from sklearn.ensemble import RandomForestRegressor

def train_predictive_model(cleaned_data):
    df, le_procedure, le_material, le_action, le_specialty = prepare_data(cleaned_data)
    
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    X = df[feature_names]  # Ensure order is consistent
    y = df['material_price']
    
    # Train model on full dataset
    model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    model.fit(X, y)
    
    return model, df, le_procedure, le_material, le_action, le_specialty

In [124]:
def list_all_procedure_materials(cleaned_data, n_procedures=None):
    # Get trained model and necessary components
    model, df, le_procedure, le_material, le_action, le_specialty = train_predictive_model(cleaned_data)
    
    # Get unique procedures
    unique_procedures = df['procedure_name'].unique()
    
    # Limit to n_procedures if specified
    if n_procedures is not None:
        unique_procedures = unique_procedures[:n_procedures]
    
    results = {}
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']    
    # Process each procedure
    for procedure in unique_procedures:
        procedure_encoded = le_procedure.transform([procedure])[0]
        procedure_data = df[df['procedure_encoded'] == procedure_encoded]
        
        procedure_materials = []
        for _, row in procedure_data.iterrows():
            # Create a DataFrame for prediction to maintain feature names
            input_data = pd.DataFrame([[row['procedure_encoded'], 
                                       row['material_encoded'], 
                                       row['action_encoded'], 
                                       row['specialty_encoded']]],
                                     columns=feature_names)
            
            material_info = {
                'material_name': le_material.inverse_transform([row['material_encoded']])[0],
                'actual_price': round(float(row['material_price']), 2),
                'predicted_price': round(float(model.predict(input_data)[0]), 2),
                'surgeon_specific_action': le_action.inverse_transform([row['action_encoded']])[0],
                'specialty': le_specialty.inverse_transform([row['specialty_encoded']])[0]
            }
            procedure_materials.append(material_info)
        
        results[procedure] = procedure_materials
    
    return results

In [128]:
try:
    n_procedures_to_show = 2
    # Get all procedure materials
    all_procedures_data = list_all_procedure_materials(cleaned_data, n_procedures=n_procedures_to_show)    
    # Print results
    print(f"Materials for the first {n_procedures_to_show} procedure(s) in the dataset:")
    print("==========================================")
    
    for procedure, materials in all_procedures_data.items():
        print(f"\nProcedure: {procedure}")
        print("-----------------")
        for material in materials:
            print(f"Material: {material['material_name']}")
            print(f"Actual Price: ${material['actual_price']}")
            print(f"Predicted Price: ${material['predicted_price']}")
            print(f"Surgeon Specific Action: {material['surgeon_specific_action']}")
            print(f"Specialty: {material['specialty']}")
            print("---")
            
    # Calculate and print model performance (still based on full training data)
    model, df, _, _, _, _ = train_predictive_model(cleaned_data)
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    X = df[feature_names]  # Ensure order is consistent
    y = df['material_price']
    score = model.score(X, y)
    print(f"\nModel R² Score on full dataset: {score:.4f}")

    # Predict and calculate MAE
    y_pred = model.predict(X)
    mae = mean_absolute_error(y, y_pred)
    print(f"Mean Absolute Error on test set: ${mae:.2f}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Materials for the first 2 procedure(s) in the dataset:

Procedure: Diagnostic Laparoscopy +/- Treatment including Cystectomy, Salpingectomy / Oophrectomy
-----------------
Material: laparoscopic scissors
Actual Price: $33.5
Predicted Price: $64.4
Surgeon Specific Action: ADDED
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: suction and irrigation
Actual Price: $201.59
Predicted Price: $199.19
Surgeon Specific Action: ADDED
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: prolene 2-0 on straight needle (8571)
Actual Price: $92.74
Predicted Price: $65.71
Surgeon Specific Action: ADDED
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: bowel sizers
Actual Price: $35.84
Predicted Price: $38.54
Surgeon Specific Action: ADDED
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic handpiece
Actual Price: $37.84
Predicted Price: $64.09
Surgeon Specific Action: ADDED
Specialty: OBSTETRICS_GYNAECOLOGY
---
Material: harmonic scalpel
Actual Price: $201.59
Predicted Price: $66.28
Surgeon Specific Act

**split**

In [112]:
def train_predictive_model(cleaned_data):
    df, le_procedure, le_material, le_action, le_specialty = prepare_data(cleaned_data)
    
    # Define feature names in the correct order
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    X = df[feature_names]
    y = df['material_price']
    
    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train model on training data
    model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    
    return model, X_train, X_test, y_train, y_test, df, le_procedure, le_material, le_action, le_specialty

In [114]:
def list_all_procedure_materials(cleaned_data, n_procedures=None):
    # Get trained model and necessary components
    model, X_train, X_test, y_train, y_test, df, le_procedure, le_material, le_action, le_specialty = train_predictive_model(cleaned_data)
    
    # Get unique procedures from the test set
    test_df = pd.DataFrame(X_test, columns=X_test.columns)
    test_df['procedure_encoded'] = test_df['procedure_encoded'].astype(int)
    test_procedure_codes = test_df['procedure_encoded'].unique()
    unique_procedures = le_procedure.inverse_transform(test_procedure_codes)
    
    # Limit to n_procedures if specified
    if n_procedures is not None:
        unique_procedures = unique_procedures[:n_procedures]
    
    results = {}
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    
    # Process each procedure in the test set
    for procedure in unique_procedures:
        procedure_encoded = le_procedure.transform([procedure])[0]
        procedure_data = df[df['procedure_encoded'] == procedure_encoded]
        
        # Filter to only rows in the test set
        procedure_data_test = procedure_data[procedure_data.index.isin(X_test.index)]
        
        if len(procedure_data_test) == 0:
            continue  # Skip if no test data for this procedure
        
        procedure_materials = []
        for _, row in procedure_data_test.iterrows():
            # Create a DataFrame for prediction with feature names in correct order
            input_data = pd.DataFrame([[row['procedure_encoded'], 
                                       row['material_encoded'], 
                                       row['action_encoded'], 
                                       row['specialty_encoded']]],
                                     columns=feature_names)
            
            material_info = {
                'material_name': le_material.inverse_transform([row['material_encoded']])[0],
                'actual_price': round(float(row['material_price']), 2),
                'predicted_price': round(float(model.predict(input_data)[0]), 2),
                'surgeon_specific_action': le_action.inverse_transform([row['action_encoded']])[0],
                'specialty': le_specialty.inverse_transform([row['specialty_encoded']])[0]
            }
            procedure_materials.append(material_info)
        
        results[procedure] = procedure_materials
    
    return results, X_test, y_test, model

In [43]:
try:
    # Get materials for only the first 2 procedures in the test set
    n_procedures_to_show = 2  # Set to 1 for first procedure only, 2 for first two
    all_procedures_data, X_test, y_test, model = list_all_procedure_materials(cleaned_data, n_procedures=n_procedures_to_show)
    
    # Print results
    print(f"Materials for the first {n_procedures_to_show} procedure(s) in the test set:")
    print("==========================================")
    
    for procedure, materials in all_procedures_data.items():
        print(f"\nProcedure: {procedure}")
        print("-----------------")
        for material in materials:
            print(f"Material: {material['material_name']} | Actual Price: ${material['actual_price']} | Predicted Price: ${material['predicted_price']} | Surgeon Specific Action: {material['surgeon_specific_action']} | Specialty: {material['specialty']}")
            
    # Calculate and print model performance on test set
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    test_score = model.score(X_test, y_test)
    print(f"\nModel R² Score on test set: {test_score:.4f}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Materials for the first 2 procedure(s) in the test set:

Procedure: Breast Reduction
-----------------
Material: 10f blake drain | Actual Price: $180.17 | Predicted Price: $159.45 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: general medium pack | Actual Price: $9.97 | Predicted Price: $36.51 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: yankauer suction | Actual Price: $175.17 | Predicted Price: $147.72 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: blue surgipad | Actual Price: $156.31 | Predicted Price: $154.2 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: camera sleeve | Actual Price: $156.31 | Predicted Price: $154.77 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: 2-0 perma-hand silk 1689h | Actual Price: $112.02 | Predicted Price: $130.85 | Surgeon Specific Action: DEFAULT | Specialty: DEFAULT
Material: yankauer suction | Actual Price: $175.17 | Predicted Price: $147.72 | Surgeon Spec

**variation**

In [48]:
def list_all_procedure_materials(cleaned_data, n_procedures=None):
    # Get trained model and necessary components
    model, X_train, X_test, y_train, y_test, df, le_procedure, le_material, le_action, le_specialty = train_predictive_model(cleaned_data)
    
    # Get unique procedures from the test set
    test_df = pd.DataFrame(X_test, columns=X_test.columns)
    test_df['procedure_encoded'] = test_df['procedure_encoded'].astype(int)
    test_procedure_codes = test_df['procedure_encoded'].unique()
    unique_procedures = le_procedure.inverse_transform(test_procedure_codes)
    
    # Limit to n_procedures if specified
    if n_procedures is not None:
        unique_procedures = unique_procedures[:n_procedures]
    
    results = {}
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    
    # Process each procedure in the test set
    for procedure in unique_procedures:
        procedure_encoded = le_procedure.transform([procedure])[0]
        procedure_data = df[df['procedure_encoded'] == procedure_encoded]
        
        # Filter to only rows in the test set
        procedure_data_test = procedure_data[procedure_data.index.isin(X_test.index)]
        
        if len(procedure_data_test) == 0:
            continue  # Skip if no test data for this procedure
        
        # Group by material to count occurrences and check price variation
        material_groups = procedure_data_test.groupby('material_name')
        
        procedure_materials = []
        for material_name, group in material_groups:
            occurrences = len(group)
            prices = group['material_price'].tolist()
            price_variation = "Yes" if len(set(prices)) > 1 else "No"  # Check if prices differ
            
            for _, row in group.iterrows():
                # Create a DataFrame for prediction
                input_data = pd.DataFrame([[row['procedure_encoded'], 
                                           row['material_encoded'], 
                                           row['action_encoded'], 
                                           row['specialty_encoded']]],
                                         columns=feature_names)
                
                # Check if surgeon action is DEFAULT
                action_status = "Default" if row['surgeon_specific_action'] == 'DEFAULT' else "Added"
                
                material_info = {
                    'material_name': material_name,
                    'actual_price': round(float(row['material_price']), 2),
                    'predicted_price': round(float(model.predict(input_data)[0]), 2),
                    'surgeon_specific_action': le_action.inverse_transform([row['action_encoded']])[0],
                    'specialty': le_specialty.inverse_transform([row['specialty_encoded']])[0],
                    'occurrences': occurrences,
                    'price_variation': price_variation,
                    'action_status': action_status
                }
                procedure_materials.append(material_info)
        
        results[procedure] = procedure_materials
    
    return results, X_test, y_test, model

try:
    # Get materials for only the first 2 procedures in the test set
    n_procedures_to_show = 2  # Set to 1 for first procedure only, 2 for first two
    all_procedures_data, X_test, y_test, model = list_all_procedure_materials(cleaned_data, n_procedures=n_procedures_to_show)
    
    # Print results
    print(f"Materials for the first {n_procedures_to_show} procedure(s) in the test set:")
    print("==================================================================================")
    
    for procedure, materials in all_procedures_data.items():
        print(f"\nProcedure: {procedure}")
        print("----------------------------------------------------------------------------------")
        for material in materials:
            print(f"Material: {material['material_name']} | Occurrences: {material['occurrences']} | Actual Price: ${material['actual_price']} | Price Variation: {material['price_variation']} | Surgeon Specific Action: {material['surgeon_specific_action']}")
    
    # Calculate and print model performance on test set
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    score = model.score(X_test, y_test)
    print(f"\nModel R² Score on test set: {score:.4f}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Materials for the first 2 procedure(s) in the test set:

Procedure: Breast Reduction
----------------------------------------------------------------------------------
Material: 10f blake drain | Occurrences: 1 | Actual Price: $180.17 | Price Variation: No | Surgeon Specific Action: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Price: $180.17 | Price Variation: Yes | Surgeon Specific Action: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Price: $174.17 | Price Variation: Yes | Surgeon Specific Action: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Price: $175.17 | Price Variation: Yes | Surgeon Specific Action: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Price: $175.17 | Price Variation: Yes | Surgeon Specific Action: DEFAULT
Material: 2-0 monocryl plus mcp3440g | Occurrences: 2 | Actual Price: $102.02 | Price Variation: Yes | Surgeon Specific Action: DEFAULT
Material: 2-0 monocryl plus mcp3440g | Occurrences: 2 | Actual Price: $1

**total cost with default and added prices**

In [116]:
from sklearn.metrics import mean_absolute_error

def list_all_procedure_materials(cleaned_data, n_procedures=None):
    # Get trained model and necessary components
    model, X_train, X_test, y_train, y_test, df, le_procedure, le_material, le_action, le_specialty = train_predictive_model(cleaned_data)
    
    # Get unique procedures from the test set
    test_df = pd.DataFrame(X_test, columns=X_test.columns)
    test_df['procedure_encoded'] = test_df['procedure_encoded'].astype(int)
    test_procedure_codes = test_df['procedure_encoded'].unique()
    unique_procedures = le_procedure.inverse_transform(test_procedure_codes)
    
    # Limit to n_procedures if specified
    if n_procedures is not None:
        unique_procedures = unique_procedures[:n_procedures]
    
    results = {}
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    
    # Process each procedure in the test set
    for procedure in unique_procedures:
        procedure_encoded = le_procedure.transform([procedure])[0]
        procedure_data = df[df['procedure_encoded'] == procedure_encoded]
        
        # Filter to only rows in the test set
        procedure_data_test = procedure_data[procedure_data.index.isin(X_test.index)]
        
        if len(procedure_data_test) == 0:
            continue  # Skip if no test data for this procedure
        
        # Group by material to analyze unique materials
        material_groups = procedure_data_test.groupby('material_name')
        
        procedure_materials = []
        default_total_cost = 0.0  # Total cost with only default materials
        total_cost = 0.0  # Total cost including all materials
        
        for material_name, group in material_groups:
            occurrences = len(group)
            prices = group['material_price'].tolist()
            price_variation = "Yes" if len(set(prices)) > 1 else "No"
            price_difference = round(max(prices) - min(prices), 2) if price_variation == "Yes" else 0.0
            
            # Calculate costs
            for _, row in group.iterrows():
                actual_price = round(float(row['material_price']), 2)
                total_cost += actual_price
                if row['surgeon_specific_action'] == 'DEFAULT':
                    default_total_cost += actual_price
                
                # Prediction for the first occurrence of each material (for consistency)
                if len(procedure_materials) < len(material_groups) or material_name not in [m['material_name'] for m in procedure_materials]:
                    input_data = pd.DataFrame([[row['procedure_encoded'], 
                                               row['material_encoded'], 
                                               row['action_encoded'], 
                                               row['specialty_encoded']]],
                                             columns=feature_names)
                    material_info = {
                        'material_name': material_name,
                        'occurrences': occurrences,
                        'actual_prices': [round(p, 2) for p in prices],  # List of all prices
                        'predicted_price': round(float(model.predict(input_data)[0]), 2),
                        'price_variation': price_variation,
                        'price_difference': price_difference,
                        'specialty': le_specialty.inverse_transform([row['specialty_encoded']])[0]
                    }
                    procedure_materials.append(material_info)
        
        results[procedure] = {
            'materials': procedure_materials,
            'default_total_cost': round(default_total_cost, 2),
            'total_cost': round(total_cost, 2)
        }
    
    return results, X_test, y_test, model

try:
    # Get materials for only the first 2 procedures in the test set
    n_procedures_to_show = 2  # Set to 1 for first procedure only, 2 for first two
    all_procedures_data, X_test, y_test, model = list_all_procedure_materials(cleaned_data, n_procedures=n_procedures_to_show)
    
    # Print results
    print(f"Materials for the first {n_procedures_to_show} procedure(s) in the test set:")
    print("==================================================================================")
    
    for procedure, data in all_procedures_data.items():
        print(f"\nProcedure: {procedure}")
        print("----------------------------------------------------------------------------------")
        for material in data['materials']:
            print(f"Material: {material['material_name']} | Occurrences: {material['occurrences']} | Actual Prices: {material['actual_prices']} | Price Variation: {material['price_variation']} | Price Difference: ${material['price_difference']} | Specialty: {material['specialty']}")
        print(f"Default Procedure Total Cost (only default materials): ${data['default_total_cost']}")
        print(f"Total Procedure Cost (including added materials): ${data['total_cost']}")
        print("----------------------------------------------------------------------------------")
    
    # Calculate and print model performance on test set
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    score = model.score(X_test, y_test)
    print(f"\nModel R² Score on test set: {score:.4f}")
    
    # Predict and calculate MAE
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error on test set: ${mae:.2f}")

except Exception as e:
    print(f"An error occurred: {str(e)}")



Materials for the first 2 procedure(s) in the test set:

Procedure: Breast Reduction
----------------------------------------------------------------------------------
Material: 10f blake drain | Occurrences: 1 | Actual Prices: [180.17] | Price Variation: No | Price Difference: $0.0 | Specialty: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Prices: [180.17, 174.17, 175.17, 175.17] | Price Variation: Yes | Price Difference: $6.0 | Specialty: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Prices: [180.17, 174.17, 175.17, 175.17] | Price Variation: Yes | Price Difference: $6.0 | Specialty: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Prices: [180.17, 174.17, 175.17, 175.17] | Price Variation: Yes | Price Difference: $6.0 | Specialty: DEFAULT
Material: 14f blake drain | Occurrences: 4 | Actual Prices: [180.17, 174.17, 175.17, 175.17] | Price Variation: Yes | Price Difference: $6.0 | Specialty: DEFAULT
Material: 2-0 monocryl plus mcp3440g | Occurren

In [80]:
def list_all_procedure_materials(cleaned_data, n_procedures=None):
    # Get trained model and necessary components
    model, X_train, X_test, y_train, y_test, df, le_procedure, le_material, le_action, le_specialty = train_predictive_model(cleaned_data)
    
    # Get unique procedures from the test set
    test_df = pd.DataFrame(X_test, columns=X_test.columns)
    test_df['procedure_encoded'] = test_df['procedure_encoded'].astype(int)
    test_procedure_codes = test_df['procedure_encoded'].unique()
    unique_procedures = le_procedure.inverse_transform(test_procedure_codes)
    
    # Limit to n_procedures if specified
    if n_procedures is not None:
        unique_procedures = unique_procedures[:n_procedures]
    
    results = {}
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    
    # Process each procedure in the test set
    for procedure in unique_procedures:
        procedure_encoded = le_procedure.transform([procedure])[0]
        procedure_data = df[df['procedure_encoded'] == procedure_encoded]
        
        # Filter to only rows in the test set
        procedure_data_test = procedure_data[procedure_data.index.isin(X_test.index)]
        
        if len(procedure_data_test) == 0:
            continue  # Skip if no test data for this procedure
        
        # Group by material to analyze unique materials
        material_groups = procedure_data_test.groupby('material_name')
        
        procedure_materials = []
        default_total_cost = 0.0
        total_cost = 0.0
        
        for material_name, group in material_groups:
            occurrences = len(group)
            prices = group['material_price'].tolist()
            actual_prices = [round(p, 2) for p in prices]
            price_variation = "Yes" if len(set(prices)) > 1 else "No"
            price_difference = round(max(prices) - min(prices), 2) if price_variation == "Yes" else 0.0
            
            # Find the lowest price and its action status
            min_price = min(prices)
            min_price_row = group[group['material_price'] == min_price].iloc[0]
            lowest_price_status = "Default" if min_price_row['surgeon_specific_action'] == 'DEFAULT' else "Added"
            
            # Prediction for the material (using first row's features)
            first_row = group.iloc[0]
            input_data = pd.DataFrame([[first_row['procedure_encoded'], 
                                       first_row['material_encoded'], 
                                       first_row['action_encoded'], 
                                       first_row['specialty_encoded']]],
                                     columns=feature_names)
            
            material_info = {
                'material_name': material_name,
                'occurrences': occurrences,
                'actual_prices': actual_prices,
                'price_variation': price_variation,
                'price_difference': price_difference,
                'lowest_price_status': lowest_price_status,
                'predicted_price': round(float(model.predict(input_data)[0]), 2),
                'specialty': le_specialty.inverse_transform([first_row['specialty_encoded']])[0]
            }
            procedure_materials.append(material_info)
            
            # Update total costs
            total_cost += sum(prices)
            default_group = procedure_data_test[procedure_data_test['surgeon_specific_action'] == 'DEFAULT']
            unique_default_materials = default_group.groupby('material_name').first()
            default_total_cost += unique_default_materials['material_price'].sum()

        
        results[procedure] = {
            'materials': procedure_materials,
            'default_total_cost': round(default_total_cost, 2),
            'total_cost': round(total_cost, 2)
        }
    
    return results, X_test, y_test, model

try:
    n_procedures_to_show = 1
    all_procedures_data, X_test, y_test, model = list_all_procedure_materials(cleaned_data, n_procedures=n_procedures_to_show)
    
    # Print results
    print(f"Materials for the first {n_procedures_to_show} procedure(s) in the test set:")
    print("==================================================================================")
    
    for procedure, data in all_procedures_data.items():
        print(f"\nProcedure: {procedure}")
        print("----------------------------------------------------------------------------------")
        for material in data['materials']:
            print(f"Material: {material['material_name']} | Occurrences: {material['occurrences']} | Actual Prices: {material['actual_prices']} | Price Variation: {material['price_variation']} | Price Difference: ${material['price_difference']} | Lowest Price Status: {material['lowest_price_status']}")
        print(f"Default Procedure Total Cost (only default materials): ${data['default_total_cost']}")
        print(f"Total Procedure Cost (including added materials): ${data['total_cost']}")
        print("----------------------------------------------------------------------------------")
    
    # Calculate and print model performance on test set
    feature_names = ['procedure_encoded', 'material_encoded', 'action_encoded', 'specialty_encoded']
    score = model.score(X_test, y_test)
    print(f"\nModel R² Score on test set: {score:.4f}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

  return fit_method(estimator, *args, **kwargs)


An error occurred: not enough values to unpack (expected 10, got 3)
