In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("merged_prices.csv")

# Function to extract numeric price from strings
def extract_price(price_str):
    if pd.isnull(price_str):
        return np.nan  # Keep NaN values unchanged
    price_numbers = re.findall(r"\d+\.\d+", str(price_str))  # Extract numbers with decimal points
    return float(price_numbers[-1]) if price_numbers else np.nan  # Take the last price if multiple

# Clean the price column
df["price"] = df["price"].apply(extract_price)

# Identify missing values in price column
missing_price_mask = df["price"].isnull()

# Select features that could help predict price
features = ["material_type", "material_subtype", "manufacturer", "procedure_name", "specialty"]

# Encode categorical features
label_encoders = {}
for col in features:
    le = LabelEncoder()
    df[col] = df[col].astype(str)  # Convert to string before encoding
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Create training data (only rows with known price)
train_data = df[~missing_price_mask]
X_train = train_data[features]
y_train = train_data["price"]

# Create test data (only rows with missing price)
test_data = df[missing_price_mask]
X_test = test_data[features]

# Train a regression model to predict price
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict missing prices
predicted_prices = model.predict(X_test)

# Fill missing prices in the dataset
df.loc[missing_price_mask, "price"] = predicted_prices

# Save the updated dataset
df.to_csv("dataset_imputed.csv", index=False)

print("Regression-based imputation completed. Missing prices filled.")


  df = pd.read_csv("merged_prices.csv")


Regression-based imputation completed. Missing prices filled.


In [6]:
import pandas as pd
import numpy as np
import re

# Load dataset
df = pd.read_csv("merged_prices.csv")

# Function to extract numeric price from strings
def extract_price(price_str):
    if pd.isnull(price_str):
        return np.nan  # Keep NaN values unchanged
    price_numbers = re.findall(r"\d+\.\d+", str(price_str))  # Extract numbers with decimal points
    return float(price_numbers[-1]) if price_numbers else np.nan  # Take the last price if multiple

# Clean the price column
df["price"] = df["price"].apply(extract_price)

# Compute the median price (ignoring NaN values)
median_price = df["price"].median()

# Fill missing prices with the median price
df["price"].fillna(median_price, inplace=True)

# Save the updated dataset
df.to_csv("dataset_median_imputed.csv", index=False)

print(f"Median imputation completed. Missing prices filled with median value: {median_price:.2f}")


  df = pd.read_csv("merged_prices.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["price"].fillna(median_price, inplace=True)


Median imputation completed. Missing prices filled with median value: 26.49


In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("merged_prices.csv")

# Function to extract numeric price from strings
def extract_price(price_str):
    if pd.isnull(price_str):
        return np.nan  # Keep NaN values unchanged
    price_numbers = re.findall(r"\d+\.\d+", str(price_str))  # Extract numbers with decimal points
    return float(price_numbers[-1]) if price_numbers else np.nan  # Take the last price if multiple

# Clean the price column
df["price"] = df["price"].apply(extract_price)

# Drop rows where price is still NaN (if no valid numbers were extracted)
df.dropna(subset=["price"], inplace=True)

# Create a test set by randomly removing 20% of known prices
np.random.seed(42)
missing_mask = np.random.rand(len(df)) < 0.2
df_missing = df.copy()
df_missing.loc[missing_mask, "price"] = np.nan  # Set these values as missing

# Store actual prices for evaluation
actual_prices = df.loc[missing_mask, "price"]

# ---------- METHOD 1: MEDIAN IMPUTATION ----------
median_price = df_missing["price"].median()
df_missing["price_median"] = df_missing["price"].fillna(median_price)

# ---------- METHOD 2: REGRESSION-BASED IMPUTATION ----------
# Select features that could help predict price
features = ["material_type", "material_subtype","procedure_name", "specialty"]

# Encode categorical features
label_encoders = {}
for col in features:
    le = LabelEncoder()
    df_missing[col] = df_missing[col].astype(str)  # Convert to string before encoding
    df_missing[col] = le.fit_transform(df_missing[col])
    label_encoders[col] = le

# Split data into train (non-missing) and test (missing)
train_data = df_missing[~df_missing["price"].isnull()]
X_train = train_data[features]
y_train = train_data["price"]

test_data = df_missing[df_missing["price"].isnull()]
X_test = test_data[features]

# Train regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict missing prices
df_missing.loc[df_missing["price"].isnull(), "price_regression"] = model.predict(X_test)

# ---------- EVALUATION ----------
# Extract imputed values for test set
imputed_median = df_missing.loc[missing_mask, "price_median"]
imputed_regression = df_missing.loc[missing_mask, "price_regression"]

# Calculate evaluation metrics
mae_median = mean_absolute_error(actual_prices, imputed_median)
rmse_median = np.sqrt(mean_squared_error(actual_prices, imputed_median))

mae_regression = mean_absolute_error(actual_prices, imputed_regression)
rmse_regression = np.sqrt(mean_squared_error(actual_prices, imputed_regression))

# Print results
print("===== IMPUTATION METHOD COMPARISON =====")
print(f"Median Imputation - MAE: {mae_median:.2f}, RMSE: {rmse_median:.2f}")
print(f"Regression Imputation - MAE: {mae_regression:.2f}, RMSE: {rmse_regression:.2f}")

# Determine best method
if mae_regression < mae_median:
    print("✅ Regression imputation performed better.")
else:
    print("✅ Median imputation performed better.")


  df = pd.read_csv("merged_prices.csv")


===== IMPUTATION METHOD COMPARISON =====
Median Imputation - MAE: 13.02, RMSE: 35.72
Regression Imputation - MAE: 2.81, RMSE: 22.01
✅ Regression imputation performed better.
