# In [None]:
# corrected_pipeline.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Read the raw data (adjust the path if the CSV lives elsewhere)
df = pd.read_csv("insurance_corrupted.csv")   # same file the notebook used

# 2) First look at the frame: dimensions, per-column dtypes, leading rows,
#    and the number of nulls in each column
print("shape:", df.shape)
print(df.dtypes)
print(df.head())
null_counts = df.isna().sum()
print("missing-values:\n", null_counts)

# 3) Identify features and target
# Prefer an explicit 'charges' column; otherwise fall back to the last column
# of the frame.
target_col = 'charges' if 'charges' in df.columns else df.columns[-1]
print("Using target column:", target_col)

# Coerce the target to float. Cells that cannot be parsed as numbers become
# NaN instead of raising, so a single bad entry does not abort the run.
y = pd.to_numeric(df[target_col], errors='coerce').astype(float)

# Rows without a usable target cannot be used to fit or score a regression,
# so they are removed from the features and the target together. The count is
# printed so the loss of rows is never silent.
has_target = y.notna()
n_dropped = int((~has_target).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) with missing or non-numeric target")

X = df.loc[has_target].drop(columns=[target_col])
y = y.loc[has_target]

# 4) Look at the target: dtype, summary statistics, and a histogram with a
#    KDE overlay
print("target dtype:", y.dtype)
target_stats = y.describe()
print("target summary:\n", target_stats)
hist_ax = sns.histplot(y, kde=True)
hist_ax.set_title("Target distribution")
plt.show()

# 5) Group the feature columns by dtype: object/category columns are treated
#    as categorical, numpy-numeric columns as numeric. Columns of any other
#    dtype end up in neither list.
categorical_part = X.select_dtypes(include=['object', 'category'])
numeric_part = X.select_dtypes(include=[np.number])
cat_cols = categorical_part.columns.tolist()
num_cols = numeric_part.columns.tolist()
print("categorical columns:", cat_cols)
print("numeric columns:", num_cols)

# 6) Build ColumnTransformer:
#    - OneHotEncode categorical columns (drop='first' removes one dummy per
#      feature to avoid perfectly collinear columns)
#    - StandardScale numeric columns
#    Dense output is requested with sparse_threshold=0 on the
#    ColumnTransformer instead of the encoder's `sparse` keyword, which newer
#    scikit-learn releases renamed to `sparse_output` and no longer accept.
preprocessor = ColumnTransformer(
    transformers=[
        ("oh", OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ("scaler", StandardScaler(), num_cols)
    ],
    # Any column that is in neither cat_cols nor num_cols is discarded.
    remainder='drop',
    sparse_threshold=0
)

# 7) Train-test split on the RAW features.
#    Splitting before fitting the preprocessor keeps the held-out rows from
#    influencing the scaler statistics and the encoder's category list.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8) Fit the preprocessor on the training rows only, then apply the fitted
#    transform to the test rows. handle_unknown='ignore' covers categories
#    that appear only in the test split.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix, kept under its original name for inspection; it is
# produced by the train-fitted preprocessor, so nothing is re-fitted here.
X_transformed = preprocessor.transform(X)
print("Transformed feature shape:", X_transformed.shape)

# 9) Train a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# 10) Score the test split: MSE, its square root, and R^2
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Root MSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")

# 11) Diagnostics

# Residuals (true minus predicted) plotted against the predictions, with a
# dashed zero line for reference
residuals = y_test - y_pred
resid_fig, resid_ax = plt.subplots(figsize=(8, 5))
resid_ax.scatter(y_pred, residuals, alpha=0.6)
resid_ax.axhline(0, color='red', linestyle='--')
resid_ax.set_xlabel("Predicted")
resid_ax.set_ylabel("Residuals")
resid_ax.set_title("Residual plot")
plt.show()

# True vs Predicted, with a dashed diagonal spanning the combined value range
fit_fig, fit_ax = plt.subplots(figsize=(7, 7))
fit_ax.scatter(y_test, y_pred, alpha=0.6)
lowest = min(y_test.min(), y_pred.min())
highest = max(y_test.max(), y_pred.max())
lims = [lowest, highest]
fit_ax.plot(lims, lims, linestyle='--', color='red')
fit_ax.set_xlabel("True")
fit_ax.set_ylabel("Predicted")
fit_ax.set_title("True vs Predicted")
plt.show()

# 12) Feature names (optional) — build readable column names for the transformed features
oh_columns = []
if cat_cols:
    # Only look up the encoder when there were categorical columns to encode.
    oh = preprocessor.named_transformers_['oh']
    try:
        oh_cols = oh.get_feature_names_out(cat_cols)
    except AttributeError:
        # Older scikit-learn releases only provide get_feature_names. Catching
        # AttributeError alone lets any other failure surface instead of
        # being silently retried.
        oh_cols = oh.get_feature_names(cat_cols)
    oh_columns = list(oh_cols)

# One-hot names come first and numeric names second, matching the order in
# which the 'oh' and 'scaler' transformers are listed in the preprocessor.
feature_names = oh_columns + list(num_cols)
print("Number of features after transform:", len(feature_names))
print(feature_names[:50])
