In [None]:
# final_streamlit_app.py

import os

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# ---------------- Load dataset ----------------
# Path of the workbook the app trains on. The original absolute path stays as
# the default so existing setups keep working unchanged; setting the
# INSURANCE_DATA_PATH environment variable points the app at a copy stored
# elsewhere without editing this file.
file_path = os.environ.get(
    "INSURANCE_DATA_PATH",
    r"C:\Users\MTMC\OneDrive - Manipal Academy of Higher Education\Desktop\Medical Insurance cost prediction.xlsm",
)
# The reader engine is requested explicitly (openpyxl) for the .xlsm workbook.
df = pd.read_excel(file_path, engine="openpyxl")

# ---------------- Preprocess categorical variables ----------------
# Two-valued text columns and the integer code assigned to each label.
# Labels that are not listed here come out of .map() as NaN.
binary_codes = {
    'sex': {'female': 1, 'male': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for column_name, label_to_code in binary_codes.items():
    df[column_name] = df[column_name].map(label_to_code)

# One indicator column per region value; drop_first=True leaves out the first
# category's indicator, so that region is represented by all indicators
# being 0.
region_dummies = pd.get_dummies(df['region'], drop_first=True)

# Swap the raw 'region' column for its indicator columns, which end up
# appended after the remaining columns.
df = pd.concat([df.drop(columns='region'), region_dummies], axis='columns')

# Target is the 'charges' column; every other column is a model feature.
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize every feature column (the 0/1 encoded ones included). The
# scaler is fitted on the training rows only and then applied to both splits,
# so no statistics from the test rows are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------- Train models and create comparison table ----------------
def check_overfitting(train_r2: float, test_r2: float, threshold: float = 0.05) -> str:
    """Flag a model whose training R² exceeds its test R² by more than a margin.

    Args:
        train_r2: R² score measured on the training split.
        test_r2: R² score measured on the test split.
        threshold: Largest train-minus-test gap that is still tolerated.
            Defaults to 0.05, the value that was previously hard-coded.

    Returns:
        "Yes" when ``train_r2 - test_r2`` is strictly greater than
        ``threshold``, otherwise "No". A model that scores better on the
        test split than on the training split is therefore reported as "No".
    """
    gap = train_r2 - test_r2
    return "Yes" if gap > threshold else "No"

# Candidate regressors, keyed by the label shown in the UI. The seeded
# estimators use random_state=42 so repeated runs give the same fit.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# (features, target) pairs to score each model on: training split first,
# test split second.
evaluation_splits = (
    (X_train_scaled, y_train),
    (X_test_scaled, y_test),
)

results = []
trained_models = {}

for model_name, estimator in models.items():
    # Fit on the scaled training data and keep the fitted estimator so the
    # prediction form can look it up by name.
    estimator.fit(X_train_scaled, y_train)
    trained_models[model_name] = estimator

    # Collect RMSE and R² for the training split, then for the test split.
    rmse_scores = []
    r2_scores = []
    for split_features, split_target in evaluation_splits:
        split_pred = estimator.predict(split_features)
        rmse_scores.append(np.sqrt(mean_squared_error(split_target, split_pred)))
        r2_scores.append(r2_score(split_target, split_pred))

    train_rmse, test_rmse = rmse_scores
    train_r2, test_r2 = r2_scores

    # The overfitting flag is computed from the unrounded R² values; only the
    # numbers shown in the table are rounded to two decimals.
    results.append([
        model_name,
        round(train_rmse, 2),
        round(test_rmse, 2),
        round(train_r2, 2),
        round(test_r2, 2),
        check_overfitting(train_r2, test_r2),
    ])

comparison_columns = [
    "Model",
    "Train RMSE",
    "Test RMSE",
    "Train R²",
    "Test R²",
    "Overfitting",
]
comparison_df = pd.DataFrame(results, columns=comparison_columns)

# ---------------- Streamlit UI ----------------
st.title("Medical Insurance Cost Prediction")

# Model Comparison Table: one row per trained model with its train/test
# RMSE, train/test R² and the overfitting flag.
st.header("Model Comparison Table")
st.dataframe(comparison_df)

# ---------------- Prediction UI ----------------
st.header("Predict Insurance Cost")

# Input widgets for a single applicant. The values are read by the
# "Predict" handler further down.
age = st.number_input("Age", min_value=0, max_value=120, value=30)
sex = st.selectbox("Sex", ["male", "female"])
bmi = st.number_input("BMI", min_value=10.0, max_value=50.0, value=25.0)
children = st.number_input("Number of Children", min_value=0, max_value=10, value=0)
smoker = st.selectbox("Smoker", ["yes", "no"])
region = st.selectbox("Region", ["northeast", "southeast", "southwest", "northwest"])

# Offer exactly the models that were trained, in their insertion order,
# instead of a second hand-written list of names. The chosen name is later
# used as a key into trained_models, so deriving the options from that dict
# guarantees the lookup cannot fail if a model is added, renamed or removed.
selected_model_name = st.selectbox("Select Model", list(trained_models))

if st.button("Predict"):
    # Encode the form values under the column names of the training frame,
    # using the same 0/1 codes as the preprocessing step (female -> 1,
    # smoker "yes" -> 1, one indicator per region with "northeast" encoded
    # as all indicators being 0).
    # NOTE(review): assumes the sheet's numeric columns are named 'age',
    # 'bmi' and 'children' and that the region indicators are 'northwest',
    # 'southeast' and 'southwest' - confirm against the workbook.
    form_values = {
        "age": age,
        "sex": 1 if sex == "female" else 0,
        "bmi": bmi,
        "children": children,
        "smoker": 1 if smoker == "yes" else 0,
        "northwest": 1 if region == "northwest" else 0,
        "southeast": 1 if region == "southeast" else 0,
        "southwest": 1 if region == "southwest" else 0,
    }

    # The scaler and the models were fitted on X, whose column order comes
    # from the loaded sheet followed by the region indicators. A hand-ordered
    # positional array is only correct if it happens to match that order, so
    # the row is aligned by column name instead. Selecting with X.columns
    # raises KeyError if a training column has no matching form value, rather
    # than silently feeding a value into the wrong feature.
    input_df = pd.DataFrame([form_values])[list(X.columns)]

    # Apply the scaling learned from the training split.
    input_scaled = scaler.transform(input_df)

    # Predict using the selected model; predict() returns one value per row.
    prediction = trained_models[selected_model_name].predict(input_scaled)[0]

    st.success(f"Predicted Insurance Cost using {selected_model_name}: ${prediction:.2f}")
