1 Exploratory Data Analysis
1.1 Understanding the Data

In [54]:


# Score the fitted pipeline on the held-out split
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse}")

# One hand-written example row, expressed as a single record whose keys are
# the feature column names.
# NOTE(review): age 10 may lie outside the ages seen during training, in
# which case the linear model is extrapolating — confirm before trusting
# the printed figure.
new_data = pd.DataFrame([{
    'age': 10,
    'sex': 'male',
    'bmi': 30.0,
    'children': 0,
    'smoker': 'no',
    'region': 'northwest',
}])

# predict() yields one value per input row; only the first is reported
predicted_charge = pipeline.predict(new_data)
print(f"Predicted charges for new entry: {predicted_charge[0]}")


Mean Squared Error on Test Set: 39933194.54805147
Predicted charges for new entry: 409.9077702359464


In [49]:
# List the column labels of the `data` DataFrame
print(data.columns)


Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# Read the insurance dataset from the working directory
data = pd.read_csv('medical_insurance.csv')

# Peek at the leading rows to see the column layout
print(data.head())

# Feature groups: numeric columns are standardised, categorical ones are
# one-hot encoded (see the ColumnTransformer below)
num_cols = ['age', 'bmi', 'children']
cat_cols = ['sex', 'smoker', 'region']

# 'charges' is the regression target; every other column is a feature
y = data['charges']
X = data.drop(columns='charges')

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column-wise preprocessing: scale the numeric group, one-hot encode the
# categorical group with the first level of each feature dropped
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols),
])

# Chain preprocessing and a linear regressor into one estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split only
pipeline.fit(X_train, y_train)

# Report mean squared error on the held-out split
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse}")

# Persist the whole pipeline. The preprocessor object is additionally
# written to its own file, although it is already a step of the pipeline
# stored in model.pkl.
joblib.dump(pipeline, 'model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')




   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
Mean Squared Error on Test Set: 39933194.54805147


['preprocessor.pkl']

In [52]:
import joblib

# Restore both saved artifacts from disk, in the same order as before:
# the model first, then the standalone preprocessor.
# NOTE(review): model.pkl presumably holds the full pipeline (preprocessing
# step included); if so, raw feature frames should go straight to
# model.predict without a separate preprocessor.transform — confirm.
model, preprocessor = map(joblib.load, ('model.pkl', 'preprocessor.pkl'))
