# Importing necessary libraries and modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import ElasticNetCV


KeyboardInterrupt: 

In [None]:
# Load the insurance dataset from "insurance.csv" (path is relative to the
# working directory) into the DataFrame used by every later cell
data = pd.read_csv("insurance.csv")

In [None]:
# Preview the first five rows to check the columns and value formats
data.head()

In [None]:
# Count the missing values in each column; the resulting Series is shown as
# the cell output (nothing is printed explicitly)
(data.isnull().sum())

# Finding unique values

In [None]:
# Frequency of each distinct value in the 'sex' column, most common first
# (value_counts ignores missing values by default)
(data['sex'].value_counts())

In [None]:
# Frequency of each distinct value in the 'smoker' column, most common first
# (value_counts ignores missing values by default)
(data['smoker'].value_counts())

In [None]:
# Frequency of each distinct value in the 'region' column, most common first
# (value_counts ignores missing values by default)
data['region'].value_counts()

# Data Visualization

In [None]:
# Bar chart of the number of records for each 'sex' value
data['sex'].value_counts().plot.bar(figsize=(8, 5))

# Axis labels and title, applied to the axes that was just drawn
plt.gca().set(xlabel='Sex', ylabel='Count', title='Distribution of Sex')

# Render the figure
plt.show()

In [None]:
# Bar chart of the number of records for each 'smoker' value
data['smoker'].value_counts().plot.bar(figsize=(8, 5))

# Axis labels and title, applied to the axes that was just drawn
plt.gca().set(xlabel='Smoker', ylabel='Count', title='Distribution of Smokers')

# Render the figure
plt.show()

In [None]:
# Bar chart of the number of records for each 'region' value
data['region'].value_counts().plot.bar(figsize=(8, 5))

# Axis labels and title, applied to the axes that was just drawn
plt.gca().set(xlabel='Region', ylabel='Count', title='Distribution of Regions')

# Render the figure
plt.show()

In [None]:
# Count the occurrences of each unique value in the 'children' column and plot a bar chart
# (same size, labelling and explicit show() as the other distribution plots)
data['children'].value_counts().plot(kind='bar', figsize=(8, 5))

# Set labels and title
plt.xlabel('Children')
plt.ylabel('Count')
plt.title('Distribution of Children')

# Show the plot
plt.show()

# Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Small illustrative DataFrames listing the labels of each categorical column
# before encoding. They are built by hand, not derived from `data`
# (presumably they mirror the labels in the dataset - verify against the CSV).
sexx = pd.DataFrame({'sex': ['male', 'female']})
smokerr = pd.DataFrame({'smoker': ['no', 'yes']})
# 'northwest' must not carry a leading space, otherwise it is a different label
regionn = pd.DataFrame({'region': ['southeast', 'southwest', 'northeast', 'northwest']})

print(f"Before Encoding the Data:\n\n{sexx}\n")
print(f"Before Encoding the Data:\n\n{smokerr}\n")
print(f"Before Encoding the Data:\n\n{regionn}\n")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column as integer codes. LabelEncoder numbers the
# labels in sorted order. A separate fitted encoder is kept per column in
# `label_encoders` so the label <-> code mapping can be reused later (e.g. to
# encode new inputs for the saved model or to decode values with
# inverse_transform); refitting a single encoder would discard the earlier
# mappings.
label_encoders = {}
for column in ('sex', 'smoker', 'region'):
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
# After the loop `encoder` is the encoder fitted on 'region', as before.

In [None]:
# Show the last five rows; 'sex', 'smoker' and 'region' should now hold the
# integer codes assigned in the encoding cell
data.tail()

In [None]:
# Define a function to categorize BMI
def categorize_bmi(bmi):
    """Return the weight-category label for a BMI value.

    Bands (lower bound inclusive, upper bound exclusive):
    below 18.5 -> 'Underweight', 18.5 to 25 -> 'Normal',
    25 to 30 -> 'Overweight', 30 and above -> 'Obesity'.

    A value that is not smaller than any bound (this includes NaN, which
    compares false against everything) falls through to 'Obesity'.
    """
    # (exclusive upper bound, label) pairs in ascending order
    bands = (
        (18.5, 'Underweight'),
        (25, 'Normal'),
        (30, 'Overweight'),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    return 'Obesity'

# Derive a 'BMI_category' column by labelling every 'bmi' value with categorize_bmi
data['BMI_category'] = data['bmi'].map(categorize_bmi)

In [None]:
# Number of records in each BMI category, most common first
data['BMI_category'].value_counts()

In [None]:
# Replace the BMI category labels with integer codes.
# NOTE: Series.map returns NaN for values missing from the mapping, so running
# this cell a second time (when the column already holds codes) wipes the column.
data['BMI_category'] = data['BMI_category'].map(
    dict(Obesity=1, Overweight=2, Normal=3, Underweight=4)
)

In [None]:
# Show the first 15 rows to inspect the encoded columns and the new
# 'BMI_category' codes
data.head(15)

# Outlier Handling (capping BMI at the 5th and 95th percentiles)

In [None]:
# 5th and 95th percentiles of 'bmi', used as the lower and upper capping limits
lower_limit, upper_limit = (data['bmi'].quantile(q) for q in (0.05, 0.95))


In [None]:
# Cap 'bmi' to [lower_limit, upper_limit]: values beyond either limit are
# replaced by that limit, everything in between is left unchanged
data['bmi'] = data['bmi'].clip(lower=lower_limit, upper=upper_limit)

# Splitting the Dataset and Feature Selection

In [None]:
# Separate the target from the predictors
y = data['charges']                # target variable
x = data.drop('charges', axis=1)   # every remaining column is a feature

In [None]:
# Score every feature against the target with a univariate F-test.
# k='all' keeps all columns, so this step ranks features rather than filtering
# them and keeps working if the number of columns changes (a hard-coded k
# silently drops features or fails when it does not match the column count).
selector = SelectKBest(f_regression, k='all').fit(x, y)

# Get the boolean mask indicating selected features
features_mask = selector.get_support()

# Get the names of the selected features
features = x.columns[features_mask]

# Show the features ranked from highest to lowest F-score
pd.DataFrame({'Feature_name': features, 'Importance': selector.scores_[features_mask]}).sort_values(by='Importance', ascending=False)

In [None]:

# Remove 'region' from the feature set (chosen as the least important feature)
x = x.drop('region', axis=1)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)


# Model Building 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random Forest Regressor: 100 bootstrapped trees, each limited to depth 4.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, its scores and the later cross-validation are reproducible.
model = RandomForestRegressor(n_estimators=100, bootstrap=True, max_depth=4, random_state=0)

# Train the model on the training data
model.fit(x_train, y_train)

In [None]:
# Predict charges for the held-out test features with the fitted random forest
predicted = model.predict(x_test)

In [None]:
# Mean absolute percentage error of the random-forest predictions on the test set.
# scikit-learn metrics take (y_true, y_pred); MAPE divides by |y_true|, so the
# ground truth must be passed first or the result is a different number.
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_test, predicted)


In [None]:
# R^2 (coefficient of determination) of the random-forest predictions on the test set.
# r2_score expects (y_true, y_pred) and is not symmetric, so the ground truth goes first.
# NOTE: the printed label says "Accuracy", but the value is R^2, not a classification accuracy.
from sklearn.metrics import r2_score
print("Accuracy : " , r2_score(y_test, predicted))

In [None]:

# 10-fold cross-validation of the random-forest model on the training data.
# With no `scoring` given, each fold is scored with the estimator's own
# score method (R^2 for regressors).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=10)
print(scores)


In [None]:
# Hyperparameter grid for the random-forest search: maximum tree depth and the
# fraction of training rows drawn for each tree
parameters = dict(
    max_depth=[2, 4, 6, 8, 10],
    max_samples=[0.1, 0.3, 0.6, 1.0],
)
     

In [None]:
# Exhaustive search over `parameters` with 10-fold cross-validation, using all
# available CPU cores (n_jobs=-1)
model_search = GridSearchCV(estimator=model, param_grid=parameters, cv=10, n_jobs=-1)

# Evaluate every parameter combination on the training data
model_search.fit(x_train, y_train)

In [None]:
# Best hyperparameter combination found by the search and its mean
# cross-validated score
best_params, best_score = model_search.best_params_, model_search.best_score_
print(best_params)
print(best_score)

In [None]:
# Predict on the test set with the grid search; with the default refit=True
# this uses the best estimator refitted on the whole training set
pred = model_search.predict(x_test)

In [None]:
# Test-set R^2 of the tuned random forest. r2_score takes (y_true, y_pred) and
# is not symmetric, so the ground truth is passed first.
r2_score(y_test, pred)

In [None]:
# Gradient Boosting Regressor: 500 shallow trees (depth 2) with a learning rate
# of 0.8; these values are a starting point and can be tuned later
model1 = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.8,
    n_estimators=500,
)

# Fit on the training split
model1.fit(x_train, y_train)

# Predictions for the held-out test features
y_pred = model1.predict(x_test)

# (training R^2, test R^2) - a large gap between the two indicates overfitting
(model1.score(x_train, y_train), r2_score(y_test, y_pred))

# Trying Other Models


In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Create a K-Nearest Neighbors Regressor model that averages the 2 nearest neighbours
neigh = KNeighborsRegressor(n_neighbors=2)

# Train the KNN model on the training data
neigh.fit(x_train, y_train)

# Predict with `neigh` itself so the test R^2 below describes the KNN model
# rather than the random forest stored in `model`
y_pred2 = neigh.predict(x_test)

# (training R^2, test R^2) of the KNN model
neigh.score(x_train, y_train),r2_score(y_test, y_pred2)

In [None]:
from sklearn.kernel_ridge import KernelRidge
# Kernel Ridge Regression; alpha is the regularization strength and can be adjusted
krr = KernelRidge(alpha=1.0)

# Fit on the training split
krr.fit(x_train, y_train)

# Predictions for the held-out test features
y_pred3 = krr.predict(x_test)

# (training R^2, test R^2) of the KRR model
(krr.score(x_train, y_train), r2_score(y_test, y_pred3))

# Save the best model which has the maximum scores

In [None]:
import pickle

In [None]:
# Save the gradient-boosting model (model1) to 'med_model2.pkl' in the working
# directory. The context manager guarantees the file is flushed and closed even
# if pickling fails; a bare open() left the handle open.
with open('med_model2.pkl', "wb") as model_file:
    pickle.dump(model1, model_file)