<a href="https://colab.research.google.com/github/Aarushi1512006/insurance-pricing-analysis/blob/main/insurance_cost_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the raw insurance dataset. The path is absolute; presumably the Colab
# upload location (the notebook carries an "Open In Colab" badge).
csv_path = '/content/insurance.csv'
df = pd.read_csv(csv_path)

In [None]:
df

# ***Exploratory Data Analysis (EDA)***

In [None]:
df.shape

In [None]:
df.head()

In [None]:

df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.columns

In [None]:
numeric_columns = ['age', 'bmi', 'children', 'charges']

# Draw one histogram (20 bins, KDE overlay) per numeric column, each on its
# own 6x4-inch figure.
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=col, kde=True, bins=20, ax=ax)

In [None]:
# Number of rows for each value of `children`.
sns.countplot(data=df, x='children')

In [None]:
# Number of rows for each value of `sex`.
sns.countplot(data=df, x='sex')

In [None]:
# Number of rows for each value of `smoker`.
sns.countplot(data=df, x='smoker')

In [None]:
# One horizontal box plot per numeric column, to eyeball spread and outliers.
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x=col, ax=ax)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap. Values close to 0 mean two columns are barely related; the original
# author noted roughly 0.3 for age vs. charges (some correlation) and 0.042
# for children vs. age (almost none).
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True)

# Data Cleaning and Preprocessing

In [None]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_cleaned = df.copy(deep=True)

In [None]:
df_cleaned.head()

In [None]:
df_cleaned.shape

In [None]:
# Remove fully duplicated rows (the first occurrence is kept).
df_cleaned = df_cleaned.drop_duplicates()

In [None]:
df_cleaned.shape

In [None]:
df_cleaned.isnull().sum()

In [None]:
df_cleaned.dtypes

In [None]:
# Check the distinct labels in `sex` before encoding: inconsistent spellings
# such as "Male" / "MALE" would show up here as separate entries.
df_cleaned['sex'].value_counts()
# The original author observed only "male" and "female" here.

In [None]:
# Label-encode `sex`: male -> 0, female -> 1.
# Any value not present in the mapping becomes NaN.
sex_codes = {"male": 0, "female": 1}
df_cleaned['sex'] = df_cleaned['sex'].map(sex_codes)

In [None]:
df_cleaned.head()

In [None]:
df_cleaned['smoker'].value_counts()

In [None]:
# Label-encode `smoker`: yes -> 1, no -> 0.
# Any value not present in the mapping becomes NaN.
smoker_codes = {"yes": 1, "no": 0}
df_cleaned['smoker'] = df_cleaned['smoker'].map(smoker_codes)

In [None]:
df_cleaned.head()

In [None]:
# Rename the encoded columns so the name states what a value of 1 means.
df_cleaned = df_cleaned.rename(
    columns={'sex': 'is_female', 'smoker': 'is_smoker'}
)

In [None]:
df_cleaned.head()

In [None]:
# Inspect the region labels on the cleaned frame (duplicates already dropped),
# since `df_cleaned` -- not the raw `df` -- is the frame being encoded.
df_cleaned['region'].value_counts()

In [None]:
# One-hot encode `region`: it is replaced by one `region_<value>` column per label.
df_cleaned = pd.get_dummies(data=df_cleaned, columns=['region'])

In [None]:

df_cleaned.head()

In [None]:
# Convert only the boolean one-hot columns to 0/1 integers.
# A blanket `df_cleaned.astype(int)` would also truncate every float column
# (presumably `bmi` and `charges`) to whole numbers, silently discarding
# information before the BMI binning and the regression.
bool_cols = df_cleaned.select_dtypes(include='bool').columns
df_cleaned = df_cleaned.astype(dict.fromkeys(bool_cols, int))

In [None]:
df_cleaned

# **Feature Engineering and Extraction**

In [None]:
# Distribution of BMI, as a guide for the category bins defined next.
sns.histplot(data=df_cleaned, x='bmi')

In [None]:
# Bucket BMI into the standard adult weight-status categories:
#   Underweight: < 18.5 | Normal: 18.5 to < 25 | Overweight: 25 to < 30 | Obese: >= 30
# `right=False` makes each interval closed on the left and open on the right,
# so boundary values land in the higher category (18.5 -> Normal, 25 ->
# Overweight, 30 -> Obese). The previous edges (24.9 / 29.9, right-closed)
# put 18.5 in Underweight and values such as 24.95 or 29.95 one category too high.
df_cleaned['bmi_category'] = pd.cut(
    df_cleaned['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

In [None]:
df_cleaned

In [None]:
# One-hot encode `bmi_category` into one `bmi_category_<label>` column per label.
df_cleaned = pd.get_dummies(data=df_cleaned, columns=['bmi_category'])

In [None]:
df_cleaned

In [None]:
# Convert only the boolean one-hot columns to 0/1 integers.
# Casting the whole frame with `astype(int)` would also truncate any float
# columns (presumably `bmi` and `charges`) to whole numbers.
bool_cols = df_cleaned.select_dtypes(include='bool').columns
df_cleaned = df_cleaned.astype(dict.fromkeys(bool_cols, int))

In [None]:
df_cleaned

# Feature Scaling

In [None]:
df_cleaned.columns # columns whose values are very large are not good for the ML model, so they need to be converted into the range of the other columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features (zero mean, unit variance) so that no
# column dominates just because of its scale.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test rows influence the scaling
# statistics. Consider fitting on the training rows only.
cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaler.fit(df_cleaned[cols])
df_cleaned[cols] = scaler.transform(df_cleaned[cols])

In [None]:
df_cleaned.head()

In [None]:
df_cleaned.columns

In [None]:
from scipy.stats import pearsonr

# Pearson correlation of every candidate feature with the target `charges`.
selected_features = [
    'age',
    'is_female',
    'bmi',
    'children',
    'is_smoker',
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
    'bmi_category_Underweight',
    'bmi_category_Normal',
    'bmi_category_Overweight',
    'bmi_category_Obese',
]

correlations = {}
for feature in selected_features:
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    correlations[feature] = pearsonr(df_cleaned[feature], df_cleaned['charges'])[0]

correlation_df = pd.DataFrame(
    {
        'Feature': list(correlations.keys()),
        'Pearson Correlation': list(correlations.values()),
    }
)

# Display the features from most positively to most negatively correlated.
correlation_df.sort_values(by='Pearson Correlation', ascending=False)

In [None]:
# Chi-square test: the binary (0/1) features to test against the binned target.
cat_features = [
    'is_female',
    'is_smoker',
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
    'bmi_category_Underweight',
    'bmi_category_Normal',
    'bmi_category_Overweight',
    'bmi_category_Obese',
]

In [None]:
from scipy.stats import chi2_contingency
import pandas as pd

# Significance level for the test of independence.
alpha = 0.05

# The chi-square test needs a categorical target, so the continuous `charges`
# column is binned into quartiles (labels=False gives integer bin codes 0-3).
df_cleaned['charges_bin'] = pd.qcut(df_cleaned['charges'], q=4, labels=False)

chi2_results = {}
for col in cat_features:
    # Contingency table: feature value (rows) x charges quartile (columns).
    contingency = pd.crosstab(df_cleaned[col], df_cleaned['charges_bin'])
    chi2_stat, p_val, _, _ = chi2_contingency(contingency)
    # p < alpha: the feature and the charges quartile are not independent.
    # ("Accept Null" here means "fail to reject the null hypothesis".)
    decision = 'Reject Null (Keep Feature)' if p_val < alpha else 'Accept Null (Drop Feature)'
    chi2_results[col] = {
        'chi2_statistic': chi2_stat,
        'p_value': p_val,
        'Decision': decision,
    }

# Build and sort the summary table once, after the loop, instead of rebuilding
# it on every iteration. One row per feature, most significant first.
chi2_df = pd.DataFrame(chi2_results).T
chi2_df = chi2_df.sort_values(by='p_value')

In [None]:
chi2_df

In [None]:
# Columns kept for modelling: the chosen features plus the target `charges`.
final_columns = [
    'age',
    'is_female',
    'bmi',
    'is_smoker',
    'charges',
    'region_southeast',
    'bmi_category_Obese',
]
final_df = df_cleaned[final_columns]

In [None]:
final_df

# Model Training

In [None]:
#29/01/2026
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target (`charges`) from the feature matrix.
y = final_df['charges']
X = final_df.drop(columns='charges')

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Predictions made on X_test are later compared with y_test to score the model.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression, fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
#30.1.2026
# Predict charges for the held-out test rows; compared against y_test below.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 on the held-out test set.
r2 = r2_score(y_test, y_pred)

# Adjusted R^2 penalises R^2 for the number of predictors:
#   adjusted = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of test rows and p the number of features.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Prints adjusted R^2 first, then plain R^2.
print(adjusted_r2, r2)