In [126]:
#Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [127]:
#Read the csv file
df = pd.read_csv('/content/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1272 non-null   float64
 1   sex       1272 non-null   object 
 2   bmi       1272 non-null   float64
 3   children  1272 non-null   float64
 4   smoker    1272 non-null   object 
 5   region    1272 non-null   object 
 6   charges   1284 non-null   object 
dtypes: float64(3), object(4)
memory usage: 73.3+ KB


In [129]:
df.isnull().sum()

Unnamed: 0,0
age,66
sex,66
bmi,66
children,66
smoker,66
region,66
charges,54


In [130]:
df.shape

(1338, 7)

In [131]:
#defining the columns where mean implemenation for null values are to be done
mean_cols= ['age','bmi']

#Filling null values with the mean of the column values
for col in mean_cols:
    df[col].fillna(df[col].mean(), inplace=True)

In [132]:
#defining the categorical columns where mode implementation for null values are to be done
mode_cols= ['sex','children','smoker','region']

#Filling null values with the mode of the column values
for col in mode_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [133]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,54


In [134]:
#Dropping the rows containing null values in charges columns as this is the target value and it can effect our model if mean or mode implementation is done
df.dropna(inplace=True)

In [135]:
df.info( )

<class 'pandas.core.frame.DataFrame'>
Index: 1284 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1284 non-null   float64
 1   sex       1284 non-null   object 
 2   bmi       1284 non-null   float64
 3   children  1284 non-null   float64
 4   smoker    1284 non-null   object 
 5   region    1284 non-null   object 
 6   charges   1284 non-null   object 
dtypes: float64(3), object(4)
memory usage: 80.2+ KB


In [136]:
df['charges'] = df['charges'].str.replace('$', '').astype(float)

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1284 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1284 non-null   float64
 1   sex       1284 non-null   object 
 2   bmi       1284 non-null   float64
 3   children  1284 non-null   float64
 4   smoker    1284 non-null   object 
 5   region    1284 non-null   object 
 6   charges   1272 non-null   float64
dtypes: float64(4), object(3)
memory usage: 80.2+ KB


In [138]:
df['region'].value_counts()

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
Southeast,206
southeast,166
Northwest,159
southwest,158
Northeast,151
northeast,151
northwest,148
Southwest,145


In [139]:
#Converting case sensitive values in region column to non case sensitive types
df['region'] = df['region'].str.lower()

In [140]:
df['region'].value_counts()

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
southeast,372
northwest,307
southwest,303
northeast,302


In [141]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
male,547
female,492
M,63
woman,61
man,61
F,60


In [142]:
#Standardizing the sex column
MALE = 'male'
FEMALE = 'female'
sex_map = {'M':MALE,'man':MALE,'F':FEMALE,'female':FEMALE}
df['sex'] = df['sex'].map(sex_map)

In [143]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
female,552
male,124


In [144]:
#Applying label encoding on sex column: female-1,male-0
df['sex'] = df['sex'].map({'male':0,'female':1})

In [145]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
1.0,552
0.0,124


In [146]:
df['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
no,1029
yes,255


In [147]:
#Label encoding on smoker column: no-0,yes-1
df['smoker'] = df['smoker'].map({'yes':1,'no':0})

In [148]:
df['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
0,1029
1,255


In [149]:
#Converting negative ages to positive ones by removing the minus sign
df['age'] = df['age'].abs()

In [150]:
#Saving in a CSV file as a cleaned dataset
df.to_csv('cleaned_insurance_data.csv', index=False)