In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the raw medical-cost records; the path is relative to the working directory.
source_csv = r'medical_costs.csv'
dataset = pd.read_csv(source_csv)

In [3]:
# Bare expression so the notebook renders the frame (long frames are shown with the middle rows elided).
dataset

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Medical Cost
0,58,male,15.6,2,yes,northwest,17907.54
1,24,male,29.8,0,yes,northeast,16312.64
2,50,male,29.0,5,no,northwest,6819.21
3,35,male,34.0,1,no,southeast,5247.87
4,31,female,17.6,3,yes,southeast,17525.49
...,...,...,...,...,...,...,...
9995,24,female,26.9,2,yes,southeast,16551.53
9996,49,female,33.4,3,no,northeast,6376.05
9997,52,female,38.1,5,yes,northwest,18760.27
9998,24,male,33.4,4,no,northwest,5471.91


# Understanding the dataset

In [4]:
# Print column names, non-null counts and dtypes to see which columns are text (object) and need encoding.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           10000 non-null  int64  
 1   Sex           10000 non-null  object 
 2   BMI           10000 non-null  float64
 3   Children      10000 non-null  int64  
 4   Smoker        10000 non-null  object 
 5   Region        10000 non-null  object 
 6   Medical Cost  10000 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 547.0+ KB


In [5]:
# Count missing values per column.
dataset.isna().sum()

Age             0
Sex             0
BMI             0
Children        0
Smoker          0
Region          0
Medical Cost    0
dtype: int64

In [6]:
# List the distinct labels in the Sex column before encoding it.
dataset['Sex'].unique()

array(['male', 'female'], dtype=object)

In [7]:
# List the distinct labels in the Smoker column before encoding it.
dataset['Smoker'].unique()

array(['yes', 'no'], dtype=object)

In [8]:
# List the distinct labels in the Region column before encoding it.
dataset['Region'].unique()

array(['northwest', 'northeast', 'southeast', 'southwest'], dtype=object)

In [9]:
# Summary statistics of the numeric columns, transposed so each row describes one column of the data.
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,41.6784,13.807724,18.0,30.0,42.0,54.0,65.0
BMI,10000.0,27.40301,7.22896,15.0,21.1,27.4,33.7,40.0
Children,10000.0,2.5017,1.701672,0.0,1.0,2.0,4.0,5.0
Medical Cost,10000.0,11898.932216,6073.875834,3617.09,5909.925,7957.43,17931.9625,20268.21


# LabelEncoder

In [10]:
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder in `encoded_sex` so the integer codes can later be
# mapped back to their labels; the codes go into a new 'Encoded_Sex' column
# and the original text column is left in place for now.
encoded_sex = LabelEncoder()
dataset = dataset.assign(Encoded_Sex=encoded_sex.fit_transform(dataset['Sex']))
dataset

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Medical Cost,Encoded_Sex
0,58,male,15.6,2,yes,northwest,17907.54,1
1,24,male,29.8,0,yes,northeast,16312.64,1
2,50,male,29.0,5,no,northwest,6819.21,1
3,35,male,34.0,1,no,southeast,5247.87,1
4,31,female,17.6,3,yes,southeast,17525.49,0
...,...,...,...,...,...,...,...,...
9995,24,female,26.9,2,yes,southeast,16551.53,0
9996,49,female,33.4,3,no,northeast,6376.05,0
9997,52,female,38.1,5,yes,northwest,18760.27,0
9998,24,male,33.4,4,no,northwest,5471.91,1


We encoded the Sex column using LabelEncoder().
Here,
+ 1 is for male
+ 0 is for female

In [11]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column so each keeps its own label <-> code mapping.
# LabelEncoder numbers the labels in sorted order, starting at 0.
# NOTE(review): Region is a nominal category; integer codes imply an ordering
# that may not be meaningful for the downstream model - confirm this is intended.
encoded_smoker = LabelEncoder()
encoded_region = LabelEncoder()
dataset = dataset.assign(
    Encoded_Smoker=encoded_smoker.fit_transform(dataset['Smoker']),
    Encoded_Region=encoded_region.fit_transform(dataset['Region']),
)
dataset

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Medical Cost,Encoded_Sex,Encoded_Smoker,Encoded_Region
0,58,male,15.6,2,yes,northwest,17907.54,1,1,1
1,24,male,29.8,0,yes,northeast,16312.64,1,1,0
2,50,male,29.0,5,no,northwest,6819.21,1,0,1
3,35,male,34.0,1,no,southeast,5247.87,1,0,2
4,31,female,17.6,3,yes,southeast,17525.49,0,1,2
...,...,...,...,...,...,...,...,...,...,...
9995,24,female,26.9,2,yes,southeast,16551.53,0,1,2
9996,49,female,33.4,3,no,northeast,6376.05,0,0,0
9997,52,female,38.1,5,yes,northwest,18760.27,0,1,1
9998,24,male,33.4,4,no,northwest,5471.91,1,0,1


We encoded the Smoker and Region columns using LabelEncoder().
Here, Encoded_Smoker is:
+ 1 is for yes
+ 0 is for no

And Encoded_Region is (LabelEncoder numbers the labels in alphabetical order, starting at 0):
+ 0 is for northeast
+ 1 is for northwest
+ 2 is for southeast
+ 3 is for southwest

# Pickling encoder model

In [12]:
# Disabled: would persist the three fitted encoders to .pkl files so the same
# label -> code mapping can be reused outside this notebook.
# NOTE(review): as written, the opened file handles are never closed; if this
# cell is re-enabled, open each file in a `with` block.
#import pickle as pkl
#sex_model = open('sex_encoder.pkl' , 'wb')
#smoker_model = open('smoker_encoder.pkl' , 'wb')
#region_model = open('region_encoder.pkl' , 'wb')
#pkl.dump( encoded_sex , sex_model )
#pkl.dump( encoded_smoker , smoker_model)
#pkl.dump( encoded_region , region_model)

# Saving the PreProcessed dataset

In [13]:
# The text columns now have integer-coded counterparts, so drop the originals.
dataset = dataset.drop(columns=['Sex', 'Smoker', 'Region'])

In [14]:
# Display the frame to confirm only the numeric and encoded columns remain.
dataset

Unnamed: 0,Age,BMI,Children,Medical Cost,Encoded_Sex,Encoded_Smoker,Encoded_Region
0,58,15.6,2,17907.54,1,1,1
1,24,29.8,0,16312.64,1,1,0
2,50,29.0,5,6819.21,1,0,1
3,35,34.0,1,5247.87,1,0,2
4,31,17.6,3,17525.49,0,1,2
...,...,...,...,...,...,...,...
9995,24,26.9,2,16551.53,0,1,2
9996,49,33.4,3,6376.05,0,0,0
9997,52,38.1,5,18760.27,0,1,1
9998,24,33.4,4,5471.91,1,0,1


In [15]:
# Give the encoded columns the names of the text columns they replaced.
rename = {
    'Encoded_Sex': 'Sex',
    'Encoded_Smoker': 'Smoker',
    'Encoded_Region': 'Region',
}
dataset = dataset.rename(columns=rename)
dataset

Unnamed: 0,Age,BMI,Children,Medical Cost,Sex,Smoker,Region
0,58,15.6,2,17907.54,1,1,1
1,24,29.8,0,16312.64,1,1,0
2,50,29.0,5,6819.21,1,0,1
3,35,34.0,1,5247.87,1,0,2
4,31,17.6,3,17525.49,0,1,2
...,...,...,...,...,...,...,...
9995,24,26.9,2,16551.53,0,1,2
9996,49,33.4,3,6376.05,0,0,0
9997,52,38.1,5,18760.27,0,1,1
9998,24,33.4,4,5471.91,1,0,1


In [16]:
# Check the current column order; 'Medical Cost' sits in the middle after the drop/rename steps.
dataset.columns

Index(['Age', 'BMI', 'Children', 'Medical Cost', 'Sex', 'Smoker', 'Region'], dtype='object')

In [17]:
# Reorder the columns so that 'Medical Cost' comes last.
column_order = ['Age', 'BMI', 'Children', 'Sex', 'Smoker', 'Region', 'Medical Cost']
dataset = dataset.loc[:, column_order]
dataset

Unnamed: 0,Age,BMI,Children,Sex,Smoker,Region,Medical Cost
0,58,15.6,2,1,1,1,17907.54
1,24,29.8,0,1,1,0,16312.64
2,50,29.0,5,1,0,1,6819.21
3,35,34.0,1,1,0,2,5247.87
4,31,17.6,3,0,1,2,17525.49
...,...,...,...,...,...,...,...
9995,24,26.9,2,0,1,2,16551.53
9996,49,33.4,3,0,0,0,6376.05
9997,52,38.1,5,0,1,1,18760.27
9998,24,33.4,4,1,0,1,5471.91


In [18]:
# Disabled: would write the preprocessed frame to 'dataset.csv'.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column; pass index=False if this cell is re-enabled.
#dataset.to_csv('dataset.csv')