# Imports

In [1]:
import pandas as pd

# Loading

In [2]:
# Read the raw data from `dataset.csv` (path is relative to the working directory)
df = pd.read_csv('dataset.csv')

# Overview

In [3]:
# Dimensions of the freshly loaded frame, as (rows, columns)
df.shape

(1338, 7)

In [4]:
# Per-column dtype and non-null count, plus the frame's memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Peek at 5 randomly drawn rows; no `random_state` is passed, so the rows
# shown change on every run
df.sample(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1062,59,male,41.14,1,yes,southeast,48970.2476
561,54,female,32.68,0,no,northeast,10923.9332
1278,39,male,29.925,1,yes,northeast,22462.04375
444,56,male,26.695,1,yes,northwest,26109.32905
302,56,female,37.51,2,no,southeast,12265.5069


# Missing Values

<font color='orangered'>👉 **According to the results of `info`, there are no missing values (at least, none encoded as `np.nan`).**</font>

## Classic Double-Check

In [6]:
# Double-check: count the missing (NaN/None) entries in each column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

## Inspecting `object` variables

In [7]:
# Walk over the text (`object`) columns and list the distinct values of each one
for name, values in df.select_dtypes("object").items():
    print(f"Number of modalities for column {name}")
    print(values.unique())

Number of modalities for column sex
['female' 'male']
Number of modalities for column smoker
['yes' 'no']
Number of modalities for column region
['southwest' 'southeast' 'northwest' 'northeast']


<font color='orangered'>👉 **Everything seems ok...**</font>

# Duplicates

In [8]:
# Number of rows that are exact repeats of an earlier row
df.duplicated().sum()

1

In [9]:
# Show every copy of the duplicated rows: `keep=False` flags all occurrences,
# not only the repeats
df[df.duplicated(keep=False)]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
195,19,male,30.59,0,no,northwest,1639.5631
581,19,male,30.59,0,no,northwest,1639.5631


<font color='orangered'>👉 **Let's drop one of them.**</font>

In [10]:
# Shape before removing the duplicate, for comparison with the next cell
df.shape

(1338, 7)

In [11]:
# Drop exact duplicate rows (the first occurrence is kept), then confirm the
# new shape. The index is not reset, so the surviving rows keep their original
# labels.
df = df.drop_duplicates()
df.shape

(1337, 7)

# Outliers?

In [12]:
# Summary statistics of the numeric columns, used to eyeball implausible extremes
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


<font color='orangered'>👉 **A `bmi` of 53 seems impossible, but its distribution needs to be investigated further before drawing any conclusion.**</font>

<font color='orangered'>👉 **A `charges` of 63770 seems suspicious too, but it also needs further exploration.**</font>

# Encoding

In [13]:
# Re-check the dtypes: the `object` columns are the ones still to be encoded
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


## Binary Encoding

In [14]:
# Distinct values of `sex` (candidates for a 0/1 encoding)
df["sex"].unique()

array(['female', 'male'], dtype=object)

In [15]:
# Distinct values of `smoker` (candidates for a 0/1 encoding)
df["smoker"].unique()

array(['yes', 'no'], dtype=object)

In [16]:
# Lookup tables turning each two-valued text column into a 0/1 integer flag
sex_mapping = {
    "female": 0,
    "male": 1,
}
smoker_mapping = {
    "no": 0,
    "yes": 1,
}

In [17]:
# Encode the two binary columns as 0/1 integers using the lookup tables above.
# Values that are not keys of the mapping (e.g. already-encoded 0/1) pass through
# unchanged, so re-running this cell is harmless; a plain `.map(mapping)` would
# turn them into NaN on the second run.
df["sex"] = df["sex"].map(lambda value: sex_mapping.get(value, value))
df["smoker"] = df["smoker"].map(lambda value: smoker_mapping.get(value, value))

# Fail loudly if any unexpected category was left un-encoded.
assert df["sex"].isin(list(sex_mapping.values())).all(), "unexpected value in `sex`"
assert df["smoker"].isin(list(smoker_mapping.values())).all(), "unexpected value in `smoker`"

## One-Hot-Encoding

In [18]:
# Distinct values of `region` (more than two, hence one-hot encoding)
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [19]:
# One-hot encode `region`: one integer 0/1 column per region, prefixed with `reg`
df_dummy_regions = pd.get_dummies(
    df["region"],
    prefix="reg",
    dtype="int",
)
df_dummy_regions

Unnamed: 0,reg_northeast,reg_northwest,reg_southeast,reg_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


# Forming New `DataFrame`

## Dropping `region` then merging

In [20]:
# Copy of the frame without the text `region` column (its dummies are added next)
df_no_region = df.drop(columns="region")
df_no_region

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,0,27.900,0,1,16884.92400
1,18,1,33.770,1,0,1725.55230
2,28,1,33.000,3,0,4449.46200
3,33,1,22.705,0,0,21984.47061
4,32,1,28.880,0,0,3866.85520
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830
1334,18,0,31.920,0,0,2205.98080
1335,18,0,36.850,0,0,1629.83350
1336,21,0,25.800,0,0,2007.94500


In [21]:
# Put the region dummies next to the remaining columns; rows are aligned on the index
df_cleaned = pd.concat(
    [df_no_region, df_dummy_regions],
    axis="columns",
)
df_cleaned.sample(3)

Unnamed: 0,age,sex,bmi,children,smoker,charges,reg_northeast,reg_northwest,reg_southeast,reg_southwest
469,18,0,24.09,1,0,2201.0971,0,0,1,0
1108,26,1,30.0,1,0,2904.088,0,0,0,1
435,60,1,33.11,3,0,13919.8229,0,0,1,0


## Reorganizing Columns

In [22]:
# Final column layout: features grouped by kind, target last
new_cols = (
    ['age', 'children', 'bmi']  # numeric variables
    + ['sex', 'smoker']  # binary variables
    + ['reg_northeast', 'reg_northwest', 'reg_southeast', 'reg_southwest']  # one-hot variables
    + ['charges']  # target
)
df_cleaned = df_cleaned[new_cols]
df_cleaned

Unnamed: 0,age,children,bmi,sex,smoker,reg_northeast,reg_northwest,reg_southeast,reg_southwest,charges
0,19,0,27.900,0,1,0,0,0,1,16884.92400
1,18,1,33.770,1,0,0,0,1,0,1725.55230
2,28,3,33.000,1,0,0,0,1,0,4449.46200
3,33,0,22.705,1,0,0,1,0,0,21984.47061
4,32,0,28.880,1,0,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...,...,...,...
1333,50,3,30.970,1,0,0,1,0,0,10600.54830
1334,18,0,31.920,0,0,1,0,0,0,2205.98080
1335,18,0,36.850,0,0,0,0,1,0,1629.83350
1336,21,0,25.800,0,0,0,0,0,1,2007.94500


# Export

In [26]:
# Write the cleaned frame to disk; `index=False` leaves the row labels out of the file
df_cleaned.to_csv('cleaned_dataset.csv', index=False)

In [27]:
# Double-check: reload the exported CSV and preview its first rows
df2 = pd.read_csv('cleaned_dataset.csv')
df2.head()

Unnamed: 0,age,children,bmi,sex,smoker,reg_northeast,reg_northwest,reg_southeast,reg_southwest,charges
0,19,0,27.9,0,1,0,0,0,1,16884.924
1,18,1,33.77,1,0,0,0,1,0,1725.5523
2,28,3,33.0,1,0,0,0,1,0,4449.462
3,33,0,22.705,1,0,0,1,0,0,21984.47061
4,32,0,28.88,1,0,0,1,0,0,3866.8552
