# Life Cycle of a Machine Learning Project

* Understanding the Problem Statement
* Data Collection
* Data Checks to perform
* Exploratory data analysis
* Data Pre-Processing
* Model Training
* Choose best model

In [1]:
# importing libraries and packages
import pandas as pd 
import numpy as np 

In [3]:
# Load the insurance dataset from the relative data/ folder into a DataFrame,
# then preview its first rows to confirm the columns were parsed as expected.
df = pd.read_csv('data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(1338, 7)

### Column Summary

In [9]:
def column_summary(df, top_n=10):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to profile.
    top_n : int, default 10
        Maximum number of most frequent values kept per column in
        ``distinct_values_counts``. Expected to be non-negative. Columns with
        at most ``top_n`` distinct values are reported in full.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, with the fields:

        * ``col_name`` - column label
        * ``col_dtype`` - dtype of the column
        * ``num_of_nulls`` - count of missing values
        * ``null%`` - missing values as a percentage of rows, rounded to 2 dp
        * ``num_of_non_nulls`` - count of non-missing values
        * ``num_of_distinct_values`` - ``DataFrame.nunique`` result
        * ``distinct_values_counts`` - dict mapping value -> frequency for the
          ``top_n`` most frequent values, most frequent first
    """
    # Computed once and reused for both the absolute count and the percentage.
    null_counts = df.isnull().sum()

    summary_df = pd.DataFrame({
        'col_name': df.columns,
        'col_dtype': df.dtypes.values,
        'num_of_nulls': null_counts.values,
        'null%': ((null_counts / df.shape[0]) * 100).round(2).values,
        'num_of_non_nulls': df.count().values,
        'num_of_distinct_values': df.nunique().values,
    })

    # value_counts() is already sorted by descending frequency, so head(top_n)
    # covers both the "few distinct values" and the "many distinct values"
    # cases without a separate branch, and is evaluated once per column.
    summary_df['distinct_values_counts'] = [
        column.value_counts().head(top_n).to_dict()
        for _, column in df.items()
    ]

    return summary_df

# profile every column of the loaded dataset; the result is displayed in the next cell
summary = column_summary(df)

In [10]:
# display the per-column summary (dtype, null counts, distinct values and their frequencies)
summary

Unnamed: 0,col_name,col_dtype,num_of_nulls,null%,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,age,int64,0,0.0,1338,47,"{18: 69, 19: 68, 46: 29, 52: 29, 50: 29, 47: 2..."
1,sex,object,0,0.0,1338,2,"{'male': 676, 'female': 662}"
2,bmi,float64,0,0.0,1338,548,"{32.3: 13, 28.31: 9, 30.8: 8, 31.35: 8, 30.495..."
3,children,int64,0,0.0,1338,6,"{0: 574, 1: 324, 2: 240, 3: 157, 4: 25, 5: 18}"
4,smoker,object,0,0.0,1338,2,"{'no': 1064, 'yes': 274}"
5,region,object,0,0.0,1338,4,"{'southeast': 364, 'southwest': 325, 'northwes..."
6,charges,float64,0,0.0,1338,1337,"{1639.5631: 2, 6203.90175: 1, 12981.3457: 1, 2..."


In [15]:
# Remove exact duplicate rows, printing the shape before and after so the
# number of dropped rows is visible in the cell output.
print(df.shape)

# drop_duplicates() keeps every row when nothing is duplicated, so a separate
# df.duplicated() pass is unnecessary. Reassigning (instead of inplace=True)
# avoids mutating an object other cells may still reference, and
# ignore_index=True renumbers the remaining rows 0..n-1.
df = df.drop_duplicates(ignore_index=True)

print(df.shape)

(1338, 7)
(1337, 7)
