# 0.0. IMPORTS

In [1]:
import math

import inflection
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.display import HTML
from IPython.display import display

## 0.1. Helper Functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


## 0.2. Loading data

In [3]:
# low_memory=False tells read_csv to read the whole file into memory in one go
df_insurance_raw = pd.read_csv(
    '../Datasets/raw/train.csv',
    low_memory=False,
)

# 1.0. DATA DESCRIPTION

In [4]:
# Work on an independent (deep) copy so the raw frame stays untouched.
df1 = df_insurance_raw.copy(deep=True)

## 1.1. Rename Columns

In [5]:
# Show the column labels as loaded, before they are renamed to snake_case.
df1.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [6]:
# Take the current labels from the frame itself. A hand-copied list can drift
# from the data (it previously said 'Id' while the frame has 'id'), and because
# the new labels are assigned positionally such drift would go unnoticed.
cols_old = list(df1.columns)


def snakecase(name):
    """Return `name` converted to snake_case using inflection.underscore."""
    return inflection.underscore(name)


cols_new = [snakecase(col) for col in cols_old]

# rename
df1.columns = cols_new
df1.columns

Index(['id', 'gender', 'age', 'driving_license', 'region_code',
       'previously_insured', 'vehicle_age', 'vehicle_damage', 'annual_premium',
       'policy_sales_channel', 'vintage', 'response'],
      dtype='object')

## 1.2. Data Dimensions

In [7]:
# df1.shape is (rows, columns); unpack once and report both.
n_rows, n_cols = df1.shape
print('Number of Rows: {}'.format(n_rows))
print('Number of Cols: {}'.format(n_cols))

Number of Rows: 381109
Number of Cols: 12


## 1.3. Data Types

In [15]:
# NOTE(review): this cell's execution count (15) is out of sequence and its saved
# output already lists the converted string dtypes, so it appears to have been
# re-run after the later type-conversion cells. Re-execute with
# "Restart Kernel -> Run All" to see the dtypes as loaded.
df1.dtypes

id                      string[python]
gender                  string[python]
age                              int64
driving_license         string[python]
region_code             string[python]
previously_insured      string[python]
vehicle_age             string[python]
vehicle_damage          string[python]
annual_premium                 float64
policy_sales_channel    string[python]
vintage                          int64
response                string[python]
dtype: object

In [9]:
# Preview the first five rows to sanity-check the renamed columns and their values.
df1.head()

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


## 1.4. Check NA

In [10]:
# Count missing values per column.
df1.isna().sum()

id                      0
gender                  0
age                     0
driving_license         0
region_code             0
previously_insured      0
vehicle_age             0
vehicle_damage          0
annual_premium          0
policy_sales_channel    0
vintage                 0
response                0
dtype: int64

## 1.5. Fill Out NA

## 1.6. Change Types

In [11]:
# Columns that act as identifiers or categories are stored as pandas strings.
string_cols = [
    'id', 'gender', 'driving_license', 'previously_insured', 'vehicle_damage',
    'policy_sales_channel', 'region_code', 'response',
]

df1 = (
    df1
    # The two float-coded columns are cast to int first so the string form
    # carries no decimal part.
    .astype({'policy_sales_channel': 'int', 'region_code': 'int'})
    .astype({col: 'string' for col in string_cols})
)
df1.dtypes

id                      string[python]
gender                  string[python]
age                              int64
driving_license         string[python]
region_code             string[python]
previously_insured      string[python]
vehicle_age                     object
vehicle_damage          string[python]
annual_premium                 float64
policy_sales_channel    string[python]
vintage                          int64
response                string[python]
dtype: object

### 1.6.1. Mapping Values

In [12]:
# Encode vehicle_damage as '1' (Yes) / '0' (No).
# replace() leaves values that are not dictionary keys unchanged, so re-running
# this cell is a no-op; map() would turn the already-encoded '1'/'0' into
# missing values on a second run.
df1['vehicle_damage'] = (
    df1['vehicle_damage']
    .replace({'Yes': '1', 'No': '0'})
    .astype('string')
)

# Fail loudly if anything other than the two expected codes (or a missing value) remains.
assert df1['vehicle_damage'].isin(['0', '1']).all(), 'unexpected value in vehicle_damage'

In [13]:
# List the distinct vehicle_age labels that the mapping below has to cover.
df1['vehicle_age'].unique()

array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object)

In [14]:
# Encode vehicle_age as an ordered code: '0' (< 1 Year), '1' (1-2 Year), '2' (> 2 Years).
# replace() leaves values that are not dictionary keys unchanged, so re-running
# this cell is a no-op; map() would turn the already-encoded codes into missing
# values on a second run.
df1['vehicle_age'] = (
    df1['vehicle_age']
    .replace({'< 1 Year': '0', '1-2 Year': '1', '> 2 Years': '2'})
    .astype('string')
)

# Fail loudly if anything other than the three expected codes (or a missing value) remains.
assert df1['vehicle_age'].isin(['0', '1', '2']).all(), 'unexpected value in vehicle_age'

## 1.7. Descriptive Statistics

### 1.7.1 Numerical Attributes

### 1.7.2 Categorical Attributes