# 0.0 IMPORTS, FUNCTIONS AND CONNECTIONS

## 0.1 PACKAGES AND LIBRARIES

In [1]:
import pandas as pd
import psycopg2 as pg
import seaborn as sns
import warnings
import dotenv
import os

from matplotlib import pyplot as plt
from sqlalchemy import create_engine

from IPython.core.display import HTML
from IPython.display import Image

## 0.2 Helper Function

In [2]:
warnings.filterwarnings ('ignore')

def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:90% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


## 0.3 Database Connections

In [4]:
dotenv.load_dotenv(dotenv.find_dotenv())
url_sql = os.getenv('url_sql')

engine = create_engine(url_sql)

In [5]:
sql_query = '''
SELECT u.id, u.age, u.region_code, u.policy_sales_channel,
       i.previously_insured, i.annual_premium, i.vintage, i.response,
       v.driving_license, v.vehicle_age, v.vehicle_damage
       FROM pa004.users u LEFT JOIN pa004.insurance i ON (u.id = i.id)
                          LEFT JOIN pa004.vehicle v  ON (u.id = v.id)                   
                          '''

df_raw = pd.read_sql_query(sql_query, con=engine)

In [6]:
df_raw.head()

Unnamed: 0,id,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,response,driving_license,vehicle_age,vehicle_damage
0,7,23,11.0,152.0,0,23367.0,249,0,1,< 1 Year,Yes
1,13,41,15.0,14.0,1,31409.0,221,0,1,1-2 Year,No
2,18,25,35.0,152.0,1,46622.0,299,0,1,< 1 Year,No
3,31,26,8.0,160.0,0,2630.0,136,0,1,< 1 Year,No
4,39,45,8.0,124.0,0,42297.0,264,0,1,1-2 Year,Yes


# 1.0 DESCRIÇÃO DOS DADOS

In [7]:
df1 = df_raw.copy()

## 1.1 Data Description

In [8]:
df1.dtypes

id                        int64
age                       int64
region_code             float64
policy_sales_channel    float64
previously_insured        int64
annual_premium          float64
vintage                   int64
response                  int64
driving_license           int64
vehicle_age              object
vehicle_damage           object
dtype: object

In [9]:
df1['region_code'] = df1['region_code'].astype(int64)

df1['policy_sales_channel'] = df1['policy_sales_channel'].astype(int64)

df1['vehicle_damage'] = df1['vehicle_damage'].apply(lambda x: 1 if x == 'Yes' else 0)

In [10]:
df1.head()

Unnamed: 0,id,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,response,driving_license,vehicle_age,vehicle_damage
0,7,23,11,152,0,23367.0,249,0,1,< 1 Year,1
1,13,41,15,14,1,31409.0,221,0,1,1-2 Year,0
2,18,25,35,152,1,46622.0,299,0,1,< 1 Year,0
3,31,26,8,160,0,2630.0,136,0,1,< 1 Year,0
4,39,45,8,124,0,42297.0,264,0,1,1-2 Year,1


In [11]:
print(f'Number of Rows: {df1.shape[0]}')
print(f'Number of Rows: {df1.shape[1]}')

Number of Rows: 381109
Number of Rows: 11


In [12]:
df1.isna().sum()

id                      0
age                     0
region_code             0
policy_sales_channel    0
previously_insured      0
annual_premium          0
vintage                 0
response                0
driving_license         0
vehicle_age             0
vehicle_damage          0
dtype: int64

In [20]:
#Descriptive table

df1_num = df1.select_dtypes(include=['int64', 'float64'])



Unnamed: 0,id,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,response,driving_license,vehicle_damage
0,1.0,20.0,0.0,1.0,0.0,2630.0,10.0,0.0,0.0,0.0
