# Machine Learning Engineer Nanodegree
## Projeto Final
## Projeto: Melhorando a retenção de clientes na indústria de seguros

O projeto “Melhorando a retenção de clientes na indústria de seguros” tem como objetivo analisar os dados históricos da carteira de clientes de uma seguradora a fim de encontrar padrões de comportamento nos clientes que não renovaram suas apólices e, com isso, criar um modelo preditivo que, aplicado às apólices vigentes hoje, retorne a probabilidade de determinado cliente não renovar sua apólice ao final do contrato.

Isto possibilitaria à seguradora manter uma régua de comunicação e interação diferenciada com os clientes com alta probabilidade de não renovar. Como esse cliente passa a sentir-se **“Único”**, e isto tem peso na sua decisão, espera-se uma melhora no índice de renovação das apólices.



## Setup

In [None]:
# Warnings: the module is imported, but the "ignore" filter below is
# commented out, so library warnings remain visible.
import warnings
#warnings.filterwarnings("ignore")

# Logging: INFO level on the root logger plus a logger named after this module
import datetime
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Logging setup!")

# Run configuration: month windows written as 'MM/YYYY'
var_train_start, var_train_end = '02/2017', '10/2017'
var_valid = '11/2017'
var_pred = '01/2018'

# Echo the timestamp and configuration of this run
print("Running the model with the following configurations:")
print(datetime.datetime.today())
for caption, month in (
    ("Start training: ", var_train_start),
    ("End training  : ", var_train_end),
    ("Validation    : ", var_valid),
    ("Prediction    : ", var_pred),
):
    print(caption, month)

In [None]:
# Importing libraries
logger.info("Start loading Libraries")
import numpy as np
import pandas as pd
from time import time
from IPython.display import display
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

from plotly import figure_factory as FF
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
from plotly.graph_objs import Scatter, Figure, Layout
from sklearn.metrics import confusion_matrix

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from pandasql import sqldf
import xgboost as xgb
import qgrid

%matplotlib inline

logger.info("Finish loading Libraries")

In [None]:
# Reading data
logger.info("Start reading data")
# The input file is read with ';' as the field separator
insurance_data = pd.read_csv("../input/dataset_insurance.csv", sep=';')
# logging fills placeholders with %-style formatting; a '{}' placeholder is
# never substituted and makes the handler report a formatting error.
logger.debug("insurance_data: %s", insurance_data)

logger.info("Finish reading data")

## Exploração dos dados

In [None]:
# Initialise plotly's notebook mode (init_notebook_mode comes from plotly.offline)
# so that the iplot call further down can render inside the notebook.
init_notebook_mode(True)

In [None]:
# Data analysis
logger.info("Start data exploration")

# Show the raw dataset in an interactive qgrid widget with its toolbar enabled.
# The widget is left as the last expression so the notebook renders it.
qgrid_widget = qgrid.show_grid(insurance_data,show_toolbar = True)
qgrid_widget

In [None]:
# Dataset description: summary statistics of insurance_data, rendered as the
# cell's output because it is the last expression.
insurance_data.describe()

In [None]:
# Dtypes: print the data type pandas assigned to each column of insurance_data
print("insurance_data data types: ")
print(insurance_data.dtypes)


In [None]:
# Frequency of each outcome: number of rows per distinct 'Resultado' value
print("insurance_data 'Resultado' frequency: ")
print(insurance_data['Resultado'].value_counts())

logger.info("Finish data exploration")


## Visualização exploratória

In [None]:
logger.info("Start visual exploration")
# Preparing data for plotting: monthly renewal rate, computed as
# renewed / (renewed + not renewed) for each AnoMes.

# Number of renewed policies per month
renovou_vis = sqldf(
    "SELECT AnoMes, count(*) as IdCount_r "
    "FROM insurance_data "
    "WHERE Resultado = 'Renovou' "
    "group by AnoMes"
)

# Number of non-renewed policies per month
Nrenovou_vis = sqldf(
    "SELECT AnoMes, count(*) as IdCount_Nr "
    "FROM insurance_data "
    "WHERE Resultado = 'NaoRenovou' "
    "group by AnoMes"
)

# The prediction month is excluded from the chart. It is taken from var_pred
# instead of being hard-coded, so the exclusion follows the run configuration.
# var_pred is a notebook constant, not user input, so plain string formatting
# is acceptable here.
# The INNER JOIN keeps only months that have both renewed and non-renewed rows.
insurance_data_vis = sqldf(
    "SELECT a.AnoMes, "
    "(cast(IdCount_r as float)/(cast(IdCount_Nr as float) + cast(IdCount_r as float))) as IndiceRenovacao "
    "FROM renovou_vis a "
    "INNER JOIN Nrenovou_vis b on a.AnoMes = b.AnoMes "
    "WHERE a.AnoMes <> '{}'".format(var_pred)
)



In [None]:


# NOTE(review): the wildcard import is kept because other cells may rely on
# names it brings into the notebook namespace; prefer the `go` alias below.
from plotly.graph_objs import *
import plotly.graph_objs as go

# Line chart of the monthly renewal rate held in insurance_data_vis
x = list(insurance_data_vis['AnoMes'].apply(str))
y = list(insurance_data_vis.IndiceRenovacao)

data = [
    go.Scatter(
        x=x,  # month labels
        y=y   # renewal rate as a fraction between 0 and 1
    )
]

layout = go.Layout(
    title='Índice Renovação 2017',
    # The values are fractions in [0, 1]; tickformat renders them as
    # percentages so the ticks agree with the "(%)" in the axis title.
    yaxis=dict(title='Índice de Renovação (%)', range=[0, 1], tickformat='.0%'),
    xaxis=dict(title='Mes/Ano Vencimento Apólice')
)

fig = go.Figure(data=data, layout=layout)

# Offline iplot draws the figure inline; its return value is not needed,
# so nothing is bound to it.
iplot(fig, filename='pandas/line-plot-title')

In [None]:
# Density plot of the selected features for rows whose outcome is 'NaoRenovou'
density_features = [
    'SaudeFinancCli',
    'ExpSinistroCli',
    'ExpSinistroCorr',
    'IndFechCorr',
    'ExpAss24',
    'ExpCallCenterCli',
]
not_renewed_rows = insurance_data.Resultado == 'NaoRenovou'
insurance_data_vis = insurance_data.loc[not_renewed_rows, density_features]
ax = insurance_data_vis.plot.density(title="Features Density [NaoRenovou]")
ax.set(xlabel="Values", ylabel="Density")

In [None]:
# Density plot of the selected features for rows whose outcome is 'Renovou'
renewed_features = [
    'SaudeFinancCli',
    'ExpSinistroCli',
    'ExpSinistroCorr',
    'IndFechCorr',
    'ExpAss24',
    'ExpCallCenterCli',
]
renewed_rows = insurance_data.Resultado == 'Renovou'
insurance_data_vis = insurance_data.loc[renewed_rows, renewed_features]
ax = insurance_data_vis.plot.density(title="Features Density [Renovou]")
ax.set(xlabel="Values", ylabel="Density")


In [None]:
# Pair plot: pairwise views of the selected features, coloured by outcome
pair_columns = [
    'Resultado',
    'SaudeFinancCli',
    'ExpSinistroCli',
    'ExpSinistroCorr',
    'IndFechCorr',
    'ExpAss24',
    'ExpCallCenterCli',
]
insurance_data_vis = insurance_data[pair_columns]
g = sns.pairplot(data=insurance_data_vis, hue="Resultado", palette="husl")
# Make room above the grid, then place the overall title there
pair_figure = g.fig
pair_figure.subplots_adjust(top=0.9)
pair_figure.suptitle('Cross values - [Renovou/NaoRenovou]', fontsize=16)


In [None]:
# Features correlation: one heatmap per 'Resultado' value
def _facet_corr_heatmap(data, color=None, **kwargs):
    """Draw the correlation heatmap for the rows of a single facet.

    FacetGrid.map_dataframe passes each facet's subset as ``data``. The
    correlation must be computed from that subset; otherwise every facet
    shows the same matrix. Only numeric columns are correlated, so text
    columns such as 'Resultado' or 'AnoMes' cannot break ``corr()``.
    ``color`` and any extra keywords supplied by seaborn are accepted and
    ignored.
    """
    facet_corr = data.select_dtypes(include='number').corr()
    sns.heatmap(facet_corr, linewidths=0, cmap="YlGnBu")


g = sns.FacetGrid(insurance_data, col='Resultado')
g.map_dataframe(_facet_corr_heatmap)
# Leave space for the overall title and enlarge the figure
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Correlation plot - [Renovou/NaoRenovou]', fontsize=16)
g.fig.set_size_inches(11.7, 8.27)
logger.info("Finish visual exploration")

## Pré-processamento de dados

In [None]:
logger.info("Start data processing")
# Helper functions for label encoding and min-max normalization
def dummyEncode(df):
    """Label-encode the categorical/object columns of ``df`` in place.

    The columns 'AnoMes' and 'Resultado' are never encoded. Every other
    column of dtype ``category`` or ``object`` is replaced by the integer
    codes produced by a fresh ``LabelEncoder``.

    Encoding is best-effort: if a column cannot be encoded, a message with
    the reason is printed, that column is left untouched, and processing
    continues with the next one.

    Args:
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    skipped = ('AnoMes', 'Resultado')
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns

    for feature in categorical_columns:
        if feature in skipped:
            continue
        try:
            # One encoder per column: no fitted state is shared between features
            df[feature] = LabelEncoder().fit_transform(df[feature])
        except Exception as err:
            # Keep the best-effort behaviour, but report why the column failed
            print('Fail to encode: {} ({})'.format(feature, err))
    return df
    
def MinMaxScale(df):
    """Min-max scale the int64/float64 columns of ``df`` in place.

    The column 'Resultado' is never scaled. Every other ``int64`` or
    ``float64`` column is rounded to 3 decimals and then transformed by a
    ``MinMaxScaler`` (default settings) fitted on that column alone.

    Scaling is best-effort: if a column cannot be scaled, a message with the
    reason is printed, that column is left untouched, and processing
    continues with the next one.

    Args:
        df: DataFrame to scale; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

    for feature in numeric_columns:
        if feature == 'Resultado':
            continue
        try:
            rounded = df[[feature]].round(3)
            # fit_transform returns an (n, 1) array; flatten it so the column
            # assignment receives a 1-D array
            df[feature] = MinMaxScaler().fit_transform(rounded).ravel()
        except Exception as err:
            # Keep the best-effort behaviour, but report why the column failed
            print('Fail in Normalization {} ({})'.format(feature, err))
    return df

In [None]:
# Saving the unique IDs of the rows to be predicted
is_pred_month = insurance_data.AnoMes == var_pred
pred_IdConta = insurance_data.loc[is_pred_month, 'IdConta']
# logging fills placeholders with %-style formatting, not '{}'
logger.debug("Unique ID of predictions: %s", pred_IdConta)

# Original prediction rows, without any transformation. An explicit copy is
# taken because prediction columns are added to this frame later on.
output_pred = insurance_data[is_pred_month].copy()

# Encode the target: 'NaoRenovou' -> 0, 'Renovou' -> 1, missing -> 0
insurance_data['Resultado'] = (
    insurance_data['Resultado']
    .replace(['NaoRenovou', 'Renovou'], [0, 1])
    .fillna(0.0)
    .astype(int)
)

# Dropping columns that are not used as features
insurance_data = insurance_data.drop(['IdConta', 'Produto'], axis=1)

# Label Encoding
insurance_data = dummyEncode(insurance_data)

# Normalization
# NOTE(review): scaling is fitted on all rows (training, validation and
# prediction months together) — confirm this is intended.
insurance_data = MinMaxScale(insurance_data)

# Data Split
# 'MM/YYYY' strings do not sort chronologically: as text, '02/2018' lies
# between '02/2017' and '10/2017'. The months are therefore parsed before
# applying the training window. Unparseable values become NaT and match
# no window.
month_format = '%m/%Y'
row_month = pd.to_datetime(insurance_data.AnoMes, format=month_format, errors='coerce')
train_from = pd.to_datetime(var_train_start, format=month_format)
train_to = pd.to_datetime(var_train_end, format=month_format)

train = insurance_data[(row_month >= train_from) & (row_month <= train_to)]
valid = insurance_data[insurance_data.AnoMes == var_valid]
ori_pred = insurance_data[insurance_data.AnoMes == var_pred]

non_feature_columns = ['Resultado', 'AnoMes']

x_train = train.drop(non_feature_columns, axis=1)
y_train = train[['Resultado']]

x_valid = valid.drop(non_feature_columns, axis=1)
y_valid = valid[['Resultado']]

x_ori_pred = ori_pred.drop(non_feature_columns, axis=1)

# xgboost DMatrix inputs (the prediction matrix carries no label)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
d_test = xgb.DMatrix(x_ori_pred)

logger.info("Finish data processing")

## Treinamento

In [None]:
logger.info("Start data training")
# Parameters
xgb_params = {
    'eta': 0.3,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    # Parameter names must match exactly: a key with a stray trailing space
    # is not recognised by xgboost as 'colsample_bytree'.
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 1,
    'scale_pos_weight': 1,
    'max_depth': 6,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 99,
    # NOTE(review): newer xgboost releases use 'verbosity' instead of
    # 'silent' — confirm against the installed version.
    'silent': True,
}

# Model training
# Both sets are evaluated every round; progress is printed every 50 rounds.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# Keyword arguments are used throughout because recent xgboost releases
# deprecate passing `evals` positionally.
model = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=1000,
    evals=watchlist,
    maximize=False,
    verbose_eval=50,
    early_stopping_rounds=10,
)
logger.info("Finish data training")

## Validação

In [None]:
logger.info("Start data validation")
model_pred = model.predict(d_valid)

# Round the model outputs to 0/1 class labels. labels=[0, 1] pins the matrix
# to 2x2 even if one class is absent, so the tick labels below always line
# up with the right rows and columns.
pred_labels = model_pred.round().astype(int)
cm = confusion_matrix(y_valid, pred_labels, labels=[0, 1])

# The figure size has to be given when the figure is created; assigning a
# `figsize` attribute on an existing Axes has no effect.
cm_fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(pd.DataFrame(cm), linewidths=0, cmap="YlGnBu", annot=True, ax=ax, fmt='g')

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['NaoRenova', 'Renova'])
ax.yaxis.set_ticklabels(['NaoRenova', 'Renova']);

In [None]:
# display stats metrics
from pandas_ml import ConfusionMatrix
cm = ConfusionMatrix(list(y_valid['Resultado']), list(model_pred.round()))
print("Class statistics: ")
cm.print_stats()

In [None]:
# Mark the end of the validation section in the log
logger.info("Finish data validation")

## Predição

In [None]:
logger.info("Start prediction")
# Saving the prediction output
import os

# Work on an independent copy so the new columns are not written to a slice
# of the original dataset.
output_pred = output_pred.copy()

# Score once and derive both columns from the same result
pred_proba = model.predict(d_test)
output_pred['prediction%'] = pred_proba
output_pred['prediction'] = pred_proba.round()
display(output_pred.head())

output_dir = "../output"
output_file = os.path.join(
    output_dir, 'output_predicted_' + var_pred.replace("/", "") + '.csv'
)

try:
    # Create the target folder if needed so a missing directory does not
    # make the export fail.
    os.makedirs(output_dir, exist_ok=True)
    output_pred.to_csv(output_file, sep='|', encoding='utf-8', index=False)
except OSError:
    # Only I/O failures are handled here; the traceback is logged so the
    # cause is not lost.
    logger.exception("Failed to save the output")
    print("Failed to save the output")
else:
    print("File saved!")

logger.info("Finish prediction")

## Feature importance

In [None]:
#Plotting feature importance
# Plot the feature importance of the trained model using xgboost's helper
xgb.plot_importance(model)