### Mount Google Drive in Google Colab

In [0]:
# Mount Google Drive at /content/drive; the data-lake paths defined later in
# this notebook all live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

### import libraries

In [0]:
# standard library: logging setup, filesystem helpers, warning control
import logging
import os
import sys
import warnings
from tkinter import font

# data handling
import numpy as np
import pandas as pd

# machine-learning building blocks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# model persistence: prefer the standalone joblib package, because
# sklearn.externals.joblib was removed in scikit-learn 0.23; the old import
# is kept as a fallback for legacy environments
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# plotting (seaborn is used to render the confusion matrix)
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import plotly.offline as py
import seaborn

# silence every warning message to keep the notebook output clean
warnings.filterwarnings('ignore')

# pandas display options: never truncate cell contents and show at most 10 rows.
# `None` replaces the legacy `-1` sentinel, which was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 10)

### Logging configuration

In [0]:
# Root logging: INFO and above go to stdout as "timestamp;level;message".
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s;%(levelname)s;%(message)s',
)

# Named logger for this notebook.
logger = logging.getLogger('Text_process')

### Directory map

In [0]:
# Root of the data lake on the mounted Drive; every layer is a direct sub-folder.
datalake = "/content/drive/My Drive/datasets/app_bank/datalake"

# Data layers
transient = datalake + "/transient"
raw = datalake + "/raw"
trusted = datalake + "/trusted"
refined = datalake + "/refined"
rules = datalake + "/rules"

# Output folders: persisted models and saved files
model_dir = datalake + '/model_dir'
save_fil = datalake + '/save_file'

### Functions of transformation

In [0]:
# Function that turns a text column into features; in this case the `job` column is transformed into features for the logistic regression model
def transformVectorize(df,x,categories):
    """Vectorize one categorical text column with TF-IDF and return it with the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both the feature column and the label column.
    x : str
        Name of the column to vectorize (the notebook passes ``'job'``).
    categories : str
        Name of the column holding the class labels (the notebook passes ``'loan'``).

    Returns
    -------
    features : sparse matrix
        TF-IDF matrix produced by ``TfidfVectorizer.fit_transform``; one row per
        row of ``df`` and one column per distinct value found in ``df[x]``.
    df_categories : numpy.ndarray
        ``df[categories].values``, in the same row order as ``features``.
    """
    df_features = df[x].values
    df_categories = df[categories].values

    # Every cell holds ONE categorical value, so the tokenizer must return it as
    # a single token. Returning the raw string (the former identity tokenizer)
    # made scikit-learn iterate over it character by character, which produced
    # per-letter features instead of per-value features.
    vectorizer = TfidfVectorizer(tokenizer=lambda doc: [doc], lowercase=False)
    features = vectorizer.fit_transform(df_features)

    return features,df_categories

# A logistic regression model is used because I prefer to work with categorical variables
def modelLogistcRegression(df,fet,category,filename):
    """Fit a logistic regression on a 70/30 split and save the fitted model to disk.

    Parameters
    ----------
    df : object
        Not used by this function; kept so existing calls keep working.
    fet : array-like or sparse matrix
        Feature matrix, one row per sample.
    category : array-like
        Class label for each row of ``fet``.
    filename : str
        Suffix of the saved file, which is written to
        ``{model_dir}/model_pip_app_bk_{filename}.joblib``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, model)`` where the first four items
        come from ``train_test_split`` and ``model`` is the fitted classifier.
    """
    # Fixed random_state keeps the split reproducible between runs.
    x_train,x_test,y_train,y_test = train_test_split(fet,category,test_size=0.3, random_state=42)

    classifer = LogisticRegression(n_jobs=1,class_weight='balanced', solver='lbfgs',max_iter=30,multi_class='ovr')
    model = classifer.fit(x_train,y_train)

    # Create the target folder first: joblib.dump does not create missing
    # directories, so a fresh data lake would otherwise fail only after the
    # model has already been trained.
    os.makedirs(model_dir, exist_ok=True)
    file_model = f'{model_dir}/model_pip_app_bk_{filename}.joblib'
    joblib.dump(model, file_model)

    return x_train,x_test,y_train,y_test,model

# Recover the logistic regression model from its saved file, because loading it is faster
def recoveryLogistcRegression(filename):
    """Load and return the model stored as ``{model_dir}/model_pip_app_bk_{filename}.joblib``."""
    return joblib.load(f'{model_dir}/model_pip_app_bk_{filename}.joblib')

# Confusion matrix to evaluate the precision of the model
def confusionMatrix(x,y,model):
    """Draw the confusion matrix of a fitted classifier as an annotated heatmap.

    Parameters
    ----------
    x : array-like
        True labels (shown on the 'Actual' axis).
    y : array-like
        Predicted labels (shown on the 'Predicted' axis).
    model : fitted estimator
        Must expose ``classes_``; it supplies both the label order of the
        matrix and the tick labels.
    """
    # Pin the row/column order to model.classes_ so the matrix always lines up
    # with the tick labels, even when a class is absent from both x and y.
    conf_mat = confusion_matrix(x, y, labels=model.classes_)

    fig, ax = plt.subplots(figsize=(6,6))
    # Draw on the axes created above instead of relying on pyplot's implicit state.
    seaborn.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
                    xticklabels=model.classes_,
                    yticklabels=model.classes_,
                    ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title("CONFUSION MATRIX - Logistic Regression \n", size=12)

### Load from CSV dataset

In [0]:
df = pd.read_csv(f'{transient}/bank-full.csv',sep=';')
df.head()

In [0]:
df.info()

In [0]:
df_jobs = df.groupby(['job']).count()
df_jobs

In [0]:
print(df.job.value_counts())

#### 1) Qual profissão tem mais tendência a fazer um empréstimo? De qual tipo?

In [0]:
# select distinct values from column job
print(df.job.nunique())

### Creating the Logistic Regression machine learning model

##### Transform jobs into features for the ML model

In [0]:
features,category = transformVectorize(df,'job','loan')
features.shape
features[:3].toarray()

##### Train the model using loan as the categories and jobs as the features (then recover the model file)

##### The precision for the 'yes' category is low but its recall is high; more details can be seen in the confusion matrix

In [0]:
# Train on a 70/30 split; the fitted model is also saved under the 'supervisioned' suffix.
x_train,x_test,y_train,y_test,model = modelLogistcRegression(df,features,category,'supervisioned')
# Reload the saved model from disk and use that copy for the predictions.
model_loader = recoveryLogistcRegression('supervisioned')
# Predicted labels for the held-out test rows, in the row order of x_test.
a_pred = model_loader.predict(x_test)
# Precision / recall / F1 of the predictions against the true test labels.
print(classification_report(y_test,a_pred))

##### Evaluate the model using a confusion matrix

##### Evaluating the model, we see that it predicts incorrectly quite often: the number of false positives is very large, while the number of false negatives is lower. The model needs to be calibrated before its predictions are used

In [0]:
confusionMatrix(y_test,a_pred,model)

### Prediction in dataset (real world)

#### Using the 13k predictions of the trained model and applying them to the dataset

In [0]:
pred = np.array(a_pred)
print(len(pred))

In [0]:
# a_pred holds predictions for x_test, a SHUFFLED 30% sample of df, so it does
# not belong to the first len(a_pred) rows of df. Splitting the row positions
# with the same test_size / random_state used during training reproduces that
# split and returns, in prediction order, the row each prediction was made for.
train_rows, test_rows = train_test_split(np.arange(len(df)), test_size=0.3, random_state=42)
assert len(test_rows) == len(a_pred), "recovered test split does not match the predictions"

# .copy() gives an independent frame, so adding a column never writes into df.
df_novo = df.iloc[test_rows].copy()
df_novo['predicao'] = a_pred

##### Display the predictions in the dataset

In [0]:
df_novo.head(200)

##### Counting jobs per prediction in separate datasets, because I prefer to visualize the results separately

In [0]:
# Rows predicted 'yes', counted per job, most frequent job first.
df_sim = (
    df_novo[df_novo['predicao'] == 'yes']
    .groupby('job')['job']
    .count()
    .sort_values(ascending=False)
)
df_sim.head(200)

In [0]:
# Rows predicted 'no', counted per job, most frequent job first.
df_sim = (
    df_novo[df_novo['predicao'] == 'no']
    .groupby('job')['job']
    .count()
    .sort_values(ascending=False)
)
df_sim.head(200)

##### Percentage of loan predictions

In [0]:
# Share of each predicted class among all predictions.
df_n = df_novo.groupby(['predicao'])['predicao'].count()
# Plain vectorized division: the former transform(lambda x: x/sum(x)) only
# produced this result through pandas' fallback after the element-wise call failed.
df_n = df_n / df_n.sum()
df_n.head()

##### Percentage of predictions per job; in this case blue-collar customers are the most likely to take out a new loan with the bank

In [0]:
# Share of rows per job in df_novo, largest first.
# NOTE(review): this counts every row regardless of the value in 'predicao', so it
# measures how common each job is rather than how often it is predicted 'yes' — confirm the intent.
df_m = df_novo.groupby(['job'])['job'].count()
# Plain vectorized division instead of transform(lambda x: x/sum(x)), which only
# worked through pandas' fallback after the element-wise call failed.
(df_m / df_m.sum()).sort_values(ascending=False).head()

#### 2) Fazendo uma relação entre número de contatos e sucesso da campanha quais são os pontos relevantes a serem observados?

#### Answers

##### Looking at the correlation factors and using the heatmap, the colors show that `previous` (contacts made before this campaign) has the most influence on the result

In [0]:
# Spearman correlation heatmap for rows whose previous campaign outcome was 'success'.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.loc[df['poutcome'] == 'success']
    .select_dtypes(include='number')
    .corr(method='spearman')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
df['campaign'].loc[(df['poutcome']=='success')].corr(df['previous'],method='spearman')

#### 3) Baseando-se nos resultados de adesão desta campanha qual o número médio e o máximo de ligações que você indica para otimizar a adesão ?

#### Answer

##### According to the default descriptive metrics, the poutcome values 'success' and 'other' show potential to maximize adhesion, because their minimum and maximum numbers of contacts are not large (mean = 1.8 for success and 2.46 for other, min = 1.0 for both, and max between 11.0 and 16.0)

In [0]:
df.groupby(['poutcome'])['campaign'].describe().head()

#### 4) O resultado da campanha anterior tem relevância na campanha atual?

#### Answer

#### The result is positive: according to the heatmap, `previous` and `pdays` stand out and can change the current scenario

In [0]:
# Spearman correlation heatmap over the whole dataset.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.select_dtypes(include='number')
    .corr(method='spearman')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
df['previous'].corr(df['pdays'])

#### 5) Qual o fator determinante para que o banco exija um seguro de crédito?

#### Answer

##### We observed that age and balance carry the most weight as the predominant factor, and no significant changes occur regarding the campaign

In [0]:
# Pearson correlation heatmap over the whole dataset.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
# Pearson correlation heatmap for customers with neither a housing loan nor a personal loan.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.loc[(df['housing'] == 'no') & (df['loan'] == 'no')]
    .select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
# Pearson correlation heatmap for customers with both a housing loan and a personal loan.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.loc[(df['housing'] == 'yes') & (df['loan'] == 'yes')]
    .select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

#### 6) Quais são as características mais proeminentes de um cliente que possua empréstimo imobiliário?

#### Answer

##### According to the heatmap, the factors age and balance have the most relevance

In [0]:
# Customers aged 40 or older. The boundary is inclusive so that, together with
# the age < 40 group, no customer is left out (age == 40 used to fall in neither
# group) and so it matches the age >= 40 / age < 40 correlation split used later.
df_old = df[df['age'] >= 40]
df_old.head()

In [0]:
df_young = df[(df['age'] < 40)]
df_young.head()

In [0]:
# Sum of balance for every distinct (age, balance) pair in df_old.
# NOTE(review): because balance is also a grouping key, each result is just that
# balance times its number of occurrences; a per-age total would group by 'age'
# only — confirm which one is intended.
df_old[['age','balance']].groupby(['age','balance'])['balance'].sum()

In [0]:
# Pearson correlation heatmap over the whole dataset.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
# Pearson correlation heatmap for customers aged 40 or older.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.loc[df['age'] >= 40]
    .select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

In [0]:
# Pearson correlation heatmap for customers younger than 40.
# Numeric columns are selected explicitly: since pandas 2.0, corr() raises on
# text columns instead of silently dropping them.
(
    df.loc[df['age'] < 40]
    .select_dtypes(include='number')
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)