## Import Libraries

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline
import pandas as pd 
from tabulate import tabulate

In [None]:
# Monthly snapshots: suffix 1 is read from the month-3 file (the only training
# file whose name mentions the target), suffix 2 from month 2, suffix 3 from month 1.
train1, train2, train3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_month_3_with_target.csv',
        'train_month_2.csv',
        'train_month_1.csv',
    )
)
test1, test2, test3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'test_month_3.csv',
        'test_month_2.csv',
        'test_month_1.csv',
    )
)

## Dataset type info

In [None]:
# A notebook cell renders only its final expression, so each summary is handed
# to display() explicitly; otherwise describe() and isna().any() are computed
# and silently discarded. display() is provided by the IPython kernel.
#train1.info()
display(train1.describe())    # summary statistics of the numeric columns
display(train1.isna().any())  # per column: does it contain any missing value?
display(train1.dtypes)        # per column: storage dtype

In [None]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None


In [None]:
# Convert the three date-like columns to datetime so the .dt accessor works on them later.
for date_col in ('customer_since_all', 'customer_since_bank', 'customer_birth_date'):
    train1[date_col] = pd.to_datetime(train1[date_col])

## Continuous vs Categorical features

In [None]:
# Heuristic split: a column with at least this many distinct values (NaN counts
# as one value) is treated as continuous, anything below it as categorical.
CONTINUOUS_MIN_UNIQUE = 20

cat_vars = []
cont_vars = []
# client_id is an identifier, not a feature, so it is excluded from the split.
# NOTE(review): every other column is classified, including 'target' and the
# datetime columns -- confirm that is intended.
columns = list(train1.drop(columns=['client_id']).columns)
for col_name in columns:
    n_distinct = train1[col_name].nunique(dropna=False)
    if n_distinct >= CONTINUOUS_MIN_UNIQUE:
        cont_vars.append(col_name)
    else:
        cat_vars.append(col_name)

print('Continuous variables are:', cont_vars)
print('------------------')
print('Categorical variables are:', cat_vars)
# cont_vars / cat_vars now hold the continuous and categorical column names.

## Categorical features unique values

In [None]:
# Sanity check: list the distinct values of every column classified as categorical.
for cat_col in cat_vars:
    distinct_values = train1[cat_col].unique()
    print(cat_col, distinct_values)

## Columns with missing data

In [None]:
# Count missing values per column and express them as a percentage of all rows.
null_counts = train1.isnull().sum(axis=0)
missing_df = null_counts.rename_axis('variable').reset_index(name='missing values')
missing_df['%'] = missing_df['missing values'] / train1.shape[0] * 100

# Keep only the columns that actually have gaps, least affected first.
has_gaps = missing_df['missing values'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('%')
missing_df

## Data Cleaning

## Approach 1: Understand how to deal with variables with large amount of missing values

We will explore the following columns to see how can we deal with the missing values:
-    customer_education
-    customer_occupation_code
-    customer_relationship
-    customer_children



In [None]:
# Restrict to the variables with more than 250 missing entries and show their distinct values.
heavily_missing = missing_df['missing values'] > 250
missing_df2 = missing_df.loc[heavily_missing]
for sparse_col in missing_df2['variable']:
    print(sparse_col, train1[sparse_col].unique())

### customer_education

In [None]:
# Row counts per education level, split by target value. An explicit Axes lets
# the figure carry a title and labels so it can be read on its own.
fig, ax = plt.subplots()
sns.countplot(x='customer_education', data=train1, hue='target', palette="coolwarm_r", ax=ax)
ax.set(
    title='Customers per education level, split by target',
    xlabel='customer_education',
    ylabel='Number of customers',
);  # trailing ';' hides the return value's repr in the cell output

### customer_occupation_code 

-   Most clients have a job code equal to 9.
-   From the scatter plot below, we can't see a direct relationship between job code and current balance.\
-> The customer_occupation_code feature will be dropped.

In [None]:
# Number of rows for each occupation code 0..9.
occupation_code = train1['customer_occupation_code']
for job_code in range(10):
    n_rows = int((occupation_code == job_code).sum())
    print(f'job_code {job_code}', n_rows)

In [None]:
# Row counts per occupation code, split by target value, on an explicit Axes
# so the figure has a title and labels.
fig, ax = plt.subplots()
sns.countplot(x='customer_occupation_code', data=train1, hue='target', palette="coolwarm_r", ax=ax)
ax.set(
    title='Customers per occupation code, split by target',
    xlabel='customer_occupation_code',
    ylabel='Number of customers',
);  # trailing ';' hides the return value's repr in the cell output

In [None]:
# Current-account balance against occupation code; labelled so the figure can
# be read without the surrounding code.
fig, ax = plt.subplots()
ax.scatter(train1['customer_occupation_code'], train1['bal_current_account'])
ax.set(
    title='Current account balance by occupation code',
    xlabel='customer_occupation_code',
    ylabel='bal_current_account',
);  # trailing ';' hides the return value's repr in the cell output

### customer_relationship 

Among the non-missing data, the table below shows, for single customers and for couples, the proportion that churned (target = 1) versus stayed (target = 0).
The proportion of churners is approximately the same whether the customer is in a couple or single.

In [None]:
# Boolean masks reused for every cell of the table.
is_couple = train1['customer_relationship'] == 'couple'
is_single = train1['customer_relationship'] == 'single'
target_is_0 = train1['target'] == 0
target_is_1 = train1['target'] == 1

couple_0 = int((is_couple & target_is_0).sum())
couple_1 = int((is_couple & target_is_1).sum())
single_0 = int((is_single & target_is_0).sum())
single_1 = int((is_single & target_is_1).sum())

# Denominators: rows of each status whose target is 0 or 1.
a = couple_0 + couple_1
b = single_0 + single_1

# One row per status: "<row count> <label>", share with target 0, share with target 1.
table = [
    ['Relationship', '0', '1'],
    [f'{int(is_couple.sum())} Couple', round(couple_0 / a, 2), round(couple_1 / a, 2)],
    [f'{int(is_single.sum())} Single', round(single_0 / b, 2), round(single_1 / b, 2)],
]
print(tabulate(table, headers='firstrow'))

### customer_children

In [None]:
# Collapse the children categories into a binary flag: 'no' -> 0, every other listed label -> 1.
rep = {
    label: (0 if label == 'no' else 1)
    for label in ('mature', 'no', 'young', 'preschool', 'adolescent', 'grownup', 'onebaby', 'yes')
}
train1['customer_children'] = train1['customer_children'].replace(to_replace=rep)

In [None]:
# Boolean masks reused for every cell of the table (customer_children is the 0/1 flag here).
no_kids = train1['customer_children'] == 0
with_kids = train1['customer_children'] == 1
target_is_0 = train1['target'] == 0
target_is_1 = train1['target'] == 1

no_kids_0 = int((no_kids & target_is_0).sum())
no_kids_1 = int((no_kids & target_is_1).sum())
with_kids_0 = int((with_kids & target_is_0).sum())
with_kids_1 = int((with_kids & target_is_1).sum())

# Denominators: rows of each group whose target is 0 or 1.
c = no_kids_0 + no_kids_1
d = with_kids_0 + with_kids_1

# One row per group: "<row count> <label>", share with target 0, share with target 1.
table = [
    ['Children', '0', '1'],
    [f'{int(no_kids.sum())} Without kids', round(no_kids_0 / c, 2), round(no_kids_1 / c, 2)],
    [f'{int(with_kids.sum())} With kids', round(with_kids_0 / d, 2), round(with_kids_1 / d, 2)],
]
print(tabulate(table, headers='firstrow'))

In [None]:
# Row counts per customer_children value, split by target value, on an explicit
# Axes so the figure has a title and labels.
fig, ax = plt.subplots()
sns.countplot(x='customer_children', data=train1, hue='target', palette="coolwarm_r", ax=ax)
ax.set(
    title='Customers by customer_children, split by target',
    xlabel='customer_children',
    ylabel='Number of customers',
);  # trailing ';' hides the return value's repr in the cell output

- People without active home banking tend to churn less. Is there a difference between that and has_homebanking?
- Insurance 21 or 23: if they don't have it, then they don't churn; no one who has it churned?
- Fixed-capital insurance: a very low number of people have it.
- Decreasing-capital insurance seems interesting.
- Fire and car insurance look interesting.
- Not many people have a personal loan; the same goes for a mortgage.
- The current account seems to have an effect on churn.
- No churn if the customer has pension savings.
- Gender seems to have a low effect on churn.
- More churn if occupation code = 9.
- Having children might have an effect, but that variable has to be reformatted first.
- People in a couple seem to churn more.

## Approach 2: Delete 4 columns 

In [None]:
# Remove the four columns examined above together with the client identifier.
dropped_columns = [
    'customer_children',
    'customer_relationship',
    'customer_occupation_code',
    'customer_education',
    'client_id',
]
train1.drop(columns=dropped_columns, inplace=True)

In [None]:
# (rows, columns) of train1 at this point in the notebook.
train1.shape

In [None]:
# Drop every row that still contains at least one missing value, then show the new size.
train1.dropna(axis='index', how='any', inplace=True)
train1.shape

## Handling Date Columns

 - customer_since_all
 - customer_since_bank
 - customer_birth_date

We will not strip the day from 'customer_since_all', 'customer_since_bank' because all have the same value = 01. 

In [None]:
# Read the year/month components straight from the datetime accessor instead of
# formatting every timestamp to text and parsing the text back to a number.
# astype(int) keeps the same final cast the string round-trip ended with.
train1['Birth_year'] = train1['customer_birth_date'].dt.year.astype(int)

train1['Year_since_all'] = train1['customer_since_all'].dt.year.astype(int)
train1['Month_since_all'] = train1['customer_since_all'].dt.month.astype(int)

train1['Year_since_bank'] = train1['customer_since_bank'].dt.year.astype(int)
train1['Month_since_bank'] = train1['customer_since_bank'].dt.month.astype(int)


In [None]:
# Remove the three original date columns from the frame.
date_columns = ['customer_since_all', 'customer_since_bank', 'customer_birth_date']
train1.drop(columns=date_columns, inplace=True)

In [None]:
# Remove the rows whose birth year is earlier than 1919.
born_before_1919 = train1['Birth_year'] < 1919
train1.drop(index=train1.index[born_before_1919], inplace=True)

In [None]:
# (rows, columns) of train1 after the Birth_year filter.
train1.shape

In [None]:
# Preview the first rows only; rendering the whole frame bloats the notebook output.
train1.head()

## Dealing with customer relationship and children missing data  

In [None]:
# 'customer_relationship' and 'customer_children' were dropped from train1 in the
# "Approach 2" cell, so plotting them from train1 fails on a fresh Run All.
# Re-read just these columns (raw, un-recoded, missing values included) from the
# month-3 training file instead.
family_raw = pd.read_csv(
    'train_month_3_with_target.csv',
    usecols=['customer_relationship', 'customer_children', 'target'],
)

# Each plot gets its own Axes; two countplot calls on the implicit current Axes
# would draw one set of bars on top of the other.
fig, (ax_relationship, ax_children) = plt.subplots(1, 2, figsize=(14, 5))

sns.countplot(x='customer_relationship', data=family_raw, hue='target',
              palette="coolwarm_r", ax=ax_relationship)
ax_relationship.set(title='Customers by relationship status, split by target',
                    ylabel='Number of customers')

sns.countplot(x='customer_children', data=family_raw, hue='target',
              palette="coolwarm_r", ax=ax_children)
ax_children.set(title='Customers by children category, split by target',
                ylabel='Number of customers')
ax_children.tick_params(axis='x', rotation=45)  # category labels are long

fig.tight_layout();  # trailing ';' hides the return value's repr in the cell output

## feature engineering 
- We could check whether there is a significant (large) change in, say, balance or savings between months, as this could be an indicator that the client is about to churn (it can simply be stored as a boolean).
- We can also compute a boolean for a change in the services the client holds, e.g. the client dropped insurance 21 last month.


## regarding modeling
the most used techniques are logistic regression, random forest, xgboost. People usually use random forest as a baseline reference and then they tune it or try other models to see if they can beat the random forest. 

## STEPS REQUIRED
## A-Identify Business Problems
- What is the likelihood of an active customer to leave the bank?
- What are the key indicators of a customer churn?
- What retention strategies can be implemented based on results to diminish prospective customer churn?




## STEPS REQUIRED
## A-Identify Business Problems
- What is the likelihood of an active customer to leave the bank?
- What are the key indicators of a customer churn?
- What retention strategies can be implemented based on results to diminish prospective customer churn?




## C- Data Evaluation
- Plot Charts and Histograms
- Analyze the distribution of categorical variables
- Analyze the churn rate by variable
- Find correlations between variables
- Making a conclusion from the above findings
- Split the dataset in training and validation sets
- Split the dependent variable from the independent variables.
- Remove Identifier: Customer ID
- Normalize Variables

## D- Model Selection
Suggested models:
- Logistic Regression
- Random Forest
- KNN
- SVM
- XGBoost

## E- Model Evaluation


## F- Model Improvement
Tuning Hyperparameters

## G- Model Deployment

## Resources