# 00 - Preliminary 

In [None]:
cd ..

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

from sklearn.compose import * 
from sklearn.preprocessing import *
from sklearn.feature_extraction import * 
from sklearn.feature_selection import *

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as Pipeline


import plotly.express as px


In [None]:
# Load the dataset from a CSV path relative to the current working directory.
data = pd.read_csv('data/data.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
data

In [None]:
# Keep only the first 20 columns (positional slice). Later cells read 'TARGET',
# 'AMT_CREDIT' and 'AMT_GOODS_PRICE', so those columns must be among them.
data = data.iloc[:, :20]
data

In [None]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric columns: one-hot encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Dispatch each column to one of the two sub-pipelines according to its dtype.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, make_column_selector(dtype_include=np.number)),
        ("cat", categorical_transformer, make_column_selector(dtype_exclude=np.number)),
    ]
)

In [None]:
# Features: every kept column except the label.
X = data.drop(columns=['TARGET'])
# Label: the 'TARGET' column.
y = data['TARGET']

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the held-out test features.
X_test

In [None]:
# Modelling pipeline: under-sample, preprocess by dtype, impute, then classify.
# `Pipeline` is the imblearn one here (imported last), which accepts a sampler step.
pipeline = Pipeline(
    [
        # Seeded so the random under-sampling is repeatable from run to run,
        # consistent with the random_state=42 used for the train/test split.
        ('sampler', RandomUnderSampler(random_state=42)),
        ("transformer", preprocessor),
        # Second median-imputation pass over the transformed matrix.
        ('imputer', SimpleImputer(strategy='median')),
        ('estimator', LogisticRegression()),
    ]
)

In [None]:
# Cross-validate the pipeline with 5 folds, scored with F1.
# NOTE(review): param_grid is empty, so presumably only the pipeline's default
# configuration is evaluated - confirm this is intended.
grid = GridSearchCV(
    pipeline,
    param_grid={},
    cv=5,
    scoring='f1',
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)
grid.fit(X_train, y_train)

In [None]:
def resultize(grid):
    """Summarize the best cross-validation candidates of a fitted search.

    Args:
        grid: Object exposing a ``cv_results_`` mapping that contains a
            ``rank_test_score`` entry (e.g. a fitted ``GridSearchCV``).

    Returns:
        A DataFrame with at most the 5 best-ranked candidates, without the
        columns whose name contains "split", rounded to 2 decimals.
    """
    ranked = pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')
    best = ranked.head(5)
    # Per-fold columns all carry "split" in their name; keep the rest.
    kept = [name for name in best.columns if "split" not in name]
    return best.loc[:, kept].round(2)

In [None]:
# Show the cross-validation summary of the fitted search.
resultize(grid)

## 01 First approach

In [None]:
def calcul_pnl_one_easy(montant, y_true, y_pred, taux=0.04, default_rate=1):
    """Return the profit or loss of one loan given a hard prediction.

    Args:
        montant: Loan amount.
        y_true: Truthy when the loan actually turned out to be a loss case.
        y_pred: Truthy when the model flags the loan; a flagged loan earns
            and loses nothing.
        taux: Fraction of the amount earned on a non-flagged loan that is
            not a loss case.
        default_rate: Fraction of the amount lost on a non-flagged loan
            that is a loss case.

    Returns:
        0 when ``y_pred`` is truthy, ``-default_rate * montant`` when
        ``y_true`` is truthy, otherwise ``montant * taux``.
    """
    if not y_pred:
        # Loan not flagged: either the loss is taken or the interest is earned.
        return -default_rate * montant if y_true else montant * taux

    return 0

In [None]:
# y_true=0, y_pred=0: reaches the interest branch (100 * 0.04).
calcul_pnl_one_easy(100, 0, 0)

In [None]:
# y_true=0, y_pred=1: flagged loan, returns 0.
calcul_pnl_one_easy(100, 0, 1)

In [None]:
# y_true=1, y_pred=0: reaches the loss branch (-1 * 100).
calcul_pnl_one_easy(100, 1, 0)

In [None]:
# y_true=1, y_pred=1: flagged loan, returns 0.
calcul_pnl_one_easy(100, 1, 1)

In [None]:
# Class predictions of the fitted search on the test features.
y_pred_list = grid.predict(X_test)
y_pred_list 

In [None]:
# True labels of the test set as a plain array, aligned with y_pred_list.
y_true_list = y_test.values
y_true_list

In [None]:
# Credit amount of each test row, used as the loan amount in the PnL functions.
montant_list = X_test.loc[:, 'AMT_CREDIT'].values
montant_list

In [None]:
# NOTE(review): scratch values; no other visible cell references l1 or l2.
l1 = ["a", "b"]
l2 = [0 ,1]

In [None]:
# Total PnL over the test set using the hard predictions, one loan at a time.
v = 0
for m, y_true, y_pred in zip(montant_list, y_true_list, y_pred_list):
    v += calcul_pnl_one_easy(m, y_true=y_true, y_pred=y_pred)
# Shown in millions, rounded to the nearest integer.
round(v/1_000_000) 

In [None]:
# Sum of AMT_CREDIT over the test set, in millions.
sum(montant_list/1_000_000)

## 02 predict proba

In [None]:
# Column 0 of predict_proba for each test row, rounded to 4 decimals.
# NOTE(review): presumably the probability of class 0 (TARGET == 0) - confirm
# against grid.classes_.
y_pred_proba_list = [round(p, 4) for p in grid.predict_proba(X_test)[:, 0]]
y_pred_proba_list[:10]

In [None]:
def calcul_pnl_one_complexe(montant, 
                            y_true, 
                            y_pred_proba, 
                            taux=0.04, 
                            default_rate=1, 
                            threshold=0.5, ):
    """Return the profit or loss of one loan given a score and a threshold.

    The loan is kept only when ``y_pred_proba`` is strictly greater than
    ``threshold``; otherwise it earns and loses nothing.

    Args:
        montant: Loan amount.
        y_true: Truthy when the loan actually turned out to be a loss case.
        y_pred_proba: Score compared against ``threshold``.
        taux: Fraction of the amount earned on a kept loan that is not a
            loss case.
        default_rate: Fraction of the amount lost on a kept loan that is a
            loss case.
        threshold: Score above which the loan is kept.

    Returns:
        0 when the score does not exceed the threshold,
        ``-default_rate * montant`` when ``y_true`` is truthy, otherwise
        ``montant * taux``.
    """
    kept = y_pred_proba > threshold

    if not kept:
        return 0

    if y_true:
        return -default_rate * montant

    return montant * taux

In [None]:
# Total PnL over the test set with the default threshold (0.5), in millions.
v = 0
for m, y_true, y_pred_proba in zip(montant_list, y_true_list, y_pred_proba_list):
    v += calcul_pnl_one_complexe(m, y_true=y_true, y_pred_proba=y_pred_proba)
round(v/1_000_000) 

In [None]:
# Threshold 0.0: every loan whose score is strictly positive is kept.
v = 0
for m, y_true, y_pred_proba in zip(montant_list, y_true_list, y_pred_proba_list):
    v += calcul_pnl_one_complexe(m, y_true=y_true, y_pred_proba=y_pred_proba,threshold=0.0)
round(v/1_000_000) 

In [None]:
# Threshold 1: a loan is kept only if its score is strictly above 1.
v = 0
for m, y_true, y_pred_proba in zip(montant_list, y_true_list, y_pred_proba_list):
    v += calcul_pnl_one_complexe(m, y_true=y_true, y_pred_proba=y_pred_proba,threshold=1)
round(v/1_000_000) 

In [None]:
# Sweep 100 evenly spaced thresholds in [0, 1] and record the total PnL for each.
threshold_list = np.linspace(0, 1, 100)
pnl_list = []

# Scenario parameters passed to every per-loan computation below.
taux = 0.04
default_rate = 1 
for threshold in threshold_list:
    # Total PnL over the whole test set at this threshold.
    v = 0
    for m, y_true, y_pred_proba in zip(montant_list, y_true_list, y_pred_proba_list):
        v += calcul_pnl_one_complexe(m, y_true=y_true, y_pred_proba=y_pred_proba,threshold=threshold, default_rate=default_rate, taux=taux  )
    pnl_list.append(v)

In [None]:
# Matplotlib view of total PnL as a function of the threshold.
plt.plot(threshold_list, pnl_list)

In [None]:


# Same curve drawn with plotly express.
px.line(x=threshold_list, y=pnl_list)

In [None]:
# One row per threshold, ordered from the highest total PnL to the lowest.
pnl_df = pd.DataFrame({'threshold': threshold_list, 'pnl': pnl_list}).sort_values(by='pnl', ascending=False)
pnl_df

In [None]:
# Ten best thresholds; pnl_df is already sorted this way, so the sort is repeated here.
pnl_df.sort_values(by='pnl', ascending=False).head(10)

In [None]:
def analyse_pnl(taux = 0.04, default_rate = 1 ):
    """Plot the total test-set PnL against the decision threshold.

    Reads the notebook-level ``montant_list``, ``y_true_list`` and
    ``y_pred_proba_list`` and evaluates ``calcul_pnl_one_complexe`` on every
    loan for 100 evenly spaced thresholds in [0, 1].

    Args:
        taux: Forwarded to ``calcul_pnl_one_complexe`` as ``taux``.
        default_rate: Forwarded to ``calcul_pnl_one_complexe`` as
            ``default_rate``.

    Returns:
        None. The plotly line figure is shown as a side effect.
    """

    def _total_pnl(threshold):
        # Accumulate the per-loan PnL over the whole test set for one threshold.
        total = 0
        for montant, label, proba in zip(montant_list, y_true_list, y_pred_proba_list):
            total += calcul_pnl_one_complexe(
                montant,
                y_true=label,
                y_pred_proba=proba,
                threshold=threshold,
                default_rate=default_rate,
                taux=taux,
            )
        return total

    thresholds = np.linspace(0, 1, 100)
    totals = [_total_pnl(threshold) for threshold in thresholds]

    px.line(x=thresholds, y=totals).show()

In [None]:
# Scenario: taux = 0.04, default_rate = 1 (the function defaults, passed explicitly).
analyse_pnl(taux=0.04, default_rate=1)

In [None]:
# Scenario: taux = 0.04, only 25% of the amount is lost on a loss case.
analyse_pnl(0.04, 0.25)

In [None]:
# only 25% of the amount is lost
# interest rate = 0.025
analyse_pnl(0.025, 0.25)

In [None]:
# only 25% of the amount is lost
# interest rate = 0.03
analyse_pnl(0.03, 0.25)

## 03 With Good price

In [None]:
def calcul_pnl_one_goods(   montant, 
                            good_price,
                            y_true, 
                            y_pred_proba, 
                            taux=0.028, 
                            discount_on_goods=0.30,
                            threshold=0.5,
                            cost_evaluation=100,):
    """Return the profit or loss of one loan when the goods can be resold.

    The loan is kept only when ``y_pred_proba`` is strictly greater than
    ``threshold``. The evaluation cost is charged in every case, including
    the loss case.

    Args:
        montant: Loan amount.
        good_price: Price of the financed goods.
        y_true: Truthy when the loan actually turned out to be a loss case.
        y_pred_proba: Score compared against ``threshold``.
        taux: Fraction of the amount earned on a kept loan that is not a
            loss case.
        discount_on_goods: Fraction knocked off ``good_price`` when the
            goods are resold after a loss case.
        threshold: Score above which the loan is kept.
        cost_evaluation: Fixed cost charged for every evaluated loan.

    Returns:
        ``-cost_evaluation`` for a loan that is not kept; the resale
        shortfall (never positive) minus ``cost_evaluation`` for a kept loan
        that is a loss case; ``montant * taux - cost_evaluation`` otherwise.
    """
    kept = y_pred_proba > threshold

    if not kept:
        return -cost_evaluation

    if y_true:
        # Discounted resale value of the goods minus what was lent.
        shortfall = (good_price * (1 - discount_on_goods)) - montant
        # A resale surplus is not counted as profit, so the result is capped at 0.
        # NOTE(review): a NaN good_price makes the comparison False, so the
        # loss is silently treated as 0 - confirm how missing prices should count.
        recovery = shortfall if shortfall < 0 else 0
        # The evaluation cost was paid for this loan too, as in the other branches.
        return recovery - cost_evaluation

    return (montant * taux) - cost_evaluation

In [None]:
# List the kept column names.
data.columns

In [None]:
# Goods price of each test row, aligned with montant_list.
goods_list = X_test.loc[:, 'AMT_GOODS_PRICE'].values
goods_list[:10]

In [None]:
# Sweep 100 evenly spaced thresholds in [0, 1] with the goods-aware PnL,
# using that function's default taux, discount and evaluation cost.
threshold_list = np.linspace(0, 1, 100)
pnl_list = []

for threshold in threshold_list:
    # Total PnL over the whole test set at this threshold.
    v = 0
    for m, g, y_true, y_pred_proba in zip(montant_list, goods_list, y_true_list, y_pred_proba_list):
        v += calcul_pnl_one_goods(m, g, y_true=y_true, y_pred_proba=y_pred_proba,threshold=threshold)
    pnl_list.append(v)

In [None]:
# plotly.express is already imported as px in the notebook's import cell;
# importing it again here is harmless.
import plotly.express as px


# Total goods-aware PnL as a function of the threshold.
px.line(x=threshold_list, y=pnl_list)