In [3]:
import pandas as pd

import numpy as np

from imblearn.under_sampling import RandomUnderSampler

from pycaret import classification


# MODELISATION

## Avec les données imputées par la stratégie de la médiane et du mode

In [11]:
df = pd.read_csv('final_data_cleanned.csv')

In [12]:
df.shape

(307495, 15)

In [13]:
df = df.drop("Unnamed: 0", 1)

In [14]:
df.head(2)

Unnamed: 0,TARGET,CODE_GENDER,CNT_CHILDREN,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,DAYS_BIRTH,AMT_ANNUITY,class_amt_tot,class_amt_credit,class_amt_ANNUITY,class_age,CNT_CHILDREN_class
0,1,M,0,N,Y,202500.0,406597.5,9461,24700.5,meduim,low,low,jeune,0
1,0,F,0,N,N,270000.0,1293502.5,16765,35698.5,meduim,meduim,meduim,adulte,0


In [15]:
df_for_model = df[["CODE_GENDER", "CNT_CHILDREN_class", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
                   "class_amt_tot", "class_amt_credit", "class_amt_ANNUITY", "class_age", "TARGET"]]


## EQUILIBRAGE DES CLASSES

In [16]:
df_for_model.TARGET.value_counts()

0    282670
1     24825
Name: TARGET, dtype: int64

In [17]:
282670//24825

11

##### On a la classe 0 qui fait 11 fois la 1, d'où la pertinence de l'équilibrage des classes

In [18]:
features, target = df_for_model.drop(columns=['TARGET']), df_for_model.TARGET

In [19]:
features[:2]

Unnamed: 0,CODE_GENDER,CNT_CHILDREN_class,FLAG_OWN_CAR,FLAG_OWN_REALTY,class_amt_tot,class_amt_credit,class_amt_ANNUITY,class_age
0,M,0,N,Y,meduim,low,low,jeune
1,F,0,N,N,meduim,meduim,meduim,adulte


In [20]:
# define undersample strategy
# undersample = RandomUnderSampler(sampling_strategy='majority')
undersample = RandomUnderSampler(sampling_strategy=.25)

# fit and apply the transform
X_under, y_under = undersample.fit_resample(df_for_model.drop(columns=["TARGET"]), df_for_model.TARGET)

In [21]:
df_for_model.shape, X_under.shape

((307495, 9), (124125, 8))

In [22]:
y_under.value_counts()

0    99300
1    24825
Name: TARGET, dtype: int64

## Création, entrainement et optimisation de modèle

In [95]:
s = classification.setup(pd.concat([X_under, y_under], axis=1), target = 'TARGET')

Unnamed: 0,Description,Value
0,session_id,2611
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(124125, 9)"
5,Missing Values,False
6,Numeric Features,0
7,Categorical Features,8
8,Ordinal Features,False
9,High Cardinality Features,False


In [96]:
classification.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7996,0.5857,0.0,0.0,0.0,0.0,0.0,0.062
svm,SVM - Linear Kernel,0.7996,0.0,0.0,0.0,0.0,0.0,0.0,0.034
ridge,Ridge Classifier,0.7996,0.0,0.0,0.0,0.0,0.0,0.0,0.021
ada,Ada Boost Classifier,0.7996,0.5857,0.0,0.0,0.0,0.0,0.0,0.193
gbc,Gradient Boosting Classifier,0.7996,0.5902,0.0002,0.15,0.0003,0.0002,0.0029,0.33
lda,Linear Discriminant Analysis,0.7996,0.5857,0.0,0.0,0.0,0.0,0.0,0.029
lightgbm,Light Gradient Boosting Machine,0.7996,0.5871,0.0001,0.2,0.0002,0.0001,0.0028,0.109
dummy,Dummy Classifier,0.7996,0.5,0.0,0.0,0.0,0.0,0.0,0.019
dt,Decision Tree Classifier,0.7994,0.5833,0.0007,0.274,0.0015,0.0004,0.0037,0.026
et,Extra Trees Classifier,0.7994,0.5834,0.001,0.3474,0.0021,0.0008,0.008,0.267


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2611, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr = classification.create_model('lr')

In [39]:
classification.evaluate_model(lr)

In [35]:
classification.save_model(lr, 'lr_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='TARGET',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                 

In [42]:
df_for_model.drop("TARGET", 1).head(1)

Unnamed: 0,CODE_GENDER,CNT_CHILDREN,FLAG_OWN_CAR,FLAG_OWN_REALTY,class_amt_tot,class_amt_credit,class_amt_ANNUITY,class_age
0,M,0,N,Y,meduim,low,low,jeune


In [None]:
lr_model_saved.predict(df_for_model.drop("TARGET", 1).head(1))

array([0])

In [1]:
import numpy as np
import joblib
lr_model_saved = joblib.load("lr_model.pkl")

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [9]:
d = {}
val = ['M',0,'N','Y','meduim','low','low','jeune']
i = 0
for col in ['CODE_GENDER', 'CNT_CHILDREN', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
'class_amt_tot', 'class_amt_credit', 'class_amt_ANNUITY', 'class_age'] :
    d[col] = val[i]
    i +=1

d

{'CODE_GENDER': 'M',
 'CNT_CHILDREN': 0,
 'FLAG_OWN_CAR': 'N',
 'FLAG_OWN_REALTY': 'Y',
 'class_amt_tot': 'meduim',
 'class_amt_credit': 'low',
 'class_amt_ANNUITY': 'low',
 'class_age': 'jeune'}

In [10]:
import pandas as pd

inputs = pd.DataFrame(d, index=[0])
inputs

Unnamed: 0,CODE_GENDER,CNT_CHILDREN,FLAG_OWN_CAR,FLAG_OWN_REALTY,class_amt_tot,class_amt_credit,class_amt_ANNUITY,class_age
0,M,0,N,Y,meduim,low,low,jeune


In [16]:
1 if inputs.CODE_GENDER.values == "M" else 0

1

In [8]:
lr_model_saved.predict(inputs)

  data.columns = data.columns.str.replace(r"[\,\}\{\]\[\:\"\']", "")


array([0], dtype=int64)

In [21]:
bd = open("donnees_entrees_form.json", "a")
bd.write(inputs)
bd.close()

TypeError: write() argument must be str, not DataFrame

In [19]:
bd = pd.read_json('donnees_entrees_form.json')
pd.concat([bd, inputs],0).reset_index(drop=True).to_json("donnees_entrees_form.json")

inputs.to_csv('donnees_entrees_formulaire.csv')

ValueError: Expected object or value