In [1]:
#importation les bibliotheques
import pandas as pd
import numpy as np
import scipy
import itertools
import matplotlib.pyplot as plt 
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
df = pd.read_csv("/Users/apple/Documents/application_train.csv")
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## [Problème 1] Validation

Nous allons preparer les données .
- Nous allons selectionner les colonnes categorielle. Pour savoir lesquelles sont coherentes avec le TARGET.
- Nous allons ensuite selectioinner les valeurs les plus coherentes pour simplifier le jeu de donnees.

#### Traitement des valeurs categorielles

In [3]:
df_obj = df.select_dtypes("object")

# supprimer les valeurs manquantes
df_obj = df_obj.dropna(axis=1)

# Nous allons aussi enveler ses lignes du TARGET
df_obj["TARGET"] = df["TARGET"]

#La `XNA` semble etre une valeur marginale. Nous allons supprimer ces valeurs pour garder seulement les `F` et `M`.
#selectionner uniquement les valeurs binaires
df_obj.drop(df_obj.loc[df_obj['CODE_GENDER']=='XNA'].index, inplace=True)

# transformer ses valeurs en valeurs numerique
col = []
for c in df_obj.columns:
    if df_obj[c].nunique()==2:
        col.append(c)
        
df_obj = df_obj[col]

X = df_obj.drop("TARGET",axis=1)


le = LabelEncoder()
for c in X.columns:
    le.fit(X[c])
    X[c] = le.transform(X[c])

#Nous allons transformer ces valeurs categorielles en valeurs numerique afin de calculer les coorelations.
X= pd.get_dummies(X)

X["TARGET"] = df["TARGET"]

#calculer la matriec de correlation
corr_matrix = (X.corr())
corr_matrix

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,TARGET
NAME_CONTRACT_TYPE,1.0,-0.008755,0.004009,0.067155,-0.030886
CODE_GENDER,-0.008755,1.0,0.345848,-0.044374,0.05471
FLAG_OWN_CAR,0.004009,0.345848,1.0,-0.00282,-0.02185
FLAG_OWN_REALTY,0.067155,-0.044374,-0.00282,1.0,-0.006146
TARGET,-0.030886,0.05471,-0.02185,-0.006146,1.0


La matrice de correlation montrent que les variables categorielles sont tres significatives pour expliquer le TARGET. On remarque que le `FLAG_OWN_CAR` et `CODE_GENDER` on la plus grande correlation entre eux.
Nous allons retenir `CODE_GENDER` dans la construction de notre modele d'apprentissage automatique.

#### Traitement des variables numeriques

In [4]:
#selectionner les valeurs numeriques
df_num = df.select_dtypes('number')

# supprimer des colonnes qui ont plus de 10% de valeurs manquantes
total = (df_num.isnull().sum(axis=1)).sort_values(ascending=False) #total des valeurs manquantes
percent = (total/df_num.isnull().count()).sort_values(ascending=False) #pourcentage des valeurs manquantes
missing_data = pd.concat([total,percent], axis=1,keys=["Total","percent"]) # le frame des valeurs manquantes

#supprimer les colonnes si le pourcentage depasse 10
df_num = df_num.drop(missing_data[missing_data['percent'] > 0.2].index,1)

#remplacer les valeurs nulle par une valeur tres absurdes
df_clean = df_num.fillna(-1)

#matrice de correlation pour voir les variables les plus significatives
corrs = df_clean.corr()
m_corr = corrs[["TARGET"]].sort_values(by=['TARGET'], ascending=False) 
#selection les 20 variables les plus correlé avec le TARGET

features = m_corr[1:21].index

X = df_clean[features]
y = df["TARGET"]

In [5]:
#implementation de cross validation
#nous faisons 5 sous ensemble par defaut
k = 5
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver= "liblinear")
acc_score = []

for train_index, test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
     
avg_acc_score = sum(acc_score)/k
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.9198738272929776, 0.9172709830574616, 0.9188969464407661, 0.9197261877662515, 0.9205879483594029]
Avg accuracy : 0.9192711785833719


## [Problem 2] Grid search

In [6]:
# standadiser les donnees
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=50)

#standardize
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Grid search cross validation
grid={"C":np.logspace(-3,3,10)}
model = LogisticRegression()
model_cv = GridSearchCV(model,grid,cv=10)
model_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_cv.best_params_)
print("accuracy :",model_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.001}
accuracy : 0.9189666305274565


In [7]:
model2 = LogisticRegression(C=0.001)
model2.fit(X_train,y_train)
print("score",model2.score(X_test,y_test))

score 0.9199709497691158


## [Problem 3] Survey from Kaggle Notebooks


- le réglage des paramètres est principalement lié à la stabilité de votre modèle dans vos ensembles d'entraînement, de test et de validation, ce qui est évidemment très important car cela vous donne en fait une indication sur la manière dont votre modèle se comportera dans les données de test. C'est pourquoi le réglage des paramètres doit être fait judicieusement.

- la recherche aléatoire devrait dans un cas général être préférée à une recherche par grille, à moins que vous ne sachiez exactement ce que vous voulez comparer (et dans ce cas, il ne s'agirait probablement pas d'une simple grille, mais d'un ensemble de plusieurs petites sous-grilles)

- The process of hyperparameter tuning (also called hyperparameter optimization) means finding the combination of hyperparameter values for a machine learning model that performs the best - as measured on a validation dataset - for a problem.
https://www.kaggle.com/willkoehrsen/intro-to-model-tuning-grid-and-random-search

## [Problem 4] Creating a model with high generalization performance


In [8]:
# source : https://www.kaggle.com/crawford/hyperparameter-search-comparison-grid-vs-random

# CHERCHEZ FOR PARAMETERS
def cherchez(estimator, param_grid, search):
    """
    This is a helper function for tuning hyperparameters using teh two search methods.
    Methods must be GridSearchCV or RandomizedSearchCV.
    Inputs:
        estimator: Logistic regression, SVM, KNN, etc
        param_grid: Range of parameters to search
        search: Grid search or Randomized search
    Output:
        Returns the estimator instance, clf
    
    """   
    try:
        if search == "grid":
            clf = GridSearchCV(
                estimator=estimator, 
                param_grid=param_grid, 
                scoring=None,
                n_jobs=-1, 
                cv=10, 
                verbose=0,
                return_train_score=True
            )
        elif search == "random":           
            clf = RandomizedSearchCV(
                estimator=estimator,
                param_distributions=param_grid,
                n_iter=10,
                n_jobs=-1,
                cv=10,
                verbose=0,
                random_state=1,
                return_train_score=True
            )
    except:
        print('Search argument has to be "grid" or "random"')
        sys.exit(0)
        
    # Fit the model
    clf.fit(X=X_train, y=y_train)
    
    return clf   

### Models being tested
1. Logisitc Regresison
    Using Grid search and Randomized search for tuning hyperparameters

In [9]:
# Logistic Regression
# Paramaters
logreg_params = {} 
logreg_params["C"] =  [0.01, 0.1, 10, 100]
logreg_params["fit_intercept"] =  [True, False]
logreg_params["warm_start"] = [True,False]
logreg_params["random_state"] = [1]

lr_dist = {}
lr_dist["C"] = scipy.stats.expon(scale=.01)
lr_dist["fit_intercept"] =  [True, False]
lr_dist["warm_start"] = [True,False]
lr_dist["random_state"] = [1]

logregression_grid = cherchez(LogisticRegression(), logreg_params, search="grid")
acc = accuracy_score(y_true=y_test, y_pred=logregression_grid.predict(X_test))
cfmatrix_grid = confusion_matrix(y_true=y_test, y_pred=logregression_grid.predict(X_test))
print("**Grid search results**")
print("Best training accuracy:\t", logregression_grid.best_score_)
print("Test accuracy:\t", acc)

logregression_random = cherchez(LogisticRegression(), lr_dist, search="random")
acc = accuracy_score(y_true=y_test, y_pred=logregression_random.predict(X_test))
cfmatrix_rand = confusion_matrix(y_true=y_test, y_pred=logregression_random.predict(X_test))
print("**Random search results**")
print("Best training accuracy:\t", logregression_random.best_score_)
print("Test accuracy:\t", acc)

**Grid search results**
Best training accuracy:	 0.9189666305274565
Test accuracy:	 0.9199709497691158
**Random search results**
Best training accuracy:	 0.9189666305274565
Test accuracy:	 0.9199709497691158


In [10]:
# Get default hyperparameters
model = lgb.LGBMClassifier()
default_params = model.get_params()

# Remove the number of estimators because we set this to 10000 in the cv call
del default_params['n_estimators']

# Create a training and testing dataset
train_set = lgb.Dataset(data = X_train)
test_set = lgb.Dataset(data = X_test)

#cross validation with 5 folds
N_FOLDS = 5
# Cross validation with early stopping
cv_results = lgb.cv(default_params, 
                    train_set, num_boost_round = 10000, 
                    early_stopping_rounds = 100, 
                    metrics = 'auc', 
                    nfold = N_FOLDS, seed = 42)

Please use silent argument of the Dataset constructor to pass this parameter.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 20
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 20
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 172206, number of used features: 20
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 172206, number of used features: 20
















In [11]:
print('The maximum validation ROC AUC was: {:.5f} with a standard deviation of {:.5f}.'.format(cv_results['auc-mean'][-1], cv_results['auc-stdv'][-1]))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['auc-mean'])))

The maximum validation ROC AUC was: 1.00000 with a standard deviation of 0.00000.
The optimal number of boosting rounds (estimators) was 1.


Nous allons utiliser le model de regression avec l'hyper paramettre a C = 0.001. Ce model donne un tres bon score.

### [Problem 5] Final model selection


In [25]:
# charger la data 
df_test = pd.read_csv('/Users/apple/Documents/application_test.csv')

#selectionner les valeurs numeriques
df_nb = df_test.select_dtypes('number')

# supprimer des colonnes qui ont plus de 10% de valeurs manquantes
total = (df_nb.isnull().sum(axis=1)).sort_values(ascending=False) #total des valeurs manquantes
percent = (total/df_nb.isnull().count()).sort_values(ascending=False) #pourcentage des valeurs manquantes
missing_data = pd.concat([total,percent], axis=1,keys=["Total","percent"]) # le frame des valeurs manquantes

#supprimer les colonnes si le pourcentage depasse 10
df_nb = df_nb.drop(missing_data[missing_data['percent'] > 0.2].index,1)

#remplacer les valeurs nulle par une valeur tres absurdes
df_cleaned = df_nb.fillna(-1)


X_test = df_cleaned[features]


# standardisation
scaler = StandardScaler()
X_trans = scaler.fit_transform(X_test)


# Paramaters
logreg_params = {} 
logreg_params["C"] =  [0.01, 0.1, 10, 100]
logreg_params["fit_intercept"] =  [True, False]
logreg_params["warm_start"] = [True,False]
logreg_params["random_state"] = [1]

mdl = cherchez(LogisticRegression(), lr_dist, search="random")
y_pred=mdl.predict(X_trans)

In [23]:
# preparer la soumission sur Kaggle
ids = df_test['SK_ID_CURR']
ids = ids.astype(int)

submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': y_pred})
submission.to_csv("./submission.csv", index=False)

In [26]:
submission

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0
1,100005,0
2,100013,0
3,100028,0
4,100038,0
...,...,...
48739,456221,0
48740,456222,0
48741,456223,0
48742,456224,0
