In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.utils import resample

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Tuned gradient-boosting classifier.
# random_state is pinned because max_features='sqrt' samples candidate features
# at every split, so unseeded runs give slightly different scores each time.
# NOTE(review): criterion='mse' and loss='deviance' are older scikit-learn
# spellings; recent releases rename them ('squared_error' / 'log_loss') --
# confirm against the installed version before upgrading.
clf = GradientBoostingClassifier(
    criterion='mse',
    learning_rate=0.01,
    loss='deviance',
    max_depth=3,
    max_features='sqrt',
    n_estimators=1000,
    random_state=0,
)

# Data normal

In [None]:
# Load the raw (non-resampled) data. The first line of each feature file is
# skipped and no header is parsed, so feature columns get integer labels.
csv_options = dict(header=None, skiprows=[0])
X_train = pd.read_csv("valeo_xtrain.csv", **csv_options)
X_test = pd.read_csv("valeo_xtest.csv", **csv_options)

# The label file keeps its header row; its values are flattened to 1-D.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

# Show the three shapes as a quick sanity check.
(X_train.shape, y_train.shape, X_test.shape)

# Data resample

In [3]:
# Reload the data, then balance the classes: rows labelled 1 are drawn with
# replacement until there are as many of them as rows labelled 0.
X_train = pd.read_csv("valeo_xtrain.csv", header=None, skiprows=[0])
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()
X_test = pd.read_csv("valeo_xtest.csv", header=None, skiprows=[0])

# Attach the labels temporarily so features and target are resampled together.
X_train['target'] = y_train
df_majority = X_train[X_train['target'] == 0]
df_minority = X_train[X_train['target'] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,  # fixed seed: the same rows are drawn on every run
)

# Stack both classes, then split back into target and features.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
y_train = df_upsampled['target']
X_train = df_upsampled.drop(columns='target')

# Test d'origine

In [None]:
# Baseline: out-of-fold decision scores from a logistic regression (3-fold CV).
# LogisticRegression comes from the notebook's import cell.
clf = LogisticRegression(solver='liblinear', max_iter=1000)
score = cross_val_predict(clf, X_train, y_train, cv=3, method='decision_function')
score
# To get the AUC instead of the raw scores: roc_auc_score(y_train, score)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: out-of-fold scores from a random forest (3-fold CV).
# predict_proba yields one column per class; only the column of the larger
# class label is kept so that `score` is a 1-D vector, like the logistic
# baseline, and can be passed straight to roc_auc_score.
clf = RandomForestClassifier(random_state=0)  # seeded for reproducible scores
score = cross_val_predict(clf, X_train, y_train, cv=3, method='predict_proba')[:, 1]
score
# To get the AUC instead of the raw scores: roc_auc_score(y_train, score)

# Avec le meilleur modèle

In [None]:
# Gradient boosting with default settings, scored by 10-fold out-of-fold AUC.
clf = GradientBoostingClassifier()
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

# Choix du modèle avec les meilleurs param

In [None]:
# Tuned gradient boosting, scored by 10-fold out-of-fold AUC.
# random_state is pinned because max_features='sqrt' makes every split sample
# its candidate features; without a seed the AUC changes from run to run.
clf = GradientBoostingClassifier(
    criterion='mse',
    learning_rate=0.01,
    loss='deviance',
    max_depth=3,
    max_features='sqrt',
    n_estimators=1000,
    random_state=0,
)
clf_dec_fct = cross_val_predict(clf, X_train, y_train, cv=10, method='decision_function')
roc_auc_score(y_train, clf_dec_fct)

### Soumission1  : 0.7893625866
### Soumission2 : 0.794577212266

# Retrait de var par corrélation à 90%

In [None]:
# Columns kept after removing variables correlated above 90 %.
columns_corr90 = [0,1,2,3,4,6,8,11,12,15,16,17,18,19,20,21,22,23,24,25,26]

X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])[columns_corr90]
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])[columns_corr90]

# Reload the labels together with the features: the features above are the raw
# rows, so labels left over from an earlier resampling cell would not line up
# with them and the cross-validation below would fail on a length mismatch.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

X_train.shape

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

### Soumission1 : 0.791044780761
### Soumission2 :0.795053973895

# Retrait de var par corrélation à 80%

In [None]:
# Columns kept after removing variables correlated above 80 %.
columns_corr80 = [0,1,2,3,4,6,8,15,16,17,18,19,20,21,25,26]

X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])[columns_corr80]
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])[columns_corr80]

# Reload the labels together with the features: the features above are the raw
# rows, so labels left over from an earlier resampling cell would not line up
# with them and the cross-validation below would fail on a length mismatch.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

X_train.shape

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

### Soumission1 : 0.791105632919
### Soumission2 : 0.796936969811

# Par choix des var par RFE (2e choix, cv=10)

In [None]:
# Columns chosen by the RFE run, in the order it listed them.
columns_rfe = [25, 19, 17, 21, 18, 14, 11, 20, 4, 0, 8, 22, 26]

X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])[columns_rfe]
# Apply the same selection to the test features; otherwise a model fitted on
# these columns would be asked to predict on a differently shaped X_test.
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])[columns_rfe]

# Reload the labels together with the features: the features above are the raw
# rows, so labels left over from an earlier resampling cell would not line up.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

X_train.shape

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

### Soumission :  0.788268802747

# Choix des variables par SelectFromModel saga opti
[14, 15, 16, 17, 18, 20, 21, 25, 26]

In [None]:

X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])
# Reload the labels together with the features: the features above are the raw
# rows, so labels left over from an earlier resampling cell would not line up
# and the selector fit below would fail on a length mismatch.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

# Fit an unpenalised logistic regression (saga solver) and let SelectFromModel
# decide which columns to keep.
selector = SelectFromModel(estimator=LogisticRegression(
    max_iter=1000, penalty='none', solver='saga')).fit(X_train, y_train)

# Apply the same boolean column mask to train and test features.
selected_columns = selector.get_support()
X_train = X_train.loc[:, selected_columns]
X_test = X_test.loc[:, selected_columns]

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

### Soumission : 0.800773662108

# Choix des variables par SelectFromModel liblinear opti

In [None]:
X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])
# Reload the labels together with the features: the features above are the raw
# rows, so labels left over from an earlier resampling cell would not line up
# and the selector fit below would fail on a length mismatch.
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()

# Fit an L1-penalised logistic regression (liblinear solver) and let
# SelectFromModel decide which columns to keep.
selector = SelectFromModel(estimator=LogisticRegression(
    max_iter=1000, penalty='l1', solver='liblinear')).fit(X_train, y_train)

# Apply the same boolean column mask to train and test features.
selected_columns = selector.get_support()
X_train = X_train.loc[:, selected_columns]
X_test = X_test.loc[:, selected_columns]

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

### Soumission : 0.794990115412

# Resampling des données avant entrainement

In [None]:
# Reload the data, then balance the classes: rows labelled 1 are drawn with
# replacement until there are as many of them as rows labelled 0.
X_train = pd.read_csv("valeo_xtrain.csv", header=None, skiprows=[0])
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()
X_test = pd.read_csv("valeo_xtest.csv", header=None, skiprows=[0])

# Attach the labels temporarily so features and target are resampled together.
X_train['target'] = y_train
df_majority = X_train[X_train['target'] == 0]
df_minority = X_train[X_train['target'] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,  # fixed seed: the same rows are drawn on every run
)

# Stack both classes, then split back into target and features.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
y_train = df_upsampled['target']
X_train = df_upsampled.drop(columns='target')

In [None]:
# Tuned gradient boosting, scored by 10-fold out-of-fold AUC.
# random_state is pinned because max_features='sqrt' makes every split sample
# its candidate features; without a seed the AUC changes from run to run.
# NOTE(review): if X_train/y_train were upsampled with replacement before this
# split, copies of one row can sit in both the training and validation folds,
# which tends to inflate the AUC -- confirm this is acceptable.
clf = GradientBoostingClassifier(
    criterion='mse',
    learning_rate=0.01,
    loss='deviance',
    max_depth=3,
    max_features='sqrt',
    n_estimators=1000,
    random_state=0,
)
clf_dec_fct = cross_val_predict(clf, X_train, y_train, cv=10, method='decision_function')
roc_auc_score(y_train, clf_dec_fct)

### Soumission : 0.794224746613

# Resampling + variables sélectionnées par SelectFromModel (ancienne version)
[14, 15, 16, 17, 18, 20, 21, 25, 26]

In [None]:
# Reload the data, then balance the classes: rows labelled 1 are drawn with
# replacement until there are as many of them as rows labelled 0.
X_train = pd.read_csv("valeo_xtrain.csv",header=None,skiprows=[0])
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()
# Reload the test features as the other resampling cells do, so that a later
# column selection starts from the full set of columns rather than from
# whatever an earlier cell left in X_test.
X_test = pd.read_csv("valeo_xtest.csv",header=None,skiprows=[0])

# Attach the labels temporarily so features and target are resampled together.
X_train['target'] = y_train
df_majority = X_train[X_train.target==0]
df_minority = X_train[X_train.target==1]
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=df_majority.shape[0],
                                 random_state=0)

# Stack both classes, then split back into target and features.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
y_train = df_upsampled.target
X_train = df_upsampled.drop('target', axis=1)

In [None]:
# Restrict train and test features to the same hand-listed columns.
fsm_columns = [14, 15, 16, 17, 18, 20, 21, 25, 26]
X_train = X_train[fsm_columns]
X_test = X_test[fsm_columns]

In [None]:
# 10-fold out-of-fold decision scores for the current `clf`, summarised as AUC.
clf_dec_fct = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=10,
    method='decision_function',
)
roc_auc_score(y_train, clf_dec_fct)

In [None]:
# Keep references to the current training features and labels under second names.
X2, y2 = X_train, y_train

### Soumission : 0.799098413594

# Resampling + variables sélectionnées par SelectFromModel

In [None]:
# Reload the data, then balance the classes: rows labelled 1 are drawn with
# replacement until there are as many of them as rows labelled 0.
X_train = pd.read_csv("valeo_xtrain.csv", header=None, skiprows=[0])
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()
X_test = pd.read_csv("valeo_xtest.csv", header=None, skiprows=[0])

# Attach the labels temporarily so features and target are resampled together.
X_train['target'] = y_train
df_majority = X_train[X_train['target'] == 0]
df_minority = X_train[X_train['target'] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,  # fixed seed: the same rows are drawn on every run
)

# Stack both classes, then split back into target and features.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
y_train = df_upsampled['target']
X_train = df_upsampled.drop(columns='target')



In [None]:
# Fit an L1-penalised logistic regression (liblinear solver) on the current
# training data and let SelectFromModel decide which columns to keep.
l1_logit = LogisticRegression(max_iter=1000, penalty='l1', solver='liblinear')
selector = SelectFromModel(estimator=l1_logit).fit(X_train, y_train)

# Apply the same boolean column mask to train and test features.
column_mask = selector.get_support()
X_train = X_train.loc[:, column_mask]
X_test = X_test.loc[:, column_mask]

In [None]:
# Display the selector's column mask (the same mask used to filter X_train / X_test).
selector.get_support()

In [None]:
# 10-fold out-of-fold class probabilities for the current `clf`.
# predict_proba returns one column per class, while roc_auc_score needs a
# single score per sample for a binary target, so only the column of the
# larger class label is kept; passing the full 2-D array raises an error.
clf_dec_fct = cross_val_predict(clf, X_train, y_train, cv=10, method='predict_proba')[:, 1]
roc_auc_score(y_train, clf_dec_fct)

### Soumission : 0.79

# Modèle XGBoost
Avec les variables sélectionnées

In [None]:
# Reload the data, then balance the classes: rows labelled 1 are drawn with
# replacement until there are as many of them as rows labelled 0.
X_train = pd.read_csv("valeo_xtrain.csv", header=None, skiprows=[0])
y_train = pd.read_csv("valeo_ytrain.csv").values.ravel()
X_test = pd.read_csv("valeo_xtest.csv", header=None, skiprows=[0])

# Attach the labels temporarily so features and target are resampled together.
X_train['target'] = y_train
df_majority = X_train[X_train['target'] == 0]
df_minority = X_train[X_train['target'] == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,  # fixed seed: the same rows are drawn on every run
)

# Stack both classes, then split back into target and features.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
y_train = df_upsampled['target']
X_train = df_upsampled.drop(columns='target')

In [None]:
# Restrict train and test features to the same hand-listed columns.
xgb_columns = [25,19,18,11,8,20,0,2,17,22,24,1,3,10,4,21,13]
X_train = X_train[xgb_columns]
X_test = X_test[xgb_columns]

In [None]:
# XGBoost scored by 10-fold out-of-fold AUC.
# AUC is a ranking metric, so it is computed from the predicted probability of
# the larger class label rather than from hard 0/1 predictions, which collapse
# the ROC curve to a single operating point.
# No fit before cross_val_predict: it trains its own copies of the estimator on
# each fold, so fitting `boost` beforehand has no effect on the result.
boost = XGBClassifier(use_label_encoder=False)
y_pred = cross_val_predict(boost, X_train, y_train, cv=10, method='predict_proba')[:, 1]
roc_auc_score(y_train, y_pred)

In [None]:
# Recompute the AUC of the out-of-fold XGBoost predictions held in `y_pred`.
roc_auc_score(y_train, y_pred)

In [None]:
# Display the current training features.
X_train

In [None]:
# Fit XGBoost on the current training data, then score the test set with the
# probability in column 1 of predict_proba.
boost_fitted = boost.fit(X_train, y_train)
test_probabilities = boost_fitted.predict_proba(X_test)
submit = test_probabilities[:, 1]
submit

### Soumission : 0.7

# File to submit

In [4]:
# Fit the current `clf` on the current training data; the value returned by
# fit() is kept under `fitted_clif`.
fitted_clif = clf.fit(X_train,y_train)

In [5]:
# Three views of the fitted model's output on the test set:
# raw decision scores, per-class probabilities, and hard class labels.
submit = clf.decision_function(X_test)
submit2 = clf.predict_proba(X_test)
pred = clf.predict(X_test)

In [9]:
# Column 1 of the per-class probabilities on the test set.
sol = clf.predict_proba(X_test)[:,1]


In [6]:
# Preview the first five decision scores, probability rows and predicted labels.
submit[:5],submit2[:5],pred[:5]

(array([-0.87322559,  0.4301853 , -0.42050539, -1.22861291, -1.91408143]),
 array([[0.70541643, 0.29458357],
        [0.39408208, 0.60591792],
        [0.60360418, 0.39639582],
        [0.77357571, 0.22642429],
        [0.87147698, 0.12852302]]),
 array([0, 1, 0, 0, 0], dtype=int64))

In [None]:
# Refit `clf` on the current training data and collect the per-class
# probabilities it gives on the test set.
refit_clf = clf.fit(X_train, y_train)
submit3 = refit_clf.predict_proba(X_test)
submit3

In [None]:
# Print the three prediction arrays side by side, one test row per line.
for row in zip(submit, submit2, submit3):
    print(*row)

In [None]:
# Write the scores held in `submit` to the submission file, formatted with six
# decimals; the shape is printed first as a quick check.
submission_path = 'ytest_challenge_calligaro.csv'
print(submit.shape)
np.savetxt(submission_path, submit, fmt='%1.6f', delimiter=',')