In [10]:
# data treatment
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
import numpy.matlib

# machine learning
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# deep learning
import tensorflow as tf
from tensorflow import keras

In [11]:
# Read the three challenge CSV files from the working directory, in order:
# training features, training labels, test features.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("valeo_xtrain.csv", "valeo_ytrain.csv", "valeo_xtest.csv")
)

In [12]:
# Keep the column labels of the feature frame and of the label frame so each
# group can be selected again by name later on.
X_ind, y_ind = X_train.columns, y_train.columns

In [13]:
# Put the label column(s) side by side with the features (aligned on the row
# index) so that features and labels can be sampled together.
train = pd.concat([X_train, y_train], axis="columns")

In [14]:
# Hold-out configuration
train_size = 27086  # number of sampled rows used for fitting
valid_size = 500    # number of sampled rows used for validation
seed = 0            # random_state of the row sampling below

# Draw train_size + valid_size rows in one seeded shuffle, then cut the result
# positionally: the leading rows are for fitting, the trailing ones for validation.
temp = train.sample(n=train_size + valid_size, random_state=seed)
fit_rows = temp.iloc[:train_size]
valid_rows = temp.iloc[train_size:train_size + valid_size]

# Convert each part to numpy; label arrays are flattened to 1-D.
gb_X_train = np.array(fit_rows[X_ind])
gb_y_train = np.array(fit_rows[y_ind]).ravel()
gb_X_valid = np.array(valid_rows[X_ind])
gb_y_valid = np.array(valid_rows[y_ind]).ravel()

# Fold assignment for PredefinedSplit: -1 marks a row that is only ever used
# for training, 0 marks a row that belongs to the (single) validation fold.
split_index = [-1] * train_size
split_index += [0] * valid_size
split = PredefinedSplit(test_fold=split_index)

# Stack the two parts back together in the same order as split_index; the grid
# search separates them again according to `split`.
gb_X = np.concatenate([gb_X_train, gb_X_valid])
gb_y = np.concatenate([gb_y_train, gb_y_valid])

In [15]:
# Hyper-parameter candidates; every combination of these values is evaluated.
grid = {
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 6, 9, 12, 15],
    'max_features': ['sqrt', 'log2', 0.2],
}

# Seed the estimator with the notebook's `seed`. Every max_features value in
# the grid restricts each split to a random subset of the features, so an
# unseeded booster would rank the candidates differently from run to run.
gb = GradientBoostingClassifier(random_state=seed)

# `split` defines one fixed train/validation fold, so each candidate is fitted
# once on the training rows and scored by accuracy on the validation rows.
grid_search = GridSearchCV(gb, grid, cv=split, scoring='accuracy')
grid_search.fit(gb_X, gb_y)
print(grid_search.best_estimator_)

GradientBoostingClassifier(learning_rate=0.05, max_features='sqrt')


In [16]:
# Take the estimator with the best hyper-parameter values found by the grid
# search. With GridSearchCV's default refit=True this object has already been
# fitted on the data passed to grid_search.fit, so a fitted model is pickled.
gb = grid_search.best_estimator_

# Save the model to file. The context manager closes the handle as soon as the
# dump finishes, so the pickle is flushed to disk and no file descriptor leaks.
with open("gb.pickle.dat", "wb") as model_file:
    pickle.dump(gb, model_file)

In [17]:
# Fit the selected model on the complete training frames. y_train is a
# DataFrame (2-D) while scikit-learn expects 1-D labels, so flatten it here,
# as is done for the grid-search labels, instead of relying on the implicit
# conversion that raises a DataConversionWarning.
gb.fit(X_train, np.ravel(y_train));

In [18]:
# Predict on the test set and keep, for every row, the probability found in
# the second column (index 1) of predict_proba.
test_proba = gb.predict_proba(X_test)
sscore = test_proba[:, 1]

# Show how many anomaly scores were produced, then save them to file with six
# decimals each.
print(sscore.shape)
np.savetxt('ytest_challenge_student.csv', sscore, fmt='%1.6f', delimiter=',')

(27587,)
