In [None]:
from sys import path
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sklearn as sk
from sklearn.neighbors import LocalOutlierFactor
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

import pickle
from os.path import isfile
from sklearn.base import BaseEstimator

# Put the challenge's helper directories on the import path; the data_io and
# libscores imports further down presumably resolve from these directories.
problem_dir = 'ingestion_program/'
scoring_dir = 'scoring_program/'
path.extend((problem_dir, scoring_dir))
%load_ext autoreload
%autoreload 2

# Dataset location: the load cell below reads from data_dir + '/' + data_name.
# This import must stay after the sys.path additions above.
from data_io import read_as_df
data_dir = 'all_data'
data_name = 'xporters'


# get_metric() yields the metric's name together with the scoring callable;
# the model cell calls scoring_function(y_true, y_pred).
from libscores import get_metric
metric_name, scoring_function = get_metric()

from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import svm
from sklearn import tree

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset through the ingestion program's read_as_df helper and
# preview its first rows (presumably a pandas DataFrame: .head() and
# .to_numpy() are called on it).
dataFrame = read_as_df(data_dir  + '/' + data_name)  
dataFrame.head()

In [None]:
# Work on a plain NumPy array from here on: the helpers below index columns
# by position (getTarget reads column 59 as the target).
data = dataFrame.to_numpy()

In [None]:
def outlierDetection(data, n_neighbors=20, contamination=0.1):
    """Return only the rows that Local Outlier Factor labels as inliers.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_columns)
        Samples to filter. Any number of columns is accepted (the result is
        no longer tied to a fixed width of 60 columns).
    n_neighbors : int, default=20
        Forwarded to ``LocalOutlierFactor``.
    contamination : float, default=0.1
        Forwarded to ``LocalOutlierFactor``.

    Returns
    -------
    numpy.ndarray of shape (n_inliers, n_columns), dtype float64
        The rows whose ``fit_predict`` label is ``1``, in their original
        order, copied into a new array.
    """
    data = np.asarray(data)
    detector = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    labels = np.asarray(detector.fit_predict(data))
    # Rows labelled 1 are kept; a boolean mask selects them in one step and
    # adapts to whatever column count `data` has.
    return np.array(data[labels == 1], dtype=np.float64)

def dimensionReduction(nbDimension, data):
    """Project ``data`` with a PCA fitted on that same data.

    Parameters
    ----------
    nbDimension : int, float, str or None
        Passed unchanged to ``PCA(n_components=...)``.
    data : array-like of shape (n_samples, n_features)
        Samples to fit the PCA on and to transform.

    Returns
    -------
    Whatever ``PCA.fit_transform`` returns for ``data``.
    """
    # No output buffer is pre-allocated: fit_transform returns its own array,
    # and sizing a buffer from nbDimension would raise for any value that is
    # not a plain integer.
    return PCA(n_components=nbDimension).fit_transform(data)

def getTarget(data, target_col=59):
    """Extract the target column of ``data`` as a 1-D float array.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_columns)
        Table whose ``target_col`` column holds the target values.
    target_col : int, default=59
        Index of the target column. The default keeps the original
        behaviour (column 59); negative indices are accepted.

    Returns
    -------
    numpy.ndarray of shape (n_samples,), dtype float64
        A copy of the target column; modifying it does not affect ``data``.
    """
    data = np.asarray(data)
    return np.array(data[:, target_col], dtype=np.float64)


def featuresSelection(data, dataTarget, optionalEstimator, max_depth_tree, min_sample_leaf_tree):
    """Keep the columns of ``data`` that an extra-trees model selects.

    An ``ExtraTreesClassifier`` is fitted on ``(data, dataTarget)`` and handed,
    already fitted, to ``SelectFromModel`` with ``threshold="mean"``.

    Parameters
    ----------
    data : array-like
        Feature matrix used both for fitting and for the final transform.
    dataTarget : array-like
        Targets passed to the forest's ``fit``.
    optionalEstimator : int
        ``n_estimators`` of the forest.
    max_depth_tree : int or None
        ``max_depth`` of the forest.
    min_sample_leaf_tree : int or float
        ``min_samples_leaf`` of the forest.

    Returns
    -------
    The result of ``SelectFromModel.transform(data)``.
    """
    # NOTE(review): a *classifier* is fitted here; if dataTarget is continuous,
    # confirm that this is intended.
    forest = ExtraTreesClassifier(
        n_estimators=optionalEstimator,
        max_depth=max_depth_tree,
        min_samples_leaf=min_sample_leaf_tree,
    ).fit(data, dataTarget)
    selector = SelectFromModel(forest, threshold="mean", prefit=True)
    return selector.transform(data)

In [None]:
class model (BaseEstimator):
    """Wrapper that logs array dimensions around a wrapped estimator.

    ``fit`` and ``predict`` print the dimensions of their inputs, warn (by
    printing) on sample/feature count mismatches, and delegate the actual
    work to the wrapped estimator.

    Parameters
    ----------
    modelToUse : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is also reachable
        through the ``mod`` attribute.
    """
    def __init__(self, modelToUse):
        self.num_train_samples=0
        self.num_feat=1
        self.num_labels=1
        self.is_trained=False
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() (and therefore clone() and repr()) can
        # look it up by that name.
        self.modelToUse = modelToUse

    @property
    def mod(self):
        """The wrapped estimator (alias of ``modelToUse``)."""
        return self.modelToUse

    @mod.setter
    def mod(self, value):
        self.modelToUse = value

    def fit(self, X, y):
        """Fit the wrapped estimator on ``(X, y)``.

        Prints the dimensions of ``X`` and ``y`` and a warning if their
        sample counts differ, then calls ``self.mod.fit(X, y)``.

        Returns
        -------
        self
        """
        self.num_train_samples = X.shape[0]
        # Recomputed on every call so a refit with 1-D input does not keep a
        # stale feature/label count from a previous fit.
        self.num_feat = X.shape[1] if X.ndim > 1 else 1
        print("FIT: dim(X)= [{:d}, {:d}]".format(self.num_train_samples, self.num_feat))
        num_train_samples = y.shape[0]
        self.num_labels = y.shape[1] if y.ndim > 1 else 1
        print("FIT: dim(y)= [{:d}, {:d}]".format(num_train_samples, self.num_labels))
        if (self.num_train_samples != num_train_samples):
            print("ARRGH: number of samples in X and y do not match!")
        self.mod.fit(X,y)
        self.is_trained = True
        return self

    def predict(self, X):
        """Return ``self.mod.predict(X)`` after printing the input dimensions.

        A warning is printed if the number of features differs from the one
        seen in ``fit``.
        """
        num_test_samples = X.shape[0]
        # 1-D input counts as a single feature; without this default the
        # name would be unbound for 1-D X.
        num_feat = X.shape[1] if X.ndim > 1 else 1
        print("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        if (self.num_feat != num_feat):
            print("ARRGH: number of features in X does not match training data!")
        print("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
        return self.mod.predict(X)

In [None]:
def set_plot_config():
    """Apply the seaborn theme and the matplotlib rcParams used by the plots."""
    sns.set()
    sns.set_style("whitegrid")
    sns.set_context("poster")

    # Applied after the seaborn calls, in this order, one key at a time.
    rc_overrides = (
        ('figure.figsize', [8.0, 6.0]),
        ('figure.dpi', 80),
        ('savefig.dpi', 100),
        ('font.size', 10),
        ('axes.labelsize', 10),
        ('axes.titlesize', 17),
        ('ytick.labelsize', 10),
        ('xtick.labelsize', 10),
        ('legend.fontsize', 'large'),
        ('figure.titlesize', 'medium'),
    )
    for rc_key, rc_value in rc_overrides:
        mpl.rcParams[rc_key] = rc_value


def plot_test_distrib(y_proba, y_test, save_path, title):
    """Plot the score distributions of the two classes and save the figure.

    Parameters
    ----------
    y_proba : array-like of shape (n_samples, n_columns)
        Classifier scores; column 1 is the one plotted.
    y_test : array-like of shape (n_samples,)
        Labels; rows with label 0 are plotted as 'b', rows with label 1 as 's'.
    save_path : str or path-like
        Destination passed to ``plt.savefig``.
    title : str
        Figure title.

    Any exception raised while plotting is caught and reported as a printed
    warning, so a failed plot does not interrupt the caller.
    """
    try:
        # Element-wise label comparison and 2-D indexing need arrays.
        y_proba = np.asarray(y_proba)
        y_test = np.asarray(y_test)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider histplot/kdeplot when upgrading.
        sns.distplot(y_proba[y_test==0, 1], label='b')
        sns.distplot(y_proba[y_test==1, 1], label='s')
        plt.xlabel('classifier score')
        plt.ylabel('density')
        plt.title(title)
        plt.legend()
        plt.savefig(save_path)
        plt.clf()
    except Exception as e:
        print('[WARNING] Plot test distrib failed')
        print('[WARNING] ', str(e))


def plot_scores(scores, scores_std, save_path, title):
    """Draw the scores as an error-bar plot and save it to ``save_path``.

    Parameters
    ----------
    scores : sequence
        One score per iteration; plotted against 0..len(scores)-1.
    scores_std : sequence
        Error-bar half-heights, passed as ``yerr``.
    save_path : str or path-like
        Destination passed to ``plt.savefig``.
    title : str
        Figure title.

    Exceptions raised by the plotting calls are caught and reported as a
    printed warning.
    """
    iteration_index = np.arange(len(scores))
    try:
        plt.errorbar(
            iteration_index,
            scores,
            yerr=scores_std,
            fmt='o',
            capsize=20,
            capthick=2,
            label='scores',
        )
        plt.xlabel('iter num')
        plt.ylabel('scores')
        plt.title(title)
        plt.legend()
        plt.savefig(save_path)
        plt.clf()
    except Exception as plot_error:
        print('[WARNING] Plot scores failed')
        print('[WARNING] ', str(plot_error))

In [None]:
# Preprocessing section

# Only outlierDetection is enabled here; feel free to compare results with or
# without the other steps (or with no preprocessing at all) to find the best setup.
# NOTE: this cell overwrites `data`, so running it twice filters the data twice.

data = outlierDetection(data)

# @param nbDimension = number of dimensions to keep, data = array holding the data

# data = dimensionReduction(nbDimension, data)

# @param data = array holding the data, dataTarget = targets of the data (obtained with getTarget)
#        optionalEstimator = number of trees in the forest (10 - 100), max_depth_tree = depth of the trees (careful: too large causes problems)
#        min_sample_leaf_tree = number of samples per leaf (careful: too small causes problems)

# dataTarget = getTarget(data)

# data = featuresSelection(data, dataTarget, optionalEstimator, max_depth_tree, min_sample_leaf_tree)

In [None]:
# Model section

# Column 59 is the target (the column getTarget reads). It has to be removed
# from the feature matrix: otherwise the regressor receives the answer as one
# of its inputs and the score says nothing about the model.
TARGET_COL = 59
y_train = getTarget(data)
X_train = np.delete(data, TARGET_COL, axis=1)

M = model(RandomForestRegressor())
M.fit(X_train, y_train)
# NOTE: the score is computed on the rows the model was trained on, so it is
# an in-sample figure, not an estimate of performance on unseen data.
score = scoring_function(y_train, M.predict(X_train))
print('score = %5.4f' % score)

In [None]:
# Visualisation section

