# Défi Grosses Data 2018

## Début Commun pour tous

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
from os import listdir
import Annex

Importe les données concaténées:

In [None]:
# Load the concatenated raw data set and remember how many rows it has;
# the count is compared later with the number of rows left once the
# incomplete rows have been removed.
df = Annex.get_data_raw()
N_withNA = len(df)

Importe les données séparées par type:

In [None]:
# Load the same data split by type; the call returns four objects that are
# unpacked below.
# NOTE(review): the names suggest quantitative / qualitative / date / target
# parts -- confirm against Annex.get_data_tidied.
# None of these four names is used again in the visible part of this notebook.
meteo_quant, meteo_qual, meteo_date, meteo_y=Annex.get_data_tidied()

Remplace les variables qualitatives par leurs indicatrices.

**Attention : ici, seules les variables "mois" et "insee" sont considérées comme qualitatives !** (what about the wind? what about "ech"?)

In [None]:
# Replace df with the result of Annex.convert_month_to_int.
# NOTE(review): presumably re-encodes the month column as integers --
# confirm against the Annex module.
df=Annex.convert_month_to_int(df)

In [None]:
# Missing values in these five columns are replaced by 0; every other column
# keeps its NaN entries.
for _col in ('flir1SOL0', 'fllat1SOL0', 'flsen1SOL0', 'flvis1SOL0', 'rr1SOL0'):
    df[_col] = df[_col].fillna(0)

In [None]:
# Build indicator (dummy) columns from the 'insee' column, append them to the
# data and remove the original 'insee' column.
# NOTE(review): on a DataFrame, pd.get_dummies only encodes object / string /
# category columns by default; if 'insee' is numeric it is returned unchanged
# and then dropped altogether -- confirm its dtype.
df_dummies = pd.get_dummies(df.loc[:, ['insee']])
df_full_qtt = pd.concat((df, df_dummies), axis=1).drop(columns=['insee'])

Sépare les échantillons d'apprentissage et de test

In [None]:
# Number of missing values per column (shown as the cell output).
# For a single yes/no answer use: df_full_qtt.isna().values.any()
df_full_qtt.isna().sum()

In [None]:
# Keep only the rows without any missing value, then report how many rows
# were discarded, both as a count and as a percentage of the raw data.
df_clean = df_full_qtt.dropna()
N_withoutNA = len(df_clean)
n_dropped = N_withNA - N_withoutNA
pct_dropped = n_dropped / N_withNA * 100
print("Nous avons éliminé %d données soit %0.2f %s" % (n_dropped, pct_dropped, '%'))

In [None]:
from sklearn.model_selection import train_test_split

# Target: the 'tH2_obs' column. Features: every other column of df_clean.
# NOTE: the 'date' column is still among the features at this point; it is
# taken out in a later cell, before the scaler is fitted.
Y = df_clean['tH2_obs']
X = df_clean.drop(columns=['tH2_obs'])

# 70 % training / 30 % test, with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=11
)
X_train.shape, X_test.shape

## Maintenant, faites votre vie!

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Keep the dates aside as 'YYYY-MM-DD' strings, then remove the 'date' column
# so that it is not passed to the scaler.
date_train = X_train['date'].apply(lambda day: day.strftime('%Y-%m-%d'))
date_test = X_test['date'].apply(lambda day: day.strftime('%Y-%m-%d'))
X_train = X_train.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])

# Neural networks may need standardised explanatory variables: the scaler is
# fitted on the training set only and the same transformation is then applied
# to the test set.
scaler = StandardScaler().fit(X_train)
Xnet_train, Xnet_test = scaler.transform(X_train), scaler.transform(X_test)

# The formatted dates are NOT appended to the scaled matrices (an earlier
# attempt to reshape and concatenate them was left disabled by the author).

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate architectures: one hidden layer of 50, 60, 70 or 80 units.
param_grid = [{"hidden_layer_sizes": [(n_units,) for n_units in (50, 60, 70, 80)]}]

# 10-fold cross-validated search, using all available cores.
nnet = GridSearchCV(MLPRegressor(max_iter=500), param_grid, cv=10, n_jobs=-1)
nnetOpt = nnet.fit(Xnet_train, Y_train)

# Best setting found. The figure printed is 1 minus best_score_ (the mean
# cross-validated score of the best candidate), i.e. an error, not a score.
best_error = 1. - nnetOpt.best_score_
print("Meilleur score = %f, Meilleur paramètre = %s" % (best_error, nnetOpt.best_params_))

In [None]:
# Fit one network with a single hidden layer of 50 units on the whole
# training set. The size is hard-coded, not read from the grid search above.
# nnetOpt is rebound to this model (fit() returns the estimator itself), so it
# no longer refers to the grid-search object.
mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=500)
mlp.fit(Xnet_train, Y_train)
nnetOpt = mlp

In [None]:
# Estimate of the prediction error on the test set (shown as the cell output).
# score() of a scikit-learn regressor returns R^2, so this value is 1 - R^2.
1-nnetOpt.score(Xnet_test,Y_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the test sample, then print the mean squared error and the R^2
# between the observed and the predicted targets.
Y_pred = nnetOpt.predict(Xnet_test)
for label, metric in (("MSE =", mean_squared_error), ("R2 =", r2_score)):
    print(label, metric(Y_test, Y_pred))

In [None]:
# Display the shape of the training target (cell output).
Y_train.shape

In [None]:
# Turn both targets into (n_samples, 1) column arrays; a later cell reads
# Y_train.shape[1] as the number of network outputs.
# pandas Series no longer provide .reshape(), so the conversion goes through
# NumPy. np.asarray also accepts an array, so re-running this cell is harmless.
Y_train = np.asarray(Y_train).reshape(-1, 1)
Y_test = np.asarray(Y_test).reshape(-1, 1)

import sys

# Make the project-local ./libs directory importable (added only once, even
# if the cell is executed several times).
if "./libs" not in sys.path:
    sys.path.append("./libs")
# The evolutionary-algorithm module is left disabled here, as in the original:
#import generic_evolutionary_algorithm as gea
import NNet

In [None]:
import itertools
import time
# The `imp` module was removed in Python 3.12. importlib provides the same
# reload() function; binding it to the name `imp` keeps the `imp.reload(...)`
# call of a later cell working.
import importlib as imp

import NNet

imp.reload(NNet)  # re-execute the NNet module so that recent edits are used

start = time.time()  # reference point for the elapsed time printed at the end

# Layer widths at both ends of the network: number of feature columns and
# number of target columns (Y_train must therefore be 2-D at this point).
n_output = [Y_train.shape[1]]
n_input = [X_train.shape[1]]

# Fixed settings handed to NNet.build_model for every configuration.
type_layer = ['logistic', 'logistic', 'linear']
threshold = 1e-4
max_iter = 1000
mini_batch_coef = 0.1
do_early_stopping = False

# Hyper-parameter grids; every combination is evaluated. The trailing
# comments list the alternative values the author left next to each loop.
# NOTE(review): alpha / mom / wd look like learning rate, momentum and weight
# decay -- confirm against NNet.build_model.
alphas = [0.35]              # 0.002, 0.01, 0.05, 0.2, 1, 5
momenta = [0.9]
weight_decays = [0]          # 0.001, 0, 1, 0.1, 10, 0.0001
hidden_layouts = [[15, 15]]  # [30,20], [40,30], [10,15], [20,20]

# Best losses so far. np.inf is used because the np.Inf alias no longer
# exists in NumPy 2.
min_training_loss = np.inf
min_val_loss = np.inf
min_test_loss = np.inf

# Best configuration per criterion. They stay None when no configuration
# yields a loss below infinity (e.g. NaN losses), so the report below cannot
# fail with a NameError.
min_alpha = momentum = wd_t = n_hid_t = None
min_alpha_val = momentum_val = wd_v = n_hid_v = None
min_alpha_test = momentum_test = wd_test = n_hid_test = None

i = 0
for alpha, mom, wd, n_hid in itertools.product(alphas, momenta,
                                               weight_decays, hidden_layouts):
    i += 1
    print('==================== Test n°',i,' alpha=',alpha,' mom=',mom,' wd=',wd,' n_hid=',n_hid)
    sizes = np.concatenate([n_input, n_hid, n_output])
    # NOTE(review): the unscaled X_train / X_test frames (transposed) are
    # passed here, not the standardised Xnet_* arrays -- confirm this is the
    # input NNet.build_model expects.
    model, losses = NNet.build_model(X_train.T, Y_train, wd, sizes, type_layer, 'least_square',
                                     max_iter, alpha, mom,
                                     do_early_stopping, mini_batch_coef, threshold,
                                     print_info=True, train_test_split=False,
                                     X_test=X_test.T, y_test=Y_test)
    # losses[0], losses[1] and losses[2] are tracked as the training,
    # validation and test loss respectively.
    if losses[0] < min_training_loss:
        min_training_loss = losses[0]
        min_alpha, momentum, wd_t, n_hid_t = alpha, mom, wd, n_hid
    if losses[1] < min_val_loss:
        min_val_loss = losses[1]
        min_alpha_val, momentum_val, wd_v, n_hid_v = alpha, mom, wd, n_hid
    if losses[2] < min_test_loss:
        min_test_loss = losses[2]
        min_alpha_test, momentum_test, wd_test, n_hid_test = alpha, mom, wd, n_hid

print(i, ' configurations have been tested' )

# Each section reports the configuration that minimised the corresponding
# loss; the loss labels now name the loss that is actually printed.
print('==== Training test')
print('Min alpha: ',min_alpha)
print('Momentum: ',momentum)
print('Min training loss: ',min_training_loss)
print('WD: ',wd_t)
print('n_hid: ',n_hid_t)

print('==== Validation test')
print('Min alpha: ',min_alpha_val)
print('Momentum: ',momentum_val)
print('Min val loss: ',min_val_loss)
print('WD: ',wd_v)
print('n_hid: ',n_hid_v)

print('==== Generalisation test')
print('Min alpha: ',min_alpha_test)
print('Momentum: ',momentum_test)
print('Min test loss: ',min_test_loss)
print('WD: ',wd_test)
print('n_hid: ',n_hid_test)

print('\nElapsed time:',time.time()-start)

In [None]:


# `gea` is used below but never imported in the visible notebook (its import
# is commented out in an earlier cell), so it is imported here.
# NOTE(review): the module presumably lives in ./libs, which an earlier cell
# appends to sys.path -- confirm.
import generic_evolutionary_algorithm as gea

E = gea.Evolution(50,
      args_pop=(['logistic'],  # activation function for hidden
                               # layers except the last one
                               # (which is linear positive)
                12,  # maximum number of nodes per layer
                3,    # maximum number of layers
                X_train.shape[1],     # number of inputs
                Y_train.shape[1],    # number of outputs
                X_train,        # inputs data
                Y_train,       # targets data
                gea.default_loss_func,
                gea.evaluation_NN_hybrid
                )
      )

trialErrors = E.execute(nb_changes=[10],    # int: number of changes to occur
     # for evolution / list: iterations in which changes
     # will occur (see doc)
     doublons=False,     # whether to keep duplicates or not
     # (see doc)
     similar=False,      # whether to keep similar neural
     # nets or not (see doc)
     loss_threshold=1e-3,
     echo_level=0)

In [None]:
# Imported locally so that this cell does not depend on a name (`imp`) bound
# in another cell; importlib.reload is the supported way to reload a module.
import importlib

importlib.reload(Annex)

# Load the challenge test set and apply the same preparation steps as for the
# training data: month conversion, 'insee' dummies, zero-filling of the five
# flux / rain columns, removal of the 'insee' and 'date' columns.
df_TEST = Annex.load_test_set()
df_TEST = Annex.convert_month_to_int(df_TEST)
df_dummies = pd.get_dummies(df_TEST[['insee']])
df_TEST_full_qtt = pd.concat([df_TEST, df_dummies], axis=1)
zero_filled_cols = ['flir1SOL0', 'fllat1SOL0', 'flsen1SOL0', 'flvis1SOL0', 'rr1SOL0']
df_TEST_full_qtt[zero_filled_cols] = df_TEST_full_qtt[zero_filled_cols].fillna(0)
df_TEST_full_qtt = df_TEST_full_qtt.drop(['insee', 'date'], axis=1)

# The scaler and the network were fitted on the columns of X_train, in that
# order. A real feature missing from the test set is reported explicitly
# instead of surfacing later as an obscure shape error.
missing_features = [col for col in X_train.columns
                    if col not in df_TEST_full_qtt.columns
                    and not str(col).startswith('insee_')]
if missing_features:
    raise ValueError("Test set lacks training features: %s" % missing_features)

# Align the test columns with the training ones: same set, same order.
# 'insee_*' indicator columns absent from the test set are created and filled
# with 0; columns unknown to the training set are dropped.
df_TEST_full_qtt = df_TEST_full_qtt.reindex(columns=X_train.columns, fill_value=0)

# NOTE(review): rows are never dropped here, so any NaN left in the other
# columns reaches the scaler and the model -- confirm the test set has none.
X_TEST = scaler.transform(df_TEST_full_qtt)
Y_PRED = nnetOpt.predict(X_TEST)

## Submission

In [None]:
# Hand the test-set predictions to Annex.generate_submission_file together
# with the target file name.
# NOTE(review): presumably writes the CSV submission to disk -- confirm in Annex.
Annex.generate_submission_file('submission_17nov2017_10h30.csv', Y_PRED)