# Data Analysis

In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.sparse import *
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import pandas as pd
import csv
from keras.regularizers import l2, activity_l2
from keras.models import Sequential, model_from_json
from keras.models import load_model
from keras.optimizers import SGD
from keras.optimizers import adam
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Model
from keras.constraints import maxnorm
from keras.wrappers.scikit_learn import KerasClassifier


Using TensorFlow backend.


## GloVe Embedding

In [2]:
# Reduced GloVe sample (embeddings + labels); the grid searches below are run on a split of it.
X = np.load('tweets_emb_nltk.npy')
y = np.load('tweets_sol_nltk.npy')
print(X.shape)
print(y.shape)

# Full GloVe data set (embeddings + labels).
X_full = np.load('tweets_emb_nltk_full.npy')
y_full = np.load('tweets_sol_nltk_full.npy')

# Embeddings of the test data to hand in (no label array is loaded for them).
X_test = np.load('tweets_emb_nltk_test.npy')

(199715, 20)
(199715,)


In [3]:
##--- Step 1: standardise every GloVe feature to zero mean / unit variance.
# The reduced sample is standardised on its own.
X1 = preprocessing.scale(X)

# The full training set and the prediction set must go through ONE transformation:
# standardising X_test with its own mean/std would place it in a different feature
# space from the data a model is fitted on. The scaler is therefore fitted on X_full
# only and then applied to both arrays. StandardScaler uses the same statistics as
# preprocessing.scale, so X1_full is unchanged by this.
glove_scaler = preprocessing.StandardScaler().fit(X_full)
X1_full = glove_scaler.transform(X_full)
X1_test = glove_scaler.transform(X_test)

In [4]:
## Keep 40% of the standardised sample as an even smaller training set (test_size=0.6).
# The remaining 60% is bound to X_holdout / y_holdout instead of X_test / y_test so that
# the split does not overwrite X_test, the prediction set loaded from disk.
X_train, X_holdout, y_train, y_holdout = train_test_split(X1, y, test_size=0.6, random_state=1)
print(X_train.shape)
print(y_train.shape)

(79886, 20)
(79886,)


### Neural Net

#### Grid Search

In [5]:
### Model builder for the GloVe embedding (network trained with SGD)
def create_model_glove(neurons=15,neurons2=5,
                     init_mode='he_uniform',activation='sigmoid',
                     learn_rate=0.1,momentum=0.5,dropout_rate=0.3,
                     weight_constraint=0, weight_regularizer=0.0001):
    """Build and compile the two-hidden-layer network for the 20-feature GloVe input.

    Every argument is a hyper-parameter, so the function can be used as the
    ``build_fn`` of a KerasClassifier and tuned with GridSearchCV.

    Args:
        neurons: units in the first hidden layer.
        neurons2: units in the second hidden layer.
        init_mode: name of the Keras weight initialiser applied to every layer.
        activation: activation of the two hidden layers.
        learn_rate: SGD learning rate.
        momentum: SGD momentum.
        dropout_rate: dropout applied after the first hidden layer.
        weight_constraint: max-norm bound on the first layer's weights; 0 (the
            default), None or a negative value means "no constraint".
        weight_regularizer: L2 penalty coefficient on the first layer's weights.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, using
        binary cross-entropy loss and reporting accuracy.
    """
    # Only build a max-norm constraint for a positive bound, so the default of 0
    # keeps the first layer unconstrained.
    has_constraint = weight_constraint is not None and weight_constraint > 0
    constraint = maxnorm(weight_constraint) if has_constraint else None

    model = Sequential()
    # The first layer must declare the number of input features (input_dim).
    # init_mode is passed through so that a search over initialisers has an effect.
    model.add(Dense(neurons, input_dim=20, init=init_mode, activation=activation,
                    W_regularizer=l2(weight_regularizer), W_constraint=constraint))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons2, init=init_mode, activation=activation))
    # binary_crossentropy expects a probability in (0, 1), so the output unit is
    # always a sigmoid, whatever activation is chosen for the hidden layers.
    model.add(Dense(1, init=init_mode, activation='sigmoid'))

    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# Grid search over the SGD learning rate and momentum (GloVe network)
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=20, verbose=0)

# candidate values of the two tuned hyper-parameters
learn_rate = [0.3, 0.5, 1]
momentum = [0, 0.1, 0.5]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the activation function (GloVe network).
# build_fn must name a builder that exists in this notebook: create_model_glove is the
# GloVe builder and exposes the `activation` hyper-parameter being tuned here.
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=100, verbose=0)
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(activation=activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
# summarize results: best setting, then mean (std) of the CV score per activation
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



In [None]:
### Grid search over the size of the hidden layers (GloVe network)
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=20, verbose=0)

# layer 1 is varied, layer 2 is held at a single value
neurons = [5, 10, 15, 20]
neurons2 = [5]
param_grid = {'neurons': neurons, 'neurons2': neurons2}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Plot the mean cross-validated score (+/- one std) against the size of layer 1.
# Everything is read from cv_results_ of the fitted search: grid_scores_ is deprecated
# in scikit-learn and removed in later releases, and taking the x values from the
# evaluated parameter sets keeps them aligned with the scores without hard-coding
# the number of grid points or rebinding the `neurons` list.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])
layer1_sizes = np.asarray([p['neurons'] for p in cv_results['params']])
print(layer1_sizes.shape)
# the label gives plt.legend() an entry to show
plt.scatter(layer1_sizes, scores, label='Mean CV score')
plt.errorbar(layer1_sizes, scores, yerr=stds, linestyle="None")
plt.legend()
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()


In [None]:
# Plot the mean cross-validated score (+/- one std) against the size of layer 1.
# Sizes are derived from the fitted search instead of a hard-coded reshape(5, 1),
# which raises ValueError whenever the grid does not hold exactly five values, and
# cv_results_ replaces grid_scores_ (deprecated in scikit-learn, removed later).
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])

layer1_sizes = np.asarray([p['neurons'] for p in cv_results['params']])
print(layer1_sizes.shape)
# the label gives plt.legend() an entry to show
plt.scatter(layer1_sizes, scores, label='Mean CV score')
plt.errorbar(layer1_sizes, scores, yerr=stds, linestyle="None")
plt.legend()
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()

In [None]:
## Grid search over the L2 regulariser strength, dropout held at 0.3 (GloVe network)
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=20, verbose=0)

# weight_constraint is defined here but is not part of the searched grid
weight_constraint = [0]
weight_regularizer = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
dropout_rate = [0.3]
param_grid = {'dropout_rate': dropout_rate, 'weight_regularizer': weight_regularizer}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Plot the mean cross-validated score (+/- one std) against the L2 coefficient.
# Scores, deviations and x values all come from cv_results_ of the fitted search:
# grid_scores_ is deprecated in scikit-learn and removed in later releases, and no
# grid size is hard-coded. The `neurons` variable is no longer overwritten with
# regulariser values.
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])

l2_values = np.asarray([p['weight_regularizer'] for p in cv_results['params']])
# the label gives plt.legend() an entry to show
plt.scatter(l2_values, scores, label='Mean CV score')
plt.errorbar(l2_values, scores, yerr=stds, linestyle="None")
plt.legend()
plt.xlabel('L2 Regularizer parameter')
plt.ylabel('Mean score')
plt.grid()
# 'nonposy' is a y-axis option and does not belong in an xscale call; the plain
# call is enough as long as the searched coefficients are strictly positive.
plt.xscale('log')
plt.show()


In [None]:
## Grid search over the weight initialisation
# build_fn must name a builder that exists in this notebook; this cell sits in the
# GloVe section and fits on its X_train, so the GloVe builder (which takes an
# `init_mode` argument) is used.
model = KerasClassifier(build_fn=create_model_glove, nb_epoch=10, batch_size=100, verbose=0)
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)
# summarize results: best setting, then mean (std) of the CV score per initialiser
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### Word2Vec

#### Neural Nets

In [None]:
# Word2Vec data: from this cell on X, y, X_full, y_full and X_test are rebound to
# the Word2Vec arrays and no longer refer to the GloVe arrays loaded earlier.
X = np.load('X_w2v.npy')
y = np.load('y_w2v.npy')
print(X.shape)
print(y.shape)

# full Word2Vec data set
X_full = np.load('X_w2v_full.npy')
y_full = np.load('y_w2v_full.npy')

# Word2Vec features of the test data
X_test = np.load('X_test_w2v.npy')

In [None]:
# Recode both label arrays as 0/1 integers: a value >= 0.5 becomes 1, anything else 0.
y = np.array([1 if label >= 0.5 else 0 for label in y])
y_full = np.array([1 if label >= 0.5 else 0 for label in y_full])

In [None]:
##---- Step 1: PCA
# Project the Word2Vec features onto 50 principal components (full SVD solver).
# The projection is fitted on the reduced sample X only and then applied to X_test.
pca = PCA(n_components=50, svd_solver='full').fit(X)
X1 = pca.transform(X)
X1_test = pca.transform(X_test)

# fraction of the variance captured by the kept components, then the projected shape
print(sum(pca.explained_variance_ratio_))
print(X1.shape)

In [None]:
# Apply the already fitted PCA projection to the full Word2Vec data set.
# NOTE: this rebinds X1_full, which previously held the standardised GloVe features.
X1_full=pca.transform(X_full)

In [None]:
##--- Step 2: standardise the PCA components to zero mean / unit variance.
# The reduced sample is standardised on its own.
X2 = preprocessing.scale(X1)

# The full training projection and the prediction set must go through ONE
# transformation: standardising X1_test with its own mean/std would place it in a
# different feature space from the data a model is fitted on. The scaler is fitted
# on X1_full only and applied to both arrays. StandardScaler uses the same
# statistics as preprocessing.scale, so X2_full is unchanged by this.
w2v_scaler = preprocessing.StandardScaler().fit(X1_full)
X2_full = w2v_scaler.transform(X1_full)
X2_test = w2v_scaler.transform(X1_test)

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; whether that is enough to make the
# Keras/TensorFlow training runs repeatable is not established by this notebook.
seed = 3
np.random.seed(seed)

In [None]:
# Keep 40% of the standardised Word2Vec sample as the training set (test_size=0.6).
# The remaining 60% is bound to X_holdout / y_holdout instead of X_test / y_test so that
# the split does not overwrite X_test, the prediction set loaded from disk.
X_train, X_holdout, y_train, y_holdout = train_test_split(X2, y, test_size=0.6, random_state=1)
print(X_train.shape)
print(y_train.shape)

In [None]:
### Model builder for the Word2Vec features (network trained with SGD)
def create_model_w2v(neurons=1000,neurons2=100,  # "20 and 10" in the original note - presumably earlier values tried
                     init_mode='he_uniform',activation='sigmoid',
                     learn_rate=0.1,momentum=0.5,dropout_rate=0.2,
                     weight_constraint=2,weight_regularizer=2):
    """Build and compile the two-hidden-layer network for the 50-component Word2Vec input.

    Every argument is a hyper-parameter, so the function can be used as the
    ``build_fn`` of a KerasClassifier and tuned with GridSearchCV.

    Args:
        neurons: units in the first hidden layer.
        neurons2: units in the second hidden layer.
        init_mode: name of the Keras weight initialiser applied to every layer.
        activation: activation of the two hidden layers.
        learn_rate: SGD learning rate.
        momentum: SGD momentum.
        dropout_rate: dropout applied after the first hidden layer.
        weight_constraint: max-norm bound on the first layer's weights; 0, None
            or a negative value means "no constraint".
        weight_regularizer: L2 penalty coefficient on the first layer's weights.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, using
        binary cross-entropy loss and reporting accuracy.
    """
    # A max-norm bound of 0 would pin the first layer's weights at zero, so a
    # non-positive value is treated as "no constraint" rather than passed on.
    has_constraint = weight_constraint is not None and weight_constraint > 0
    constraint = maxnorm(weight_constraint) if has_constraint else None

    model = Sequential()
    # The first layer must declare the number of input features (input_dim).
    # init_mode is passed through so that a search over initialisers has an effect.
    model.add(Dense(neurons, input_dim=50, init=init_mode, activation=activation,
                    W_regularizer=l2(weight_regularizer), W_constraint=constraint))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons2, init=init_mode, activation=activation))
    # binary_crossentropy expects a probability in (0, 1), so the output unit is
    # always a sigmoid, whatever activation is chosen for the hidden layers.
    model.add(Dense(1, init=init_mode, activation='sigmoid'))

    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
## Grid search over dropout rate, max-norm weight constraint and L2 strength (Word2Vec network)
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)

# candidate values of the three tuned hyper-parameters
weight_constraint = [0, 1, 2]
dropout_rate = [0, 0.3]
weight_regularizer = [0, 1, 2, 3]
param_grid = {
    'dropout_rate': dropout_rate,
    'weight_constraint': weight_constraint,
    'weight_regularizer': weight_regularizer,
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the SGD learning rate and momentum (Word2Vec network)
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=10, batch_size=20, verbose=0)

# candidate values of the two tuned hyper-parameters
learn_rate = [0.3, 0.5, 1]
momentum = [0, 0.1, 0.5]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
### Grid search over the size of both hidden layers (Word2Vec network), 2-fold CV
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)

# two candidate sizes per hidden layer
neurons = [20, 1000]
neurons2 = [5, 100]
param_grid = {'neurons': neurons, 'neurons2': neurons2}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Plot the mean cross-validated score (+/- one std) against the size of layer 1,
# with one series per layer-2 size.
# Sizes come from the fitted search instead of a hard-coded reshape(7, 1), which
# raises ValueError unless exactly seven results exist, and cv_results_ replaces
# grid_scores_ (deprecated in scikit-learn, removed in later releases).
cv_results = grid_result.cv_results_
scores = np.asarray(cv_results['mean_test_score'])
stds = np.asarray(cv_results['std_test_score'])

layer1_sizes = np.asarray([p['neurons'] for p in cv_results['params']])
layer2_sizes = np.asarray([p['neurons2'] for p in cv_results['params']])
print(layer1_sizes.shape)
for n2 in np.unique(layer2_sizes):
    # boolean mask selecting the grid points that share this layer-2 size
    same_n2 = layer2_sizes == n2
    plt.errorbar(layer1_sizes[same_n2], scores[same_n2], yerr=stds[same_n2],
                 fmt='o', label='N2: ' + str(n2))
plt.legend()
plt.xlabel('Neurons on layer 1')
plt.ylabel('Mean score')
plt.grid()
plt.show()


In [None]:
## Grid search over dropout rate and max-norm weight constraint (Word2Vec network).
## This search is fitted on the whole standardised sample (X2, y), not on X_train.
model = KerasClassifier(build_fn=create_model_w2v, nb_epoch=5, batch_size=20, verbose=0)

# candidate values of the two tuned hyper-parameters
weight_constraint = [2, 3]
dropout_rate = [0.2, 0.3]
param_grid = {'dropout_rate': dropout_rate, 'weight_constraint': weight_constraint}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X2, y)

# best combination first, then mean (std) of the CV score for every combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (grid_result.cv_results_[key]
                       for key in ('mean_test_score', 'std_test_score', 'params'))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Create output

In [None]:
def save_model(model_tt):
    """Write a fitted model to disk as architecture (JSON) plus weights (HDF5).

    Arguments: model_tt (object whose ``.model`` attribute provides ``to_json()``
               and ``save_weights()``, e.g. a fitted KerasClassifier)

    Writes 'model_architecture.json' and 'model_weights.h5' in the current
    working directory, replacing existing files.
    """
    # saving model architecture; the with-block guarantees the file is flushed and closed
    json_model = model_tt.model.to_json()
    with open('model_architecture.json', 'w') as architecture_file:
        architecture_file.write(json_model)
    # saving weights
    model_tt.model.save_weights('model_weights.h5', overwrite=True)


def load_model():
    """Rebuild the model stored in 'model_architecture.json' / 'model_weights.h5'.

    NOTE: this definition shadows the ``load_model`` imported from keras.models
    at the top of the notebook.

    Returns: the model, compiled with binary cross-entropy loss and the 'adam'
             optimizer (no extra metrics are requested).
    """
    # loading model architecture; the with-block closes the file once it has been read
    with open('model_architecture.json') as architecture_file:
        model = model_from_json(architecture_file.read())
    model.load_weights('model_weights.h5')
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    Each row holds int(id) and int(prediction) under the header 'Id,Prediction';
    ids and y_pred are paired positionally, and pairing stops at the shorter one.
    """
    # newline='' lets the csv module control the line endings itself, as its
    # documentation requires; without it extra blank rows can appear on platforms
    # that translate '\n' when writing text files.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [None]:
# Fit the final model on the full, PCA-reduced and standardised Word2Vec data.
# The following cells score and predict on X2_full / X2_test (50 columns), so the
# network has to be the 50-input Word2Vec builder trained on X2_full; the GloVe
# builder expects 20 inputs, and X1_full holds the unscaled PCA output by this point.
mybestmodel = KerasClassifier(build_fn=create_model_w2v)
mybestmodel.fit(X2_full, y_full,validation_split=0.2, nb_epoch=50, batch_size=20)  #can add verbose=0 for no wait bar
save_model(mybestmodel)

In [None]:
### Score of the fitted model on the full data set
# NOTE(review): no cross-validation is performed here; score() is called once on
# X2_full / y_full.
mybestmodel.score(X2_full,y_full)

In [None]:
# Rebuild the model from the files on disk and evaluate it on the full data set.
# NOTE: this rebinds mybestmodel from the KerasClassifier wrapper to the object
# returned by load_model().
mybestmodel=load_model()
mybestmodel.evaluate(X2_full,y_full)

In [None]:
# Predict on the PCA-reduced, standardised test features.
y_pred = mybestmodel.predict(X2_test)

In [None]:
# Turn every prediction into a submission label: 1 when it is >= 0.5, otherwise -1.
y_rendu = [1 if score >= 0.5 else -1 for score in y_pred]

OUTPUT_PATH = 'prediction.csv'
# ids are 1-based and follow the order of the predictions
ids_test = list(range(1, len(y_rendu) + 1))
create_csv_submission(ids_test, y_rendu, OUTPUT_PATH)