# Convolutional NN

In [None]:
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D
import numpy as np
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.sparse import *
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from keras.optimizers import SGD
from keras.models import Model, Sequential
from keras.constraints import maxnorm
from keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier
import pandas as pd
import csv
from keras.models import model_from_json

In [None]:
## ------ Small sample of the training data
X = np.load('tweets_emb_nltk_conv.npy')
y = np.load('tweets_sol_nltk.npy')

## ------ Full training data
X_full = np.load('tweets_emb_nltk_conv_full.npy')
y_full = np.load('tweets_sol_nltk_full.npy')

## ------ Test data for which predictions have to be handed in
X_test = np.load('tweets_emb_nltk_conv_test.npy')

## --- Step 1 (feature scaling, currently disabled)
#X1_full=preprocessing.scale(X_full)
#X1_test=preprocessing.scale(X_test)

# NOTE(review): pickle.load can execute arbitrary code; only load a vocabulary
# file that comes from a trusted source.
with open('vocab_nltk.pkl', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

# Load the embedding matrix and prepend a length-1 axis, giving (1, rows, cols).
# Presumably this is so it can be passed as the Embedding layer's `weights`
# (a sequence holding one matrix) -- confirm against the Keras version in use.
embedding_weights = np.load('embeddings_nltk.npy')[np.newaxis, :, :]
print(embedding_weights.shape)
# Alternative kept from the original: discard the loaded weights.
#embedding_weights= None

print(X_full.shape)
print(y_full.shape)


In [None]:
## Scan the full data set for the longest token sequence and print the index
## of every tweet that is longer than 55 tokens.
longest_seen = 0
for idx in range(len(y_full)):
    n_tokens = len(X_full[idx])
    if n_tokens > 55:
        print(idx)
    longest_seen = max(longest_seen, n_tokens)

## The measured maximum is not used: the length is fixed at 56 because only a
## few tweets were longer than that.
sequence_length = 56
print(sequence_length)


In [None]:
## Bring every sequence to exactly `sequence_length` entries. With the default
## pad_sequences settings, shorter sequences get zeros added at the beginning
## and longer ones are cut.
X2_full, X2, X2_test = (
    sequence.pad_sequences(tweets, maxlen=sequence_length)
    for tweets in (X_full, X, X_test)
)

In [None]:
# Model hyperparameters
np.random.seed(2)  # seed NumPy's global random number generator

embedding_dim = 20         # width of each embedded token
filter_sizes = (3, 4)      # one convolution branch is built per filter length
num_filters = 3            # filters per convolution branch
dropout_prob = (0.7, 0.8)  # (after the embedding, after the hidden dense layer)
hidden_dims = 100          # units in the hidden dense layer

# Training parameters
batch_size, val_split = 32, 0.1

In [None]:
## Build the model
# graph_in is the input of the convolutional sub-graph: one embedded tweet of
# shape (sequence_length, embedding_dim), shared by every filter/pool branch.
graph_in = Input(shape=(sequence_length, embedding_dim))

## One convolution -> max-pooling -> flatten branch per filter length
convs = []
for fsz in filter_sizes:
    branch = Convolution1D(nb_filter=num_filters,
                           filter_length=fsz,
                           border_mode='valid',
                           activation='relu',
                           subsample_length=1)(graph_in)
    branch = MaxPooling1D(pool_length=2)(branch)
    convs.append(Flatten()(branch))

# Several branches are concatenated; a single branch is used as it is.
out = Merge(mode='concat')(convs) if len(filter_sizes) > 1 else convs[0]

graph = Model(input=graph_in, output=out)

# Full classifier: embedding layer, dropout, the filter/pool sub-graph, a
# hidden dense layer, and a single sigmoid unit for the binary classification.
model = Sequential()
for layer in (
    Embedding(len(vocabulary), embedding_dim, input_length=sequence_length,
              weights=embedding_weights),
    Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)),
    graph,
    Dense(hidden_dims),
    Dropout(dropout_prob[1]),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
):
    model.add(layer)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])



In [None]:
## Helpers to persist the trained model on disk
def save_model(model):
    """
    Saves a model's architecture and weights in the current working directory
    Arguments: model (object providing `to_json()` and
               `save_weights(path, overwrite=...)`)

    The architecture is written as JSON to 'model_architecture.json' and the
    weights to 'model_weights.h5'; an existing weights file is overwritten.
    """
    # saving model
    json_model = model.to_json()
    # The context manager closes the file even if the write fails.
    with open('model_architecture.json', 'w') as json_file:
        json_file.write(json_model)
    # saving weights
    model.save_weights('model_weights.h5', overwrite=True)

def load_model():
    """
    Rebuilds a compiled model from the files in the current working directory
    Reads the architecture from 'model_architecture.json' and the weights from
    'model_weights.h5', then compiles with binary cross-entropy loss, the
    'rmsprop' optimizer and the accuracy metric.
    Returns: the compiled model
    """
    # loading model
    # The context manager closes the architecture file once it has been read.
    with open('model_architecture.json') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights('model_weights.h5')
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [None]:
## Train the model; pass other arrays than X2 / y to fit on different data.
# A fraction `val_split` of the data is kept out of training, and the metrics
# on that part are shown at the end of each epoch.
fit_options = dict(
    batch_size=batch_size,
    nb_epoch=50,
    validation_split=val_split,
    verbose=1,
)
model.fit(X2, y, **fit_options)
save_model(model)

In [None]:
# Rebuild the model saved in 'model_architecture.json' / 'model_weights.h5'.
# NOTE(review): the cells visible after this one keep using `model`, not
# `loaded` -- confirm which of the two is meant to be evaluated.
loaded=load_model()

In [None]:
## Second validation on the full data set (only meaningful when the model was
## not trained on this data).
score = model.evaluate(X2_full, y_full, verbose=1)
metric_name = model.metrics_names[1]
metric_percent = score[1] * 100
print("%s: %.2f%%" % (metric_name, metric_percent))

#### Submission

In [None]:
# Run the model on the padded test tweets. The last layer is a sigmoid, so
# these are raw scores rather than class labels; they are thresholded at 0.5
# further down before being written to the submission file.
y_pred = model.predict(X2_test)

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file starts with an 'Id,Prediction' header row. Ids and predictions
    are paired in order and converted with int(); if the two sequences differ
    in length, the extra entries of the longer one are ignored.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # every row is followed by an empty line on Windows.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [None]:
# Turn each predicted score into a label: 1 when it is at least 0.5, else -1.
y_rendu = [1 if score >= 0.5 else -1 for score in y_pred]

OUTPUT_PATH = 'prediction.csv'
# Submission ids are consecutive and start at 1.
ids_test = list(range(1, len(y_rendu) + 1))
create_csv_submission(ids_test, y_rendu, OUTPUT_PATH)