In [None]:
import os

import keras
import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Embedding, Dropout, Conv1D, Conv2D, MaxPooling1D, Dense, Merge, Flatten
from keras.layers.merge import concatenate
from keras.models import Sequential
from keras.regularizers import l2
from sklearn import preprocessing
from sklearn.metrics import f1_score

In [None]:
# Optimizer identifier passed to `model.compile`.
my_optimizer = "adam"

# How many `<field>_d2v_<i>_<side>` columns are generated per text field and
# per side when the feature list is built.
N_VECTORIZATION = 15

In [None]:
# Relative directory prefix (with trailing slash) under which the CSV feature
# tables are stored.
path_to_data = "data/"

# Read the training and testing feature tables; each file name is appended
# directly to the directory prefix.
training_set, testing_set = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ("improved_training_set.csv", "improved_testing_set.csv")
)

## Preparation of the data

In [None]:
# Embedding-style columns, in this order:
#   description/source, description/target, title/source, title/target,
# each contributing N_VECTORIZATION columns named "<field>_d2v_<i>_<side>".
selected_features = [
    "{}_d2v_{}_{}".format(field, i, side)
    for field in ("description", "title")
    for side in ("source", "target")
    for i in range(N_VECTORIZATION)
]

# Hand-picked scalar columns appended after the embedding columns.
selected_features += [
    "common_neighbor",
    "same_cluster",
    "jaccard",
    "diff_in_bc",
    "diff_in_inlinks",
    "diff_in_year",
    "author_nb_common",
    "author_is_one_common",
    "common_classification",
    "title_is_one_common",
    "title_nb_common_word",
    "cos_similarity_title",
    "cos_similarity_description",
    # Spelling kept as-is: this is a column key, presumably matching the CSV
    # header -- verify against the data before "fixing" it.
    "target_eccentricty",
    "inlinks_target",
    "betweenness_author_target",
    "inlinks_author_target",
    "cos_similarity_tf_title",
    "cos_similarity_tf_description",
]

# The scaling, array-extraction and model cells index the data with
# `selected_features_global`. Bind that name to the same columns so those
# cells run on a fresh kernel instead of raising NameError.
selected_features_global = list(selected_features)

In [None]:
# Replace missing values with 0 in both tables before standardising.
training_set = training_set.fillna(0)
testing_set = testing_set.fillna(0)

# Standardise the selected columns to zero mean / unit variance.
# The scaler is fitted on the training table only and then reused for the
# testing table, so both go through the *same* transformation. Scaling each
# table with its own statistics would feed the network test features on a
# different scale from the one it was trained on.
# NOTE: `selected_features_global` must be defined before this cell runs; it is
# expected to name the same columns as `selected_features`.
feature_scaler = preprocessing.StandardScaler()
training_set[selected_features_global] = feature_scaler.fit_transform(training_set[selected_features])
testing_set[selected_features_global] = feature_scaler.transform(testing_set[selected_features])

In [None]:
# Number of leading rows of the training table held out for validation.
N_VALIDATION = 10000

# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and removed
# in pandas 1.0. Each matrix is materialised once and then sliced.
features_all = training_set[selected_features_global].values
labels_all = training_set["label"].values

# First N_VALIDATION rows -> validation, remaining rows -> training.
X_val, Y_val = features_all[:N_VALIDATION], labels_all[:N_VALIDATION]
X_tr, Y_tr = features_all[N_VALIDATION:], labels_all[N_VALIDATION:]

X_tst = testing_set[selected_features_global].values

# Drop the names that are no longer needed; only the arrays above are used
# from here on.
del features_all, labels_all
del training_set, testing_set

## Build Neural Network

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Predictions and labels are clipped to [0, 1] and rounded, so a prediction
    counts as positive when it rounds to 1. Precision and recall are computed
    over the whole batch and combined into their harmonic mean.

    Args:
        y_true: backend tensor of ground-truth labels.
        y_pred: backend tensor of predicted scores, same shape as `y_true`.

    Returns:
        Scalar backend tensor holding the F1 score of the batch. It is 0
        (not NaN) when the batch has no true positives, because every
        denominator is padded with `K.epsilon()`.
    """
    # Counts shared by precision and recall; computed once.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # Fraction of predicted positives that are correct.
    precision = true_positives / (predicted_positives + K.epsilon())
    # Fraction of actual positives that were found.
    recall = true_positives / (possible_positives + K.epsilon())

    # Epsilon guards the 0/0 case (precision == recall == 0), which would
    # otherwise turn the metric, and its running average, into NaN.
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [None]:
# Fully connected binary classifier: two 1500-unit ReLU layers, each followed
# by 50% dropout, and a single sigmoid output unit. The input width equals
# the number of selected feature columns.
model = Sequential([
    Dense(1500, activation="relu", input_dim=len(selected_features_global)),
    Dropout(0.5),
    Dense(1500, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])

In [None]:
# Binary cross-entropy loss with the configured optimizer; the custom
# batch-wise `f1` function is tracked as the only metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=my_optimizer,
    metrics=[f1],
)

# Print the layer-by-layer overview of the network.
model.summary()

## Classification

In [None]:
# Make sure the checkpoint directory exists before training starts; otherwise
# the first end-of-epoch save can fail after a full epoch of work.
checkpoint_dir = "modelsNN"
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Save the full model after every epoch (save_best_only=False), with the
# epoch number and the training/validation F1 embedded in the file name.
checkpoint = ModelCheckpoint(
    checkpoint_dir + "/weights_1500_1500.{epoch:02d}-{f1:.4f}-{val_f1:.4f}.hdf5",
    monitor='val_loss',
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# Train for 50 epochs in mini-batches of 128, evaluating on the held-out
# validation arrays after each epoch.
model.fit(
    X_tr,
    Y_tr,
    batch_size=128,
    epochs=50,
    verbose=1,
    validation_data=(X_val, Y_val),
    callbacks=[checkpoint],
)

In [None]:
# Scores returned by the network for the test samples.
predicted = model.predict(X_tst)

# Threshold at 0.5 to obtain hard 0/1 labels. The comparison is vectorised
# over the flattened scores instead of calling int() on one-element arrays row
# by row (deprecated in recent NumPy, and slow). The result is still a plain
# list of Python ints, one per test sample.
predicted = (np.asarray(predicted).ravel() > 0.5).astype(int).tolist()

In [None]:
# Single-column table of predicted labels; the default integer index is
# written out under the header "ID".
df_sub = pd.DataFrame({"category": predicted})

df_sub.to_csv(
    'output_NN.csv',
    index_label="ID",
    float_format='%.6f',
)
print("done")