In [1]:
import sys
import os
import pickle
import numpy as np
import pandas as pd
import random
import h5py
import csv
import json
import time
import string
import datetime as dt
import matplotlib.pyplot as plt
import re
random_state_number = 967898

In [2]:
import tensorflow as tf
print(tf.__version__)
import keras


2.2.0


Using TensorFlow backend.


In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  3


In [9]:
##Check if it runs on GPU

# Create some tensors
# Place tensors on default config (hopefully GPU)
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


# Data

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

### Load Data from Pickle

In [5]:
with open ('./Data_2labels_CIB4/df_test_clean.pickle','rb') as fichier:
    df_test = pickle.load(fichier)

with open ('./Data_2labels_CIB4/df_train_clean.pickle','rb') as fichier:
    df_train = pickle.load(fichier)


In [6]:
print("Training data frame : ..." )
print(df_train['text'].head())
print('\n')
print("Test data frame : ..." )
print(df_test['text'].head())

Training data frame : ...
2    circuit exciter installation lampes a decharge...
4    procede d'application d'une couche protectrice...
5    prise controle circuits integres prise contrôl...
6    dispositif formant ecran electromagnetique dis...
7    ballast electronique lampe luminescente a gaz ...
Name: text, dtype: object


Test data frame : ...
0    tomodensitomètre résolution variable capacité ...
1    matériau composite matériau composite (1) comp...
2    distribution intérieur d'un signal large bande...
3    système drone distribué drone permettre d'obte...
4    procede d'imagerie ameliore destine a matieres...
Name: text, dtype: object


In [7]:
print("There are {} patents for testing".format(len(df_test)))
print("Number of class represented in test Data for CIB_1 only :{}".format(df_test['CIB_1'][:100000].nunique()) + '\n')

print("There are {} patents for training ".format(len(df_train)))
print("Number of class represented in training Data for CIB_1 only :{}".format(df_train['CIB_1'][:1000000].nunique()) )


There are 91519 patents for testing
Number of class represented in test Data for CIB_1 only :435

There are 3751492 patents for training 
Number of class represented in training Data for CIB_1 only :425


### Format Data

In [30]:
def seqAndPad(text,max_length, tokenizer):
    
    sequences = tokenizer.texts_to_sequences(text)
    del text
    data = pad_sequences(sequences, maxlen=max_length,
                         padding='post', truncating='post')
    del sequences
    return(data)

def truncate(text, max_length):
    for o, doc in enumerate(text):
        text[o]= " ".join(text[o].split()[:max_length])
    return(text)

def convertLabelsDict():
    CIBtoLabel={}
    id=0
    for entry in df_train['CIB_1']:
        if entry not in CIBtoLabel:
            CIBtoLabel[entry]=id
            id+=1
    for entry in df_test['CIB_1']:
        if entry not in CIBtoLabel:
            CIBtoLabel[entry]=id
            id+=1
    return(CIBtoLabel)

def formatTextData (df_train, df_test,max_length, max_num_words):
    '''
    max_length : tronquer les abstracts
    max_num_words : dans le tokenizer, si un mot est trop fréquent, il est supprimé
    '''
    text_train = df_train['text'].tolist()
    text_test = df_test['text'].tolist()
    
    print("Truncating text data to max_length")
    
    text_train = truncate(text_train[:1500000], max_length)
    text_test = truncate(text_test, max_length)
    
    print("Tokenizing data...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(text_train)
    print("Tokenizing done")
    print("Sequencing and padding...")
    text_train = seqAndPad(text_train, max_length, tokenizer)
    text_test = seqAndPad(text_test, max_length, tokenizer)
    print("Sequencing and padding done")
    
    CIBtoLabel = convertLabelsDict()
    
    y_train = df_train['CIB_1'].tolist()[:1500000]
    y_test = df_test['CIB_1'].tolist()
    
    for k in range(len(y_train)):
        y_train[k]= CIBtoLabel[y_train[k]]
    
    for k in range(len(y_test)):
        y_test[k]= CIBtoLabel[y_test[k]]
    
    y_train = tf.keras.utils.to_categorical(y_train, 435)
    y_test = tf.keras.utils.to_categorical(y_test, 435)
    
    return(text_train, y_train, text_test, y_test, tokenizer)
    



In [31]:
x_train, y_train, x_test, y_test, tokenizer = formatTextData(df_train,
                                                             df_test, 
                                                             max_length = 1000,
                                                             max_num_words = 50000)

Truncating text data to max_length
Tokenizing data...
Tokenizing done
Sequencing and padding...
Sequencing and padding done


In [32]:
WORD_EMB_SIZE = 300
MAX_TEXT_LEN = 1000

In [33]:
print(y_train.shape)

(1500000, 435)


In [34]:
print(y_test.shape)

(91519, 435)


In [20]:
# from sklearn.externals import joblib
# with open (os.path.join('./Data_2labels_CIB4/', 'x_train_y.sav') , 'wb') as save:
#    joblib.dump( (x_train,y_train) , save)
# with open (os.path.join('./Data_2labels_CIB4/', 'x_test_y.sav') , 'wb') as save:
#    joblib.dump( (x_test,y_test) , save)



# Embedding Layer

## Prepairing the Embedding layer

We compute an index mapping words to known pre-trained embeddings, by parsing the data dump of pre-trained embeddings

In [35]:
vocab_size=len(tokenizer.word_index) + 1
print('Vocab_size is {}'.format(vocab_size))

Vocab_size is 403676


In [36]:
import gzip
import codecs
embeddings_index = {}
with codecs.getreader("utf-8")(gzip.open('./Embeddings/cc.fr.300.vec.gz', 'rb')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        emb = np.asarray( values[1:], dtype='float32')
        embeddings_index[word] = emb
print('Found {} word vectors'.format(len(embeddings_index)))

Found 2000000 word vectors


In [37]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, WORD_EMB_SIZE))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        #words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector
print('Embedding_matrix loaded')
print('Shape {}'.format(embedding_matrix.shape))

Embedding_matrix loaded
Shape (403676, 300)


In [69]:
#Case of OOM
embedding_matrix = embedding_matrix[:,:200]
print('Shape {}'.format(embedding_matrix.shape))

Shape (318134, 200)


# Model 

In [21]:

import keras
import tensorflow as tf

from keras import backend as K

from keras.engine import Layer, InputSpec, InputLayer

from keras.models import Model, Sequential

from keras.layers import Dropout, Embedding, concatenate
from keras.layers import Conv1D, MaxPool1D, Conv2D, MaxPool2D, ZeroPadding1D
from keras.layers import Dense, Input, Flatten, BatchNormalization
from keras.layers import Concatenate, Dot, Concatenate, Multiply, RepeatVector
from keras.layers import Bidirectional, TimeDistributed
from keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute

from keras.layers.core import Reshape, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
from keras.constraints import maxnorm
from keras.regularizers import l2



In [38]:
from tensorflow.keras import backend as K

#from tensorflow.keras.engine import Layer, InputSpec, InputLayer

from tensorflow.keras.models import Model, Sequential

from tensorflow.keras.layers import Dropout, Embedding, concatenate
from tensorflow.keras.layers import Conv1D, MaxPool1D, Conv2D, MaxPool2D, ZeroPadding1D
from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization
from tensorflow.keras.layers import Concatenate, Dot, Concatenate, Multiply, RepeatVector
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute

#from tensorflow.keras.layers.core import Reshape, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
#from tensorflow.keras.constraints import maxnorm
from tensorflow.keras.regularizers import l2

### K-Max Pooling Layer

<a href=https://github.com/bicepjai/Deep-Survey-Text-Classification/blob/master/deep_models/paper_02_cnn_sent_model/utils.py> Lien ici </a>

et <a href=https://www.reddit.com/r/learnmachinelearning/comments/9hes2q/code_question_1d_convolution_layer_in_keras_with/> la </a>

In [73]:
from utils import KMaxPooling

### Folding layer


In [74]:
from utils import Folding

### Model definition

CNN with Dynamic k-Max Pooling with sentences

In [39]:
'''Grid Search Parameters'''

from tensorboard.plugins.hparams import api as hp
HP_FILTERS = hp.HParam('number_of_filter_channel', hp.Discrete([64,128,256]))
HP_POOLING_UNITS = hp.HParam('pooling_units', hp.Discrete([100,500,1000]))
HP_HIDDEN_DIMS = hp.HParam('hidden_dims', hp.Discrete([32,64,128]))
#HP_EMBEDDING_TRAINABLE = hp.HParam('embedding_trainable' , hp.Discrete(['True', 'False'])) # On ne va pas s'en occuper dans un premier temps
                                                                                                # Selon la publication, trainable = False
METRIC_CAT_ACCURACY = 'categorical_accuracy'

In [40]:
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
    hparams=[HP_FILTERS, HP_POOLING_UNITS, HP_HIDDEN_DIMS],
    metrics=[hp.Metric(METRIC_CAT_ACCURACY, display_name='Categorical_accuracy')],
    )

In [41]:
""" Parameters 
FILTERS = 128
pooling_units = 100  #le nombre de features qu'on garde par pooling, i.e après pooling, chaque vecteur est de longueur 100
hidden_dims= 32"""

def create_model(hparams, output_dims = 435, Embedding_trainable = False, MAX_TEXT_LEN = 1000):

    text_seq_input = Input(shape=(MAX_TEXT_LEN,), dtype='int32')
    text_embedding = Embedding(vocab_size, WORD_EMB_SIZE, input_length=MAX_TEXT_LEN,
                                weights=[embedding_matrix], trainable=Embedding_trainable)(text_seq_input)
    text_dropout = Dropout(0.25)(text_embedding)

    filter_sizes = [2,3,4,8,10]
    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=hparams[HP_FILTERS], kernel_size=filter_size, padding='same', activation='relu')(text_dropout)
        POOL_SIZE = l_conv.get_shape()[-2] // hparams[HP_POOLING_UNITS]
        l_pool = MaxPool1D(pool_size=POOL_SIZE, strides =2, padding='valid')(l_conv)   #Dynamic pooling
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)
    l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
    # since the text is too long we are maxooling over 100
    # and not GlobalMaxPool1D
    l_pool1 = MaxPool1D(100)(l_cov1)
    l_flat = Flatten()(l_pool1)
    #l_flat = Flatten()(l_merge)
    l_hidden = Dense(hparams[HP_HIDDEN_DIMS], activation ='relu')(l_flat)
    l_hidden_drop = Dropout(0.5)(l_hidden)
    l_out = Dense(output_dims, activation='softmax')(l_hidden_drop)  #dims output
    model_1 = Model(inputs=[text_seq_input], outputs=l_out)
    
    return(model_1)

In [31]:
''' 
categorical cross-entropy : activation function en entrée c' est un softmax, i.e les scores se somment à 1 
alors que binary cross-entropy, c'est du sigmoid.
'''
model_1.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['categorical_accuracy'])
model_1.summary()

NameError: name 'model_1' is not defined

# training

In [32]:
''' This callback logs events for TensorBoard, including:
log_dir : directory to save log file to be parsed by TensorBoard
histogram_freq : frequency (in epochs) at which to compute activation and weight histograms for the layers of the model. If set to 0, histograms won't be computed
write_graph : whether to visualize the graph in TensorBoard. The log file can become quite large when write_graph is set to True.
write_images : whether to write model weights to visualize as image in TensorBoard.
'''

tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True, profile_batch = 100000000)

In [34]:
hp_callback = hp.KerasCallback( hparams, log_dir = './logs/hparam_tuning' )

NameError: name 'hparams' is not defined

In [44]:
checkpointer = ModelCheckpoint(filepath="model_1_weights.hdf5", 
                                    verbose=1,
                                    monitor="val_categorical_accuracy",
                                    save_best_only=True,
                                    mode="max")

In [79]:
''' Monitor : Quantity to be monitored
    min_delta : Minimum change in the monitored quantity to qualify as an improvement
    patience : Number of epochs with no improvement after which training will be stopped
    mode :One of {"auto", "min", "max"}
    '''
earlystopping = EarlyStopping(monitor='val_categorical_accuracy', 
                              min_delta=0, patience=5, 
                              verbose=0, mode='auto')

In [42]:
def get_dataset(x_train, y_train, x_test, y_test):
    batch_size = 96
    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),
    )

In [43]:
train_dataset, dev_dataset = get_dataset(x_train, y_train, x_test, y_test)


In [44]:
def run(run_dir, hparams):
    model_1 = create_model(hparams, output_dims = 435, Embedding_trainable = False, MAX_TEXT_LEN = 1000)
    model_1.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate= 0.015), metrics=['categorical_accuracy'])
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs',
                                                 histogram_freq=0, 
                                                 write_graph=True, 
                                                 profile_batch = 100000000)
    hp_callback = hp.KerasCallback(run_dir,  hparams )
    tbde_callback = tf.keras.callbacks.TensorBoard(run_dir)
    model_1.fit(train_dataset,
                validation_data = dev_dataset, 
                epochs=10, batch_size=96,shuffle=True,
                callbacks=[tb_callback, hp_callback, tbde_callback])

In [29]:
from tensorflow.tensorboard.tensorboard import main
%load_ext tensorboard 

ModuleNotFoundError: No module named 'tensorflow.tensorboard'

In [45]:
!rm -rf ./logs/

In [None]:
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
    
    session_num = 0
    for filters in HP_FILTERS.domain.values:
        for pooling_units in HP_POOLING_UNITS.domain.values:
            for hidden_dims in HP_HIDDEN_DIMS.domain.values:
                hparams = {
                    HP_FILTERS : filters,
                    HP_POOLING_UNITS : pooling_units,
                    HP_HIDDEN_DIMS : hidden_dims,
                }
                run_name = "run-%d" % session_num
                print('--- Starting trial: %s' % run_name)
                print({h.name: hparams[h] for h in hparams})
                run('logs/hparam_tuning/' + run_name, hparams)
                session_num += 1

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2')
Number of devices: 3
--- Starting trial: run-0
{'number_of_filter_channel': 64, 'pooling_units': 100, 'hidden_dims': 32}
Epoch 1/10
INFO:tensorflow:batch_all_reduce: 16 all-reduces with algorithm = hierarchical_copy, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorf

In [50]:

try:
    model_1.load_weights("model_11_weights.hdf5")
except IOError as ioe:
    print("no checkpoints available !")
    
model_1.fit(x_train, y_train, 
          validation_data=(x_test, y_test),
          epochs=10, batch_size=128,shuffle=True,
          callbacks=[checkpointer, tb_callback])

no checkpoints available !
Epoch 1/10
Epoch 00001: val_categorical_accuracy improved from 0.06942 to 0.08681, saving model to model_1_weights.hdf5
Epoch 2/10
Epoch 00002: val_categorical_accuracy improved from 0.08681 to 0.13789, saving model to model_1_weights.hdf5
Epoch 3/10
Epoch 00003: val_categorical_accuracy improved from 0.13789 to 0.17176, saving model to model_1_weights.hdf5
Epoch 4/10
Epoch 00004: val_categorical_accuracy improved from 0.17176 to 0.20477, saving model to model_1_weights.hdf5
Epoch 5/10
Epoch 00005: val_categorical_accuracy improved from 0.20477 to 0.22909, saving model to model_1_weights.hdf5
Epoch 6/10
Epoch 00006: val_categorical_accuracy improved from 0.22909 to 0.24758, saving model to model_1_weights.hdf5
Epoch 7/10
Epoch 00007: val_categorical_accuracy improved from 0.24758 to 0.25365, saving model to model_1_weights.hdf5
Epoch 8/10
Epoch 00008: val_categorical_accuracy improved from 0.25365 to 0.26015, saving model to model_1_weights.hdf5
Epoch 9/10
Ep

<tensorflow.python.keras.callbacks.History at 0x7fd2b05298d0>

In [39]:
%tensorboard --logdir logs/fit

UsageError: Line magic function `%tensorboard` not found.


In [30]:
from keras.models import load_model
model = load_model('model_1_weights.hdf5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [45]:
import sklearn
from sklearn.metrics import confusion_matrix
predictions = model.predict(x_test)
y_pred = (predictions > 0.5)
matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(matrix)

[[2985    9    0 ...    0    0    0]
 [1222  747    0 ...    0    0    0]
 [ 224    0  153 ...    0    0    0]
 ...
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]]


In [39]:
from confusionmatrix import ConfusionMatrix

In [34]:
cib_list = convertLabelsDict()
cib_list = list(cib_list.keys())

In [40]:
cm = ConfusionMatrix(x = x_test, y = y_test, model = model_1)
cm.plot_confusion_matrix(cib_list)

NameError: name 'x_test' is not defined