In [5]:
import sys
import os
import pickle
import numpy as np
import pandas as pd
import random
import h5py
import csv
import json
import time
import string
import datetime as dt
import matplotlib.pyplot as plt
import re
random_state_number = 967898

In [16]:
import tensorflow as tf
print(tf.__version__)
import keras

2.1.0


Using TensorFlow backend.


In [17]:
tf.config.list_physical_devices('GPU')

[]

# Data

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


### Load Data from Pickle

In [26]:
with open ('./Data_2labels_CIB4/df_test.pickle','rb') as fichier:
    df_test = pickle.load(fichier)

with open ('./Data_2labels_CIB4/df_train.pickle','rb') as fichier:
    df_train = pickle.load(fichier)


In [27]:
print("Training data frame : ..." )
print(df_train.head())
print('\n')
print("Test data frame : ..." )
print(df_test.head())

Training data frame : ...
  Patent_number CIB_1 CIB_2                                               text
0     FR1706228  G06F  G06F   CREATION ET GESTION DE TRANSFORMATIONS DE VER...
1     FR1706229  G01N  None   dispositif d'etalonnage pour ethylometre L'in...
2      FR440980  H05B  None   CIRCUIT POUR EXCITER UNE INSTALLATION DE LAMP...
3      FR440981  G06F  H05K   CONNEXIONS POUR DES PLAQUETTES A CIRCUITS IMP...
4      FR440982  B05D  B05C   PROCEDE D'APPLICATION D'UNE COUCHE PROTECTRIC...


Test data frame : ...
  Patent_number CIB_1 CIB_2                                               text
0     FR1706257  G01T  A61B   TOMODENSITOMÈTRE À RÉSOLUTION VARIABLE ET À C...
1     FR2011487  C22C  C23C   MATÉRIAU COMPOSITE Matériau composite (1) com...
2     FR3804361  H04L  H04H   DISTRIBUTION EN INTÉRIEUR D'UN SIGNAL À LARGE...
3     FR3804376  G05D  G06Q   SYSTÈME DE DRONE DISTRIBUÉ ET DRONE Pour perm...
4      FR508245  H04N  H04N   PROCEDE D'IMAGERIE AMELIORE DESTINE A DES MAT...


In [35]:
print("There are {} patents for testing".format(len(df_test)))
print("Number of class represented in test Data for CIB_1 only :{}".format(df_test['CIB_1'].nunique()) + '\n')

print("There are {} patents for training ".format(len(df_train)))
print("Number of class represented in training Data for CIB_1 only :{}".format(df_train['CIB_1'].nunique()) )


There are 93103 patents for testing
Number of class represented in test Data for CIB_1 only :616

There are 4562049 patents for training 
Number of class represented in training Data for CIB_1 only :633


### Format Data

In [47]:
def seqAndPad(text,max_length, tokenizer):
    
    sequences = tokenizer.texts_to_sequences(text)
    del text
    data = pad_sequences(sequences, maxlen=max_length,
                         padding='post', truncating='post')
    del sequences
    return(data)

def truncate(text, max_length):
    for o, doc in enumerate(text):
        text[o]= " ".join(text[o].split()[:max_length])
    return(text)

def convertLabelsDict():
    CIBtoLabel={}
    id=0
    for entry in df_train['CIB_1']:
        if entry not in CIBtoLabel:
            CIBtoLabel[entry]=id
            id+=1
    for entry in df_test['CIB_1']:
        if entry not in CIBtoLabel:
            CIBtoLabel[entry]=id
            id+=1
    return(CIBtoLabel)

def formatTextData (df_train, df_test,max_length, max_num_words):
    '''
    max_length : tronquer les abstracts
    max_num_words : dans le tokenizer, si un mot est trop fréquent, il est supprimé
    '''
    text_train = df_train['text'].tolist()
    text_test = df_test['text'].tolist()
    
    print("Truncating text data to max_length")
    
    text_train = truncate(text_train[:200000], max_length)
    text_test = truncate(text_test, max_length)
    
    print("Tokenizing data...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(text_train)
    print("Tokenizing done")
    print("Sequencing and padding...")
    text_train = seqAndPad(text_train, max_length, tokenizer)
    text_test = seqAndPad(text_test, max_length, tokenizer)
    print("Sequencing and padding done")
    
    CIBtoLabel = convertLabelsDict()
    
    y_train = df_train['CIB_1'].tolist()[:200000]
    y_test = df_test['CIB_1'].tolist()
    
    for k in range(len(y_train)):
        y_train[k]= CIBtoLabel[y_train[k]]
    
    for k in range(len(y_test)):
        y_test[k]= CIBtoLabel[y_test[k]]
    
    y_train = tf.keras.utils.to_categorical(y_train, 633)
    y_test = tf.keras.utils.to_categorical(y_test, 633)
    
    return(text_train, y_train, text_test, y_test, tokenizer)
    



In [None]:
x_train, y_train, x_test, y_test, tokenizer = formatTextData(df_train, df_test, max_length = 500, max_num_words = 10000)

Truncating text data to max_length
Tokenizing data...


In [30]:
def load_data_and_labels(embedding_dim, max_length, max_num_words):
    texts_to_tokenize = readData('train_texts.pkl', max_length)
    print("Tokenizing data ..")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(texts_to_tokenize)
    print("Tokenization done")
    print("Loading data and sequencing...")
    x_train, y_train = editData('train_texts.pkl', 'train_labels.pkl',
                                max_length, tokenizer)
    print("Training data loaded") ###Maintenant on fait le sequencing et le padding, qui sont dans la fonction editdata
    x_val, y_val = editData('test_texts.pkl', 'test_labels.pkl', 
                            max_length, tokenizer)
    print("Test Data loaded")
    word_index = tokenizer.word_index
    return x_train, y_train, x_val, y_val, tokenizer

In [31]:
WORD_EMB_SIZE = 200
MAX_TEXT_LEN = 500

In [33]:
print(y_train.shape)

90000
(90000, 614)


In [34]:
print(y_val.shape)

9989
(9989, 614)


# Embedding Layer

## Prepairing the Embedding layer

We compute an index mapping words to known pre-trained embeddings, by parsing the data dump of pre-trained embeddings

In [37]:
vocab_size=len(tokenizer.word_index) + 1
print('Vocab_size is {}'.format(vocab_size))

Vocab_size is 92098


In [38]:
embeddings_index = {}
with open('./Embeddings/glove.6B.200d.txt', 'r', encoding= "utf8") as f:
    for line in f:
        values = line.split()
        word=values[0]
        emb = np.asarray( values[1:], dtype='float32')
        embeddings_index[word] = emb
print('Found {} word vectors'.format(len(embeddings_index)))

Found 400000 word vectors


In [39]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, WORD_EMB_SIZE))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        #words not found in embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector
print('Embedding_matrix loaded')
print('Shape {}'.format(embedding_matrix.shape))

Embedding_matrix loaded
Shape (92098, 200)


# Model 

In [49]:

import keras
import tensorflow as tf

from keras import backend as K

from keras.engine import Layer, InputSpec, InputLayer

from keras.models import Model, Sequential

from keras.layers import Dropout, Embedding, concatenate
from keras.layers import Conv1D, MaxPool1D, Conv2D, MaxPool2D, ZeroPadding1D
from keras.layers import Dense, Input, Flatten, BatchNormalization
from keras.layers import Concatenate, Dot, Concatenate, Multiply, RepeatVector
from keras.layers import Bidirectional, TimeDistributed
from keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute

from keras.layers.core import Reshape, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
from keras.constraints import maxnorm
from keras.regularizers import l2



### K-Max Pooling Layer

<a href=https://github.com/bicepjai/Deep-Survey-Text-Classification/blob/master/deep_models/paper_02_cnn_sent_model/utils.py> Lien ici </a>

et <a href=https://www.reddit.com/r/learnmachinelearning/comments/9hes2q/code_question_1d_convolution_layer_in_keras_with/> la </a>

In [43]:
from utils import KMaxPooling

### Folding layer


In [44]:
from utils import Folding

### Model definition

CNN with Dynamic k-Max Pooling with sentences

In [79]:
""" Parameters """
FILTERS = 128
pooling_units = 100
output_dims = 633

text_seq_input = Input(shape=(MAX_TEXT_LEN,), dtype='int32')
text_embedding = Embedding(vocab_size, WORD_EMB_SIZE, input_length=MAX_TEXT_LEN,
                            weights=[embedding_matrix], trainable=True)(text_seq_input)

filter_sizes = [3,4,5]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=FILTERS, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    POOL_SIZE = l_conv.get_shape()[-2] // pooling_units
    l_pool = MaxPool1D(pool_size=POOL_SIZE, padding='valid')(l_conv)   #Dynamic pooling
    convs.append(l_pool)

l_merge = Concatenate(axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
# since the text is too long we are maxpooling over 100
# and not GlobalMaxPool1D
l_pool1 = MaxPool1D(100)(l_cov1)
l_flat = Flatten()(l_pool1)
l_hidden = Dense(512, activation='relu')(l_flat)
l_hidden_drop = Dropout(0.5)(l_hidden)
l_out = Dense(output_dims, activation='sigmoid')(l_hidden_drop)  #dims output
model_1 = Model(inputs=[text_seq_input], outputs=l_out)

In [80]:
''' 
categorical cross-entropy : activation function en entrée c' est un softmax, i.e les scores se somment à 1 
alors que binary cross-entropy, c'est du softmax.
'''
model_1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['categorical_accuracy'])
model_1.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1000, 200)    18419600    input_8[0][0]                    
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 1000, 128)    76928       embedding_8[0][0]                
__________________________________________________________________________________________________
conv1d_26 (Conv1D)              (None, 1000, 128)    102528      embedding_8[0][0]                
____________________________________________________________________________________________

# training

In [None]:
''' This callback logs events for TensorBoard, including:
log_dir : directory to save log file to be parsed by TensorBoard
histogram_freq : frequency (in epochs) at which to compute activation and weight histograms for the layers of the model. If set to 0, histograms won't be computed
write_graph : whether to visualize the graph in TensorBoard. The log file can become quite large when write_graph is set to True.
write_images : whether to write model weights to visualize as image in TensorBoard.
'''

tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./tb_graphs', histogram_freq=0, write_graph=True, write_images=True)

In [50]:
checkpointer = ModelCheckpoint(filepath="model_1_weights.hdf5", 
                                    verbose=1,
                                    monitor="val_categorical_accuracy",
                                    save_best_only=True,
                                    mode="max")

In [None]:
''' Monitor : Quantity to be monitored
    min_delta : Minimum change in the monitored quantity to qualify as an improvement
    patience : Number of epochs with no improvement after which training will be stopped
    mode :One of {"auto", "min", "max"}
    '''
earlystopping = EarlyStopping(monitor='val_categorical_accuracy', 
                              min_delta=0, patience=5, 
                              verbose=0, mode='auto')

In [51]:

try:
    model_1.load_weights("model_11_weights.hdf5")
except IOError as ioe:
    print("no checkpoints available !")
    
model_1.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=10, batch_size=32,shuffle=True,
          callbacks=[checkpointer])

no checkpoints available !
Train on 90000 samples, validate on 9989 samples
Epoch 1/10
  800/90000 [..............................] - ETA: 39:46 - loss: 6.0502 - categorical_accuracy: 0.0613

KeyboardInterrupt: 