# CamemBert for patent classification on IPCCAT-2018 data

First install the followings : 
- conda install -c conda-forge transformers
- conda install ipywidgets
- conda install pytorch

## Imports

In [4]:
import pandas as pd 
import pickle
import tensorflow as tf
import torch
import tensorflow.keras.backend as K
import transformers
from transformers import TFCamembertModel
from transformers import CamembertTokenizer
from transformers import TFCamembertForSequenceClassification
from ipywidgets import IntProgress
import numpy as np
"""Probably should try with TFCamemForSequenceClassification"""

'Probably should try with TFCamemForSequenceClassification'

In [5]:
# Defining some key variables that will be used later on in the training

MAX_LEN = 2000
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

## Load Data

In [3]:
with open ('../CNN-kim/Data_2labels_CIB4/df_test_clean.pickle','rb') as fichier:
    df_test = pickle.load(fichier)

with open ('../CNN-kim/Data_2labels_CIB4/df_train_clean.pickle','rb') as fichier:
    df = pickle.load(fichier)
    


In [4]:
train_size = 0.8
df_train = df.sample(frac=train_size, random_state=200)
df_dev = df.drop(df_train.index).reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(df_train.shape))
print("TEST Dataset: {}".format(df_dev.shape))

FULL Dataset: (3751492, 5)
TRAIN Dataset: (3001194, 5)
TEST Dataset: (750298, 5)


In [49]:
print(df_train.head())

  Patent_number CIB_1 CIB_2  \
0     FR3424527  G01R  G01R   
1     FR2503829  H04N  H04N   
2     FR3672822  B01D  B29C   
3     FR1905749  G06F  None   
4     FR3835148  G05B  None   

                                                text  cat  
0  circuit détection signal électromotrice hall, ...    2  
1  dispositif procédé traitement d'image l'invent...  426  
2  sac filtre poussière aspirateur, cordon soudé ...  169  
3  techniques gestion efficace puissance systèmes...  433  
4  dispositif diagnostic anormal, système diagnos...   91  


## Convert data to input type of Camembert


In [50]:
def convert_to_camembert_tfdataset(df):
    labels = list(df['cat'])
    
    input_ids = []
    attention_masks = []
    
    for text in df['text']:
        inputs = tokenizer.encode_plus(text,
                                     add_special_tokens=True,
                                     max_length=MAX_LEN,
                                     pad_to_max_length=True,
                                     return_token_type_ids=True,
                                     return_tensors="np",
                                     truncation=True,
                                     )
        
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])
     
    input_ids = tf.convert_to_tensor(input_ids)
        
    attention_masks = tf.convert_to_tensor(attention_masks)
    labels = tf.convert_to_tensor(labels[:])    
        
    return (input_ids,attention_masks,labels)
                                             
        

In [51]:
train_input_ids, train_input_masks, train_labels = convert_to_camembert_tfdataset(df_train[:100000])
dev_input_ids, dev_input_masks, dev_labels = convert_to_camembert_tfdataset(df_dev[:100000])


In [52]:
def reshape_input(input_ids,input_masks):
    input_ids = tf.reshape( train_input_ids,(-1,MAX_LEN) )
    input_masks = tf.reshape( input_masks, (-1,MAX_LEN) )
    
    return input_ids, input_masks

def create_dataset(input_ids, input_masks, labels, batch_size):
    input_ids, input_masks = reshape_input(input_ids,input_masks)
    dataset= tf.data.Dataset.from_tensors(( (input_ids, input_masks), labels))
    dataset.batch(batch_size)
    
    return(dataset)


In [53]:
''' Batch size for training set initialization here'''
training_dataset = create_dataset(train_input_ids,
                                  train_input_masks,
                                  train_labels,
                                  batch_size=5) 

dev_dataset = create_dataset(dev_input_ids,
                            dev_input_masks,
                            dev_labels,
                            batch_size=8)

## Build model

### Model with TFCamembertModel

In [6]:
def build_model():
    ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='input_id')
    mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='input_mask')
    camembert_model = TFCamembertModel.from_pretrained('camembert-base', from_pt=True, trainable= False)
    
    camembert_layer = camembert_model([ids,mask])[0]
    flattened_layer = tf.keras.layers.Flatten()(camembert_layer)
    dropout_layer = tf.keras.layers.Dropout(0.2)(flattened_layer)
    
    dense_output_layer = tf.keras.layers.Dense(units=11,
                                              activation= 'softmax')(dropout_layer)
    model = tf.keras.models.Model(inputs=[ids,mask], outputs=dense_output_layer)
    
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss = loss, optimizer=optimizer, metrics=[metric])
    
    return(model)
    
    
    
    

In [None]:
model = build_model()
model.summary()

### Model with TFCamembertForSequenceClassification 

In [18]:
def build_model2():
    ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='input_id')
    mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=np.int32, name='input_mask')
    camembert_model = TFCamembertForSequenceClassification.from_pretrained('camembert-base',
                                                                           from_pt=True,
                                                                           trainable= False,
                                                                           num_labels= 435)
    
    camembert_layer = camembert_model([ids,mask])[0]
    flattened_layer = tf.keras.layers.Flatten()(camembert_layer)
    dropout_layer = tf.keras.layers.Dropout(0.2)(flattened_layer)
    
    dense_output_layer = tf.keras.layers.Dense(units=435,
                                              activation= 'softmax')(dropout_layer)
    model = tf.keras.models.Model(inputs=[ids,mask], outputs=dense_output_layer)
    
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss = loss, optimizer=optimizer, metrics=[metric])
    
    return(model)

In [19]:
model = build_model2()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertForSequenceClassification: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing TFCamembertForSequenceClassification from a TF 2.0 model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a TFBertForPretraining model).
- This IS NOT expected if you are initializing TFCamembertForSequenceClassification from a TF 2.0 model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a TFBertForSequenceClassification model).
Some weights or buffers of the PyTorch model TFCamembertForSequenceClassification were not initialized from the TF 2.0 model and are newly initialized: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_he

In [20]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_id (InputLayer)           [(None, 2000)]       0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 2000)]       0                                            
__________________________________________________________________________________________________
tf_camembert_for_sequence_class ((None, 435),)       111547059   input_id[0][0]                   
                                                                 input_mask[0][0]                 
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 435)          0           tf_camembert_for_sequence_c

## Fit function with whole dataset loaded in memory

In case of OOM, run the subsequent section "Fit function with generator".
You can initialize batch size differently for validation and training in the cells above.


In [57]:
model.fit(training_dataset, epochs=2, validation_data= dev_dataset, steps_per_epoch=10)

Train for 10 steps, validate for 1 steps
Epoch 1/2
 1/10 [==>...........................] - ETA: 3:31

ResourceExhaustedError:  OOM when allocating tensor with shape[100000,1536000] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node model_1/dropout_77/dropout/random_uniform/RandomUniform (defined at <ipython-input-57-6a84d4cff052>:1) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_distributed_function_75425]

Function call stack:
distributed_function


## Fit function with generator in case of OOM.

- batch-size must be defined inside the generator
- do not forget to initialize steps_per_epoch attribute in fit function

> steps_per_epoch = len(df_train) // batch_size


In [58]:
def _input_fn():
    def generator():
        for ids, mask, l in zip(train_input_ids, train_input_masks, train_labels):
            yield {"input_id" : ids, "input_mask" : mask }, l
    
    dataset = tf.data.Dataset.from_generator(generator,
                                    output_types= ({"input_id" :tf.int32, "input_mask" : tf.int32}, tf.int32),                            
                                    output_shapes= ({"input_id" : tf.TensorShape([2000]), "input_mask" : tf.TensorShape([2000])}, tf.TensorShape([])),
                                            )
                            
                                                        
                                            
    dataset = dataset.batch(8)
    return(dataset)

In [60]:
train_input_ids, train_input_masks = reshape_input(train_input_ids, train_input_masks)

In [61]:
model.fit(_input_fn(), epochs=2, steps_per_epoch=4)

Train for 4 steps
Epoch 1/2

KeyboardInterrupt: 