In [1]:
import sys, os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import  matplotlib.pyplot as plt

from tokenizers import BertWordPieceTokenizer

In [2]:
def build_model(transformer, max_len=512, learning_rate=6e-6):
    """Build and compile a binary classifier on top of a transformer encoder.

    Args:
        transformer: Callable Keras-compatible encoder. It is called on an
            int32 tensor of token ids and the first element of its output is
            used (presumably the per-token hidden states -- confirm against
            the encoder being passed in).
        max_len: Fixed number of token ids per input example.
        learning_rate: Step size handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` ``Model`` mapping ``(batch, max_len)`` token
        ids to a single sigmoid output per example.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Only the vector at sequence position 0 is fed to the classification head.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC'],
    )

    return model

In [3]:
# Number of token ids per example; shared by the model input and the tokenizer.
MAX_LEN = 192

# Pretrained multilingual DistilBERT encoder used as the classifier backbone.
transformer_layer = transformers.TFDistilBertModel.from_pretrained(
    'distilbert-base-multilingual-cased'
)

# Rebuild the classifier graph, then restore the saved weights into it.
model = build_model(transformer_layer, max_len=MAX_LEN)
model.load_weights('model/weights')

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5c842cb580>

In [4]:
# Ad-hoc sample inputs; not referenced by the other cells shown in this notebook.
list_of_texts = [
    'j aime bien',
    'khedma k zebi',
    'service tahfoun',
]

In [5]:
import pandas as pd

# Location of the scraped test set. The default is the original
# machine-specific path; set the SCRAP_TEST_CSV environment variable to point
# the notebook at the file on another machine without editing this cell.
TEST_CSV_PATH = os.environ.get(
    'SCRAP_TEST_CSV',
    '/home/aziz/vneuron/scrapping/googlemaps-scraper/data/scrap_test.csv',
)
test = pd.read_csv(TEST_CSV_PATH)

In [6]:
# Number of examples per batch fed to the model at prediction time.
BATCH_SIZE = 32


def create_test(x_test):
    """Wrap `x_test` in a tf.data.Dataset that yields batches of BATCH_SIZE."""
    sliced = tf.data.Dataset.from_tensor_slices(x_test)
    return sliced.batch(BATCH_SIZE)

In [7]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode `texts` into a fixed-width matrix of token ids.

    Args:
        texts: Sliceable, sized collection of strings (NumPy array, pandas
            Series, list or tuple).
        tokenizer: Object exposing `enable_truncation`, `enable_padding` and
            `encode_batch`; each encoding returned by `encode_batch` must have
            an `ids` attribute.
        chunk_size: Number of texts passed to `encode_batch` per call.
        maxlen: Length every sequence is truncated / padded to.

    Returns:
        A NumPy array with one row of token ids per input text. Empty input
        yields an empty integer array of shape ``(0, maxlen)``.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Array-like slices convert to built-in Python strings via .tolist();
        # plain lists / tuples have no such method and are copied directly.
        text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    if not all_ids:
        # Keep the result two-dimensional so downstream batching still sees
        # (n_examples, maxlen) even when there is nothing to encode.
        return np.empty((0, maxlen), dtype=int)
    return np.array(all_ids)

In [8]:
import numpy as np

# Fetch the pretrained DistilBERT tokenizer and save its files into the
# current directory; the fast tokenizer below is built from 'vocab.txt' there.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(
    'distilbert-base-multilingual-cased'
)
tokenizer.save_pretrained('.')

# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

# Truncate / pad every encoding to exactly MAX_LEN token ids.
fast_tokenizer.enable_truncation(max_length=MAX_LEN)
fast_tokenizer.enable_padding(length=MAX_LEN)

In [9]:
# Token-id matrix for every caption (cast to str before encoding).
test_texts = fast_encode(
    test['caption'].values.astype(str),
    fast_tokenizer,
    maxlen=MAX_LEN,
)

# Ground-truth labels, in the same row order as the captions.
test_ys = test['intent'].values

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [10]:
# Raw model scores for the encoded test set, kept under their own name so the
# thresholding step below does not destroy them.
probabilities = model.predict(create_test(test_texts))

# Vectorized 0.5 cut-off: scores above 0.5 become 1, everything else 0.
# Shape and dtype of `results` match what the element-wise loop produced.
results = (probabilities > 0.5).astype(probabilities.dtype)

In [11]:
# Confusion matrix of true labels against thresholded predictions.
confusion_matrix(y_true=test_ys, y_pred=results)

array([[ 6,  4],
       [28, 63]])

In [12]:
# Per-class precision / recall / F1 for the thresholded predictions.
print(classification_report(y_true=test_ys, y_pred=results))

              precision    recall  f1-score   support

           0       0.18      0.60      0.27        10
           1       0.94      0.69      0.80        91

    accuracy                           0.68       101
   macro avg       0.56      0.65      0.54       101
weighted avg       0.86      0.68      0.75       101



In [13]:
# Print the caption of every test example whose prediction differs from its label.
# Flatten the predictions so an (N, 1) column compares element-wise with the
# (N,) label vector instead of broadcasting.
predicted_labels = np.ravel(results)
# Select captions by position from the underlying array, so the result does
# not depend on the DataFrame's index labels.
misclassified_captions = test['caption'].values[test_ys != predicted_labels]
for caption in misclassified_captions:
    print(caption)

 شركة رائعة! ترحب استثنائية! انتم الافضل   Superbe société ! accueille extraordinaire ! vous êtes les meilleurs
 شركة رائعة! ترحب استثنائية! انتم الافضل   Superbe société ! accueille extraordinaire ! vous êtes les meilleurs
 شركة رائعة! ترحب استثنائية! انتم الافضل   Superbe société ! accueille extraordinaire ! vous êtes les meilleurs
 Supebe شركة غير عادية المنزل! انتم الافضل   Supebe société accueil extraordinaire!!! vous êtes les meilleurs
 ترحاب للغاية ، اختيار جيد من الأطباق. بلح البحر جيدة جدا!   Très bon accueil, un bon choix de plats. Très bonne moules!
ممتاااااااااز ونظيف ولذيذ وسعر رخيص جدا جدا جدا
 لزيارة ولكن الخدمة التي سيتم ترقيتها   A visiter mais service a remettre à niveau
 مرحاض قذر   Toilette sale
 جيد   Good
 المكان المناسب إذا كنت تحب المأكولات البحرية ، ولا سيما بلح البحر! انهم يعرفون فقط كيفية صنعها! :) الداخلية / الديكور هي أيضا جميلة وفريدة من نوعها. يمكنك إجراء حجز أو مجرد الذهاب إلى المكان. أنني حجزت طاولة مع زملائي وأحببناها جميعًا. الموظفين أيضا الودية والرع