In [1]:
import tensorflow as tf
import numpy as np
import numpy
import json
import re
import spacy
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import pickle

In [36]:
def create_dataset(name: str , input_file: Path, output_dir: Path):
    """ extract json dataset to a specified path in .txt"""
    
    # create output dir
    dataset_dir = output_dir / name
    dataset_dir.mkdir(parents=True, exist_ok=True)
    
    # Loading raw json file
    with open(input_file, 'r', encoding='utf8') as file:   
        raw_dataset = json.load(file)
    
    # write files
    for idx, review in enumerate(tqdm(raw_dataset)):
        label_dir = dataset_dir / str(review['note'])
        label_dir.mkdir(parents=True, exist_ok=True)
        filename = label_dir / f"{review['review_id'].replace('review_', '')}.txt"
        filename.touch()
        with open(filename, 'w') as f:
            f.write(review['commentaire'])

In [4]:
output_dir = Path("../data/allocine")
train_json = Path("../data/json/train.json")
dev_json = Path("../data/json/dev.json")

In [37]:
create_dataset('dev', dev_json, output_dir)
create_dataset('train', train_json, output_dir)

100%|██████████| 100400/100400 [00:07<00:00, 13851.09it/s]
100%|██████████| 665962/665962 [00:50<00:00, 13315.78it/s]


In [6]:
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../data/allocine_filtered/train",
    label_mode='categorical',
    batch_size=batch_size
)
raw_dev_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "../data/allocine_filtered/eval",
    label_mode='categorical',
    batch_size=batch_size
)

Found 665947 files belonging to 10 classes.
Found 100399 files belonging to 10 classes.


In [7]:
print(
    "Number of batches in raw_train_ds: %d"
    % tf.data.experimental.cardinality(raw_train_ds)
)
print(
    "Number of batches in raw_val_ds: %d" 
    % tf.data.experimental.cardinality(raw_dev_ds)
)

Number of batches in raw_train_ds: 20811
Number of batches in raw_val_ds: 3138


In [198]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])

b'J\'ai pay\xc3\xa9 1 euro 50 ma place et c\'est tout aussi bien. Je n\'attendais vraiment rien de ce film, et je n\'ai pas \xc3\xa9t\xc3\xa9 d\xc3\xa9\xc3\xa7u....ce genre d\'humour qui, pour moi, ne peut faire rire que des gamins de 14/15 ans, une qualit\xc3\xa9 de jeu de la part des acteurs mauvaise, ou m\xc3\xaame proche du n\xc3\xa9ant par moments (trop fr\xc3\xa9quents malheureusement), des moments r\xc3\xa9ellement g\xc3\xaanants (toute derni\xc3\xa8re sc\xc3\xa8ne du film), des sc\xc3\xa8nes qui essaient d\'\xc3\xa9mouvoir mais qui se retrouve \xc3\xa0 imiter les plus gros nanars de l\'Histoire avec un grand H (prise de conscience path\xc3\xa9tique, discours nanardesques, etc...). Ce film se voit agr\xc3\xa9ment\xc3\xa9 de sc\xc3\xa8nes pr\xc3\xa9visibles (       spoiler:        On sait d\xc3\xa8s le d\xc3\xa9but du film que la vid\xc3\xa9o du mariage va \xc3\xaatre supprim\xc3\xa9e du t\xc3\xa9l\xc3\xa9phone     ). Le malaise se retrouve aussi dans le g\xc3\xa9n\xc3\xa9rique a

In [199]:
def custom_standardization(input_data):
    tokens = tf.strings.lower(input_data)
    tokens = tf.strings.regex_replace(tokens, '<br />', ' ')
    tokens = tf.strings.regex_replace(tokens, 'http\S+', '')
    tokens = tf.strings.regex_replace(tokens, '[^\d|(a-z)|!|#|@|è|é|à|ù|ô|ü|ë|ä|û|î|ê|â|ç\s]', ' ')
    tokens = tf.strings.regex_replace(tokens, '\s+', ' ')
    #tokens = tf.strings.regex_replace(tokens, '(.)\1+', ' ')
    #tokens = tf.strings.regex_replace(tokens, '[%s]' % re.escape(string.punctuation), '')
    return tokens

In [9]:
# Model constants.
max_features = 20000
embedding_dim = 300
sequence_length = 700

In [10]:
vectorize_layer = TextVectorization(
    standardize=None,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [11]:
text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [203]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Vectorize the data.
train_ds = raw_train_ds.map(vectorize_text)
dev_ds = raw_dev_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
dev_ds = dev_ds.cache().prefetch(buffer_size=10)

In [208]:
from tensorflow.keras import layers

# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(10, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [210]:
epochs = 50

# Fit the model using the train and test datasets.
model.fit(train_ds, validation_data=dev_ds, epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50

KeyboardInterrupt: 

In [211]:
# save model
model.save('../data/models/model_final')

# load model
# model = keras.models.load_model('path/to/location')

INFO:tensorflow:Assets written to: ../data/models/model_final/assets


In [230]:
# Pickle the config and weights
pickle.dump({'config': vectorize_layer.get_config(),
             'weights': vectorize_layer.get_weights()}
            , open("../data/models/model_final/tv_layer.pkl", "wb"))

In [231]:
from_disk = pickle.load(open("../data/models/model_final/tv_layer.pkl", "rb"))
new_v = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
#new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])

In [233]:
text = 'é'
text = tf.expand_dims(text, -1)
#b = tf.strings.unicode_decode(x,'UTF-8')
print(vectorize_layer(text))
print(new_v(text))

tf.Tensor(
[[15586     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0  

We distribute pre-trained word vectors for 157 languages, trained on Common Crawl and Wikipedia using fastText. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives. We also distribute three new word analogy datasets, for French, Hindi and Polish.

In [3]:
path_to_glove_file = Path('../data/embedding/cc.fr.300.vec')

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 2000000 word vectors.


In [13]:
voc = vectorize_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18485 words (1515 misses)


In [14]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

NameError: name 'Embedding' is not defined