In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress INFO and WARNING messages
import argparse
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
from sentence_transformers import SentenceTransformer

## train test split
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parameters
n_missing = 3
MODEL_PATH = "rnn_model.h5"
TOKENIZER_PATH = "tokenizer.pkl"
LABEL_ENCODER_PATH = "label_encoder.pkl"
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 128

In [9]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.csv', 'validation': 'dev.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/ibm-research/argument_quality_ranking_30k/" + splits["train"])

In [10]:
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
3,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517
4,A ban on naturopathy creates a cohesive front ...,We should ban naturopathy,train,0.753805,0.337724,1,1.0


In [13]:
argument = list(df.argument)
topic = list(df.topic)
note = list(df.WA)

In [14]:
data = []
for i in range(len(argument)):
    data.append((argument[i] + topic[i]))

In [15]:
from transformers import AutoTokenizer, AutoModel
import torch
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM

# Charger le tokenizer et le modèle BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")

In [16]:
from tqdm import tqdm

In [17]:
len(data)

20974

In [22]:
print(list([1,2,3]))

[1, 2, 3]


In [27]:
# Exemple de données
texts = data[:2000]

batch_size = 249  # Ajustez cette valeur en fonction de votre RAM

all_bert_embeddings = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i : i + batch_size]
    encoded_inputs = tokenizer(
        batch_texts,
        padding="max_length",  # Forcer le padding à une longueur fixe
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors="pt"
    )
    
    with torch.no_grad():
        outputs = bert_model(**encoded_inputs)
        batch_embeddings = outputs.last_hidden_state  # Shape: (batch_size, max_length, hidden_size)
    
    all_bert_embeddings.append(batch_embeddings.numpy())

print(all_bert_embeddings[0].shape)
# Concaténer tous les embeddings en un seul tableau numpy
bert_embeddings = np.concatenate(all_bert_embeddings, axis=0)

# Entraîner le modèle avec des données d'entraînement adaptées
# model.fit(bert_embeddings_train, y_train, validation_data=(bert_embeddings_test, y_test), epochs=10, batch_size=32)

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [02:16<00:00, 15.21s/it]

(249, 100, 768)





In [28]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, np.array(list(note)[:2000]), test_size=0.2, random_state=42)

In [29]:
# Construire le modèle LSTM avec les embeddings de BERT
model = Sequential()

# Ajouter des couches bidirectionnelles LSTM
model.add(Bidirectional(LSTM(128, return_sequences=True, input_shape=(MAX_SEQUENCE_LENGTH, bert_embeddings.shape[-1]))))
model.add(Bidirectional(LSTM(64)))

# Ajouter des couches denses
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compiler le modèle
model.compile(loss='mean_absolute_error', optimizer='adam')

In [30]:
# Train and evaluate model with train and test
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7d1c86050700>

In [32]:
def predict(data):
    data = [data]
    # Tokenisation et encodage des textes
    encoded_inputs = tokenizer(data, padding=True, truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt")
    # Obtenir les embeddings de BERT
    with torch.no_grad():
        outputs = bert_model(**encoded_inputs)
        # Utiliser les embeddings de la dernière couche cachée
        bert_embeddings = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

    # Convertir les embeddings en numpy pour Keras
    bert_embeddings = bert_embeddings.numpy()
    print(bert_embeddings.shape)
    prediction = model.predict(bert_embeddings)
    return prediction

In [None]:
arg = "a zero tolerance policy means that parents would give complete control of discipline to the school without any regard for family morals and teachings."
top = "We should adopt a zero-tolerance policy in schools"
predict(arg + top)

(1, 22, 768)


array([[0.7031841]], dtype=float32)