In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress INFO and WARNING messages
import argparse
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
from sentence_transformers import SentenceTransformer

## train test split
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parameters
# NOTE(review): n_missing, the three *_PATH constants and EMBEDDING_DIM are not
# referenced by any cell visible in this notebook — confirm before removing.
n_missing = 3
MODEL_PATH = "rnn_model.h5"
TOKENIZER_PATH = "tokenizer.pkl"
LABEL_ENCODER_PATH = "label_encoder.pkl"
# NOTE(review): the preprocessing cell pads every sample to 10 topic tokens
# plus 100 argument tokens (at least 110 in total), so 100 does not describe
# the real sequence length fed to the model.
MAX_SEQUENCE_LENGTH = 100
# NOTE(review): the Word2Vec cell uses vector_size=100, not this value.
EMBEDDING_DIM = 128

In [3]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
# Map each split name to the CSV file that holds it in the dataset repository.
splits = {'train': 'train.csv', 'validation': 'dev.csv', 'test': 'test.csv'}
# Read the training split straight from its hf:// URL into a DataFrame.
df = pd.read_csv("hf://datasets/ibm-research/argument_quality_ranking_30k/" + splits["train"])

In [4]:
# Preview the first rows to check the available columns (argument, topic, WA, ...).
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
3,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517
4,A ban on naturopathy creates a cohesive front ...,We should ban naturopathy,train,0.753805,0.337724,1,1.0


In [5]:
# NOTE(review): this self-assignment has no effect; possibly a leftover from a
# removed filtering / sub-sampling step — confirm and delete the cell if unused.
df = df

In [6]:
# Pull the columns used for modelling out of the frame as plain Python lists.
argument = df["argument"].tolist()
topic = df["topic"].tolist()
# WA is the score the model is later trained to predict.
note = df["WA"].tolist()

In [77]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
PADDINGS_TOPICS = 10
PADDINGS_ARGUMENTS = 100

nltk.download('punkt')

# Build one fixed-length token sequence per sample: topic tokens first, then
# argument tokens. Each part is cut to its maximum length and then right-padded
# with '<PAD>', so every sequence holds exactly
# PADDINGS_TOPICS + PADDINGS_ARGUMENTS tokens. Without the cut, a single
# over-long text would produce sequences of different lengths, which cannot be
# stacked into one rectangular array afterwards.
data = []
for i in range(len(argument)):
    arg_token = word_tokenize(argument[i].lower())[:PADDINGS_ARGUMENTS]
    arg_topic = word_tokenize(topic[i].lower())[:PADDINGS_TOPICS]
    arg_topic += ['<PAD>'] * (PADDINGS_TOPICS - len(arg_topic))
    arg_token += ['<PAD>'] * (PADDINGS_ARGUMENTS - len(arg_token))

    data.append(arg_topic + arg_token)

# Guard the invariant the downstream stacking relies on.
assert all(len(tokens) == PADDINGS_TOPICS + PADDINGS_ARGUMENTS for tokens in data)

# Train Word2Vec on the padded sequences. '<PAD>' is part of the training
# sentences, so it receives a vector of its own like any other token.
model_embedding = Word2Vec(sentences=data, vector_size=100, window=5, min_count=1, workers=4)

print("Exemple de vocabulaire :", list(model_embedding.wv.key_to_index.keys())[:10])

[nltk_data] Downloading package punkt to /home/dimitri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Exemple de vocabulaire : ['<PAD>', 'should', 'we', 'the', 'to', '.', 'and', 'of', 'a', 'be']


In [96]:
import numpy as np

def sentence_to_vec(sentence, model):
    """Turn a token sequence into a matrix of word vectors.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed, in order.
    model : object
        Embedding model exposing ``wv`` (supporting ``token in wv`` and
        ``wv[token]``) and ``vector_size``, e.g. a gensim ``Word2Vec``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(number of tokens, model.vector_size)``.
        Rows for tokens missing from ``model.wv`` are all zeros.
    """
    tokens = list(sentence)
    # Pre-allocating as float32 keeps the dtype stable: mixing float64 zero
    # vectors with float32 embeddings (gensim's default) would upcast the whole
    # matrix. It also gives an empty sentence the 2-D shape (0, vector_size)
    # instead of (0,), so results can always be stacked.
    vectors = np.zeros((len(tokens), model.vector_size), dtype=np.float32)
    for row, word in enumerate(tokens):
        if word in model.wv:
            vectors[row] = model.wv[word]
        # Unknown words keep their zero row.
    return vectors

In [79]:
# Embed every token sequence and stack the per-sample matrices into one array.
# assumes all sequences in `data` have the same length, giving a 3-D
# (samples, tokens, vector_size) array — TODO confirm
X = np.array([sentence_to_vec(sent, model_embedding) for sent in data])

In [80]:
# Hold out 20% of the samples; the targets are the WA scores collected in
# `note`. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, np.array(list(note)), test_size=0.2, random_state=42)

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Input, LSTM

# Bidirectional LSTM regressor over the Word2Vec sequences held in X.
model = Sequential()

# Declare the input on the model itself rather than on the LSTM nested inside
# Bidirectional. The time dimension is left open (None) so any sequence length
# is accepted; the feature dimension is the width of the word vectors in X.
model.add(Input(shape=(None, X.shape[-1])))

# Two stacked bidirectional LSTM layers; the first returns the full sequence
# so that the second one can consume it.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))

# Dense head; the sigmoid keeps the single output between 0 and 1.
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Train with mean absolute error as the loss and Adam as the optimiser.
model.compile(loss='mean_absolute_error', optimizer='adam')

In [83]:
# Train for 2 epochs with batches of 32 samples; the held-out split is passed
# as validation data so its loss is reported alongside the training loss.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7249a5b7ac50>

In [94]:
def prepare_data(topic, arg):
    """Tokenise one (topic, argument) pair into a fixed-length token list.

    Parameters
    ----------
    topic : str
        Topic text.
    arg : str
        Argument text.

    Returns
    -------
    list of str
        Exactly ``PADDINGS_TOPICS + PADDINGS_ARGUMENTS`` tokens: the
        lower-cased topic tokens followed by the lower-cased argument tokens,
        each part truncated to its maximum length and right-padded with
        ``'<PAD>'``.
    """
    # Truncate before padding: multiplying a list by a negative count yields
    # an empty list, so an over-long text would otherwise pass through
    # untrimmed and break the fixed sequence length.
    arg_token = word_tokenize(arg.lower())[:PADDINGS_ARGUMENTS]
    arg_topic = word_tokenize(topic.lower())[:PADDINGS_TOPICS]
    arg_topic += ['<PAD>'] * (PADDINGS_TOPICS - len(arg_topic))
    arg_token += ['<PAD>'] * (PADDINGS_ARGUMENTS - len(arg_token))
    return arg_topic + arg_token

def predict(arguments, topics):
    """Score a batch of arguments with the trained model.

    Parameters
    ----------
    arguments : sequence of str
        Argument texts.
    topics : sequence of str
        Topic texts; ``topics[i]`` is the topic of ``arguments[i]``.

    Returns
    -------
    The output of ``model.predict`` for the embedded batch (one row per
    argument).
    """
    # Tokenise and pad every (topic, argument) pair first ...
    token_rows = [
        prepare_data(topics[idx], arguments[idx])
        for idx in range(len(arguments))
    ]
    # ... then embed each token list and stack the matrices into one batch.
    embedded = np.array(
        [sentence_to_vec(tokens, model_embedding) for tokens in token_rows]
    )
    return model.predict(embedded, verbose=0)
    

In [104]:
# Load the test split
df_test = pd.read_csv("hf://datasets/ibm-research/argument_quality_ranking_30k/" + splits["test"])
argument_test = list(df_test.argument)
topic_test = list(df_test.topic)
# Column vector (n, 1) so the targets line up element-wise with the
# single-column predictions of the model.
note_test = np.array(list(df_test.WA)).reshape(-1, 1)

In [None]:
# Evaluate on the test split batch by batch and report the mean absolute error.
predictions = []
batch_size = 32
# Evaluation is capped: the loop ends after the first batch whose start offset
# is >= stop (with batch_size=32 and stop=200 that is 8 batches, 256 samples).
stop = 200
abs_error_sum = 0.0  # sum of |prediction - target| over all samples seen
n_evaluated = 0      # number of samples seen
for i in range(0, len(topic_test), batch_size):
    batch_topic = topic_test[i:i + batch_size]
    batch_argument = argument_test[i:i + batch_size]
    y_preds = predict(batch_argument, batch_topic)
    predictions.extend(y_preds)
    # Accumulate per-sample errors so a smaller final batch is weighted by its
    # size instead of counting as much as a full batch.
    abs_error_sum += float(np.sum(np.abs(y_preds - note_test[i:i + batch_size])))
    n_evaluated += len(y_preds)
    print("Batch {}: MAE = {}".format(i // batch_size, abs_error_sum / n_evaluated))
    if i >= stop:
        break
# Report NaN rather than failing when no sample was evaluated.
mae = abs_error_sum / n_evaluated if n_evaluated else float("nan")
print("MAE = {}".format(mae))


    

Batch 0: MAE = 0.1348197033131609
Batch 1: MAE = 0.12478841887783385
Batch 2: MAE = 0.1258744259654185
Batch 3: MAE = 0.12368547179374362
Batch 4: MAE = 0.12532738973574295
Batch 5: MAE = 0.12332791638120875
Batch 6: MAE = 0.12527829892735307
Batch 7: MAE = 0.1277237428530481
MAE = 0.1277237428530481


In [91]:
# Sanity check: score one hand-written (argument, topic) pair with the model.
arg = "a zero tolerance policy means that parents would give complete control of discipline to the school without any regard for family morals and teachings."
top = "We should adopt a zero-tolerance policy in schools"
predict([arg], [top])

[['we', 'should', 'adopt', 'a', 'zero-tolerance', 'policy', 'in', 'schools', '<PAD>', '<PAD>', 'a', 'zero', 'tolerance', 'policy', 'means', 'that', 'parents', 'would', 'give', 'complete', 'control', 'of', 'discipline', 'to', 'the', 'school', 'without', 'any', 'regard', 'for', 'family', 'morals', 'and', 'teachings', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']]
ze

array([[0.88045317]], dtype=float32)

# Data Viz

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Placeholder predictions: 1000 uniform random values in [0, 1), drawn from a
# seeded generator so the figure is identical on every run.
# NOTE(review): these are not model outputs even though the title mentions the
# LSTM — swap in the real predictions before drawing conclusions.
rng = np.random.default_rng(42)
preds = rng.random(1000)
# Bucket each value into its 0.1-wide class (0..9, or 10 for exactly 1.0).
classes = np.floor(preds * 10).astype(int)

fig, ax = plt.subplots()
# Bin edges at -0.5, 0.5, ..., 10.5 centre one bar on each integer class.
ax.hist(classes, bins=np.arange(12) - 0.5, edgecolor='black', rwidth=0.8)
ax.set_xticks(range(11))
ax.set_xlabel("Classe de prédiction (intervalle de 0.1)")
ax.set_ylabel("Nombre de prédictions")
ax.set_title("Distribution des classes de prédictions (LSTM)")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()
