In [1]:
import pandas as pd
import numpy as np

In [2]:
# Accessing this dataset requires Hugging Face authentication
# (e.g. run `huggingface-cli login` beforehand).
splits = {
    'train': 'train.csv',
    'validation': 'dev.csv',
    'test': 'test.csv',
}
# Only the training split is loaded here.
df = pd.read_csv(f"hf://datasets/ibm-research/argument_quality_ranking_30k/{splits['train']}")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import pandas as pd
import io
import re

# Normalise typographic (curly) double quotes to plain ASCII quotes so the
# quote-aware separator below treats them like regular quoting.
with open("./datasets_chatgpt.csv", encoding="utf-8") as f:
    raw = f.read().replace("“", '"').replace("”", '"')

# The separator only matches a comma that is followed by an even number of
# double quotes in the remainder of the line, i.e. a comma that is not inside
# a quoted field. A regex separator needs the python parsing engine.
df_augmented_chat_gpt = pd.read_csv(
    io.StringIO(raw),
    sep=r',(?=(?:[^"]*"[^"]*")*[^"]*$)',
    engine="python"
)

# The text columns may still carry their surrounding quotes after the regex
# split. Whitespace is stripped *before* the quotes so that a field such as
# ` "text"` (space after the separator) is unquoted too, then once more to
# drop any padding that was inside the quotes.
for col in ["argument", "topic"]:
    df_augmented_chat_gpt[col] = (
        df_augmented_chat_gpt[col]
        .str.strip()
        .str.strip('"')
        .str.strip()
    )

# Sanity check: preview the first rows
df_augmented_chat_gpt.head()

Unnamed: 0,argument,topic
0,Social media platforms have fueled the spread ...,Social media is harmful to society
1,Online networks encourage shallow relationship...,Social media is harmful to society
2,The addictive design of social media steals ti...,Social media is harmful to society
3,Social media allows people to connect across v...,Social media is harmful to society
4,These platforms can be powerful tools for educ...,Social media is harmful to society


In [12]:
# Merge both corpora (original rows first, ChatGPT-augmented rows after) so
# that `argument` and `topic` stay aligned by position.
argument = [*df.argument, *df_augmented_chat_gpt.argument]
topic = [*df.topic, *df_augmented_chat_gpt.topic]

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
# Minimum token counts after padding; longer sequences are left untouched.
PADDINGS_TOPICS = 10
PADDINGS_ARGUMENTS = 100

nltk.download('punkt')


def _tokenize_and_pad(text, min_length):
    """Lower-case and tokenize `text`, right-padding with '<PAD>' up to `min_length` tokens."""
    tokens = word_tokenize(text.lower())
    # A list multiplied by a non-positive count is empty, so sequences that
    # are already long enough receive no padding (and are not truncated).
    return tokens + ['<PAD>'] * (min_length - len(tokens))


# One training sentence per example: padded topic tokens followed by the
# padded argument tokens.
data = [
    _tokenize_and_pad(topic[position], PADDINGS_TOPICS)
    + _tokenize_and_pad(argument[position], PADDINGS_ARGUMENTS)
    for position in range(len(argument))
]

model = Word2Vec(
    sentences=data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Show the first ten vocabulary entries as a quick sanity check.
print("Exemple de vocabulaire :", list(model.wv.key_to_index)[:10])

[nltk_data] Downloading package punkt to /home/dimitri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Exemple de vocabulaire : ['<PAD>', 'should', 'we', '.', 'the', 'to', 'and', 'of', 'a', 'be']


In [14]:
test_data = pd.Series(["What"])

# Lower-case and tokenize every test sentence
test_sentences = [word_tokenize(text.lower()) for text in test_data]

def sentence_vector(sentence, model):
    """
    Average the embeddings of the in-vocabulary tokens of a sentence.

    `sentence` is an iterable of tokens; tokens absent from `model.wv` are
    skipped. Returns the element-wise mean of the remaining vectors, or
    None when no token of the sentence is in the vocabulary.
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Nothing to average: signal it with None rather than a vector.
    if not known_vectors:
        return None
    return np.mean(known_vectors, axis=0)

# Compute the mean vector of every sentence in the test dataset
# (an entry is None when none of the sentence's tokens is in the vocabulary)
test_vectors = [sentence_vector(sentence, model) for sentence in test_sentences]

In [15]:
def decode_embedding(embedding, model, topn=10):
    """
    Look up the vocabulary entries closest to an embedding vector.

    Delegates to `model.wv.similar_by_vector` and returns its result
    unchanged, limited to `topn` entries.
    """
    keyed_vectors = model.wv
    nearest = keyed_vectors.similar_by_vector(embedding, topn=topn)
    return nearest

# Decode the mean vector of the first test sentence
decoded_words = decode_embedding(test_vectors[0], model)

# Display the most similar words
print("Mots les plus similaires au vecteur de la phrase Test :", decoded_words)

Mots les plus similaires au vecteur de la phrase Test : [('what', 1.0000001192092896), ('whatever', 0.7845554947853088), ('how', 0.7578881978988647), ('going', 0.7202796936035156), ('something', 0.7185078263282776), ('nobody', 0.6992220282554626), ('everything', 0.6940007209777832), ('someone', 0.6769415140151978), ('passionate', 0.659133791923523), ('th', 0.6571049094200134)]


In [16]:
def synonyms(word, model, topn=10):
    """
    Return the nearest Word2Vec neighbour of `word` other than `word` itself.

    Parameters:
        word: vocabulary entry to look up. The lookup `model.wv[word]` is not
            guarded here, so callers must make sure the word is in the
            vocabulary (gensim raises KeyError otherwise).
        model: trained model exposing the keyed vectors as `model.wv`.
        topn: number of neighbours to consider besides the word itself.

    Returns:
        The closest neighbour that differs from `word`; `word` itself when
        the search yields no other candidate.
    """
    # The query word is normally its own nearest neighbour, so ask for one
    # extra result; this also keeps `topn=1` from running out of candidates.
    candidates = decode_embedding(model.wv[word], model, max(topn, 1) + 1)

    # Skip the query word wherever it is ranked instead of assuming it always
    # sits at index 0.
    for candidate, _similarity in candidates:
        if candidate != word:
            return candidate

    # No other vocabulary entry available: fall back to the word itself.
    return word
def sentence_synonyms(sentence, model, topn=10):
    """
    Map every word of a sentence to its Word2Vec synonym.

    The sentence is lower-cased and split on whitespace. Words present in
    `model.wv` are replaced by `synonyms(word, model, topn)`; words missing
    from the vocabulary are kept as they are. Returns the resulting list of
    words, in sentence order.
    """
    return [
        synonyms(token, model, topn) if token in model.wv else token
        for token in sentence.lower().split()
    ]

In [17]:
# Demo: replace each in-vocabulary word of the sentence by its synonym
sentence_synonyms("how old are you", model, 10)

['whatever', 'perfect', 'were', 'they']