In [1]:
import os
import pickle
import re

import google.generativeai as genai
import numpy as np
import pandas as pd
import polars as pl
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from tqdm import tqdm

# Read the Lexique database

In [2]:
# The Lexique database can be downloaded from http://www.lexique.org/
lexique = pd.read_csv("\\Lexique383.tsv",sep='\t')
# Select only words containing the sound /ɛ̃/ (written '5' in the 'phon' column).
# regex=False: '5' is a plain character, not a pattern.
# na=False: rows with a missing 'phon' value count as "no match"; otherwise the
# mask would contain NaN and boolean indexing with .loc would raise.
has_nasal = lexique['phon'].str.contains('5', regex=False, na=False)
# Keep a single row per lemma, then renumber the rows from 0.
sub = lexique.loc[has_nasal].drop_duplicates('lemme').reset_index(drop=True)
# Orthographic forms of the selected rows.
words = sub['ortho'].values

# Get emotional valence

In [None]:
# Load the Gemini model from Google: https://ai.google.dev/gemini-api?hl=fr
# The API key is read from the GOOGLE_API_KEY environment variable so that the
# secret is never stored in (or committed with) the notebook.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
# Prompt wording is based on this recent paper:
# https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=Could+large+language+models+estimate+valence+of+words%3F+A+small+ablation+study.+Proceedings+of+CBIC.&btnG=

# French instruction asking for a single digit from 1 (very negative) to
# 9 (very positive); the word to rate is appended right after the final colon.
prompt = (
    "Dans quelle mesure ce mot est-il négatif ou positif sur une échelle de 1 à 9 ? "
    "Réponds uniquement avec un chiffre, 1 étant « très négatif » et 9 « très positif ». "
    "Voici le mot : "
)

In [None]:
# Ask the model for a 1-9 valence rating of every selected word.
# `valences` only receives the successful ratings (kept as the digit string found
# in the answer). Every word that could not be rated is recorded in the `fails_*`
# lists, so `fails_idxs` is needed to re-align `valences` with `words`.
valences = []
fails_mots = []   # words without a usable rating
fails_defs = []   # the raw responses obtained for those words
fails_idxs = []   # positions of those words in `words`
for num, mot in tqdm(enumerate(words), total=len(words)):
    response = model.generate_content(prompt+str(mot))
    try:
        # first run of digits in the answer
        valences.append(re.findall(r'\d+', response.text)[0])
    except (IndexError, ValueError):
        # IndexError: the answer contains no digit.
        # ValueError: `response.text` is unavailable (e.g. the answer was blocked).
        fails_mots.append(mot)
        fails_defs.append(response)
        fails_idxs.append(num)


In [None]:
# Bundle the ratings with the failure bookkeeping and serialise everything
# into a single pickle file.
moby = dict(
    valences=valences,
    fails_mots=fails_mots,
    fails_defs=fails_defs,
    fails_idxs=fails_idxs,
)
with open("valences.pickle", 'wb') as pickle_file:
    pickle.dump(moby, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)


# Get example sentences to compute embeddings

In [None]:
# Build the full sentence database from
# https://huggingface.co/datasets/La-matrice/french_sentences_19M
# The sentences containing the selected words are picked from it later, so that
# an averaged embedding can be computed for each word.
path = "\\sentences_dataset\\"
files = ["0000.parquet","0001.parquet","0002.parquet","0003.parquet","0004.parquet"]
# one dataframe per parquet shard
database = [pd.read_parquet(path+shard, engine='pyarrow') for shard in files]
# merge the shards, drop repeated sentences and renumber the rows
finaldatabase = (
    pd.concat(database)
    .drop_duplicates('text')
    .reset_index(drop=True)
)
# pad every sentence with one leading and one trailing space
finaldatabase['text'] = ' '+finaldatabase['text']+' '
# polars copy of the same table, used for faster string search
dfi =  pl.from_dataframe(finaldatabase)

In [None]:
# Quite time consuming (~ 1 hour).
# For every selected word, collect all the sentences of the database whose text
# contains it, and store the result as one long table (one row per word/sentence).

total_indexes = []  # per word: 1-based running index of its sentences
total_text = []     # per word: the matching sentences
total_word = []     # per word: the word repeated once per matching sentence
outs = []           # words for which no sentence was found

for curname in tqdm(words):

    # literal=True: the word is searched as plain text, not as a regular
    # expression, so characters such as '.', '(' or '+' cannot alter the match.
    # NOTE(review): this is a substring search (the word may match inside a longer
    # word) even though the sentences were padded with spaces; confirm whether
    # whole-word matching was intended.
    mask = dfi.select(pl.col("text").str.contains(curname, literal=True))
    curdat = finaldatabase[np.array(mask['text'].to_list())]

    if len(curdat)==0:
        outs.append(curname)
    else:
        naames = np.repeat(curname,len(curdat))
        iid = np.arange(1,len(curdat)+1)
        total_indexes.append(iid)
        total_text.append(curdat['text'].values)
        total_word.append(naames)

# np.concatenate raises on an empty list, so fall back to empty columns when no
# word matched any sentence.
dico = {
    "name": np.concatenate(total_word) if total_word else np.array([], dtype=object),
    "individual_index": np.concatenate(total_indexes) if total_indexes else np.array([], dtype=int),
    "sentence": np.concatenate(total_text) if total_text else np.array([], dtype=object),
}
dff = pd.DataFrame.from_dict(dico)
dff.to_csv('multiple_sentences_for_embeddings.csv',index=False)

# Get averaged embeddings

In [None]:
# Load the sentence-transformers model used to compute the embeddings.
# NOTE: this rebinds `model`, the name an earlier cell assigned to the Gemini model.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L12-v2')

In [None]:
# Take only ten of the sentences for fast action!
# Compute, for each word, the average contextual embedding of its token(s) over
# at most MAX_SENTENCES example sentences.
MAX_SENTENCES = 10
rng = np.random.default_rng(42)  # fixed seed so the sentence sample is reproducible

# Sentences collected for each word, as written by the sentence-search cell.
# keep_default_na=False keeps strings such as "nan" or "null" as text, not NaN.
data = pd.read_csv('multiple_sentences_for_embeddings.csv', keep_default_na=False)

# The tokenizer does not depend on the word: fetch it once.
tokenizer = model._first_module().tokenizer

fails = []         # per kept word: positions (within its sample) of unusable sentences
mean_embedds = []  # per kept word: mean embedding, always a 1-D vector
len_embedds = []   # per kept word: number of sentences that contributed
whoo = []          # the kept words, aligned with the three lists above
skipped = []       # words left out because no embedding could be computed

for word in tqdm(words):
    indiv = data.loc[data['name']==word].reset_index(drop=True)
    if len(indiv) == 0:
        # no example sentence for this word: nothing to average
        skipped.append(word)
        continue
    if len(indiv) > MAX_SENTENCES:
        # sample WITHOUT replacement so no sentence is counted twice
        picked = rng.choice(len(indiv), size=MAX_SENTENCES, replace=False)
        indiv = indiv.iloc[picked]

    # tokens of the word on its own, minus the first and last (special) tokens
    needed = tokenizer.convert_ids_to_tokens(tokenizer.encode(str(word)))[1:-1]
    n = len(needed)

    fail = []
    embs = []
    for num in range(len(indiv)):
        # get each sentence
        sent = indiv.iloc[num]['sentence']
        # one embedding per token of the sentence
        embeddings = model.encode(sent,output_value = "token_embeddings")
        sent_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sent))
        # make sure there is no discrepancy between embeddings and tokens
        if len(embeddings) != len(sent_tokens):
            fail.append(num)
            continue

        # locate the word's token sequence in the sentence; `start` is reset for
        # every sentence so a position found earlier can never be reused.
        # No `break`: when the word occurs several times, the last occurrence wins.
        start = None
        for i in range(len(sent_tokens)-n + 1):
            if sent_tokens[i:i + n] == needed:
                start = i
        if start is None:
            fail.append(num)
            continue

        # .cpu() makes the conversion work when the model runs on a GPU.
        span = embeddings[start:start + n].detach().cpu().numpy()
        # Average over the token axis in every case, so single-token and
        # multi-token words both give a vector of the same shape.
        embs.append(span.mean(axis=0))

    if not embs:
        # the mean of an empty list would be NaN and break the later stacking
        skipped.append(word)
        continue

    fails.append(fail)
    mean_embedds.append(np.mean(embs,0))
    len_embedds.append(len(embs))
    whoo.append(word)

In [None]:
# Convert the list of per-word mean embeddings into a single NumPy array
# (presumably one row per word; this requires every vector to have the same shape).
all_embedds = np.asarray(mean_embedds)

In [None]:
# Run t-SNE to reduce the embedding vectors to two dimensions.
tsne = TSNE(n_components=2, random_state=42)
v2d = tsne.fit_transform(all_embedds)
# Create a dataframe holding each word and its 2-D coordinates.
# (The projection is stored in `v2d`; the previous `v2d2` name was never defined.)
dic2 = {
    'x': v2d[:, 0],
    'y': v2d[:, 1],
    'names': whoo}

datadf2 = pd.DataFrame.from_dict(dic2)