In [1]:
import pandas as pd
import seaborn as sns
from pathlib import Path
from nltk.corpus import wordnet
import nltk
import re
from collections import Counter
import string

In [2]:
syns=[synset.lemma_names('fra') for synset in wordnet.synsets('bien', lang='fra')]

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("gilf/french-camembert-postag-model")
model = AutoModelForTokenClassification.from_pretrained("gilf/french-camembert-postag-model")

nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)
nlp_token_class('Face à un choc inédit, les mesures mises en place par le gouvernement ont permis une protection forte et efficace des ménages')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'U',
  'score': 0.99467033,
  'word': 'Face',
  'start': 0,
  'end': 4},
 {'entity_group': 'P', 'score': 0.99961555, 'word': 'à', 'start': 4, 'end': 6},
 {'entity_group': 'DET',
  'score': 0.9995907,
  'word': 'un',
  'start': 6,
  'end': 9},
 {'entity_group': 'NC',
  'score': 0.99955326,
  'word': 'choc',
  'start': 9,
  'end': 14},
 {'entity_group': 'ADJ',
  'score': 0.9991835,
  'word': 'inédit',
  'start': 14,
  'end': 21},
 {'entity_group': 'P',
  'score': 0.37106535,
  'word': ',',
  'start': 21,
  'end': 22},
 {'entity_group': 'DET',
  'score': 0.99959034,
  'word': 'les',
  'start': 22,
  'end': 26},
 {'entity_group': 'NC',
  'score': 0.99956495,
  'word': 'mesures',
  'start': 26,
  'end': 34},
 {'entity_group': 'VPP',
  'score': 0.99886703,
  'word': 'mises',
  'start': 34,
  'end': 40},
 {'entity_group': 'P',
  'score': 0.9996246,
  'word': 'en',
  'start': 40,
  'end': 43},
 {'entity_group': 'NC',
  'score': 0.99953294,
  'word': 'place',
  'start': 43,
  

In [19]:
df = pd.read_csv('datasets/sentence_dataset.csv', sep='|', index_col=0).dropna().sample(30)

In [20]:
def cleaning(text):
    text = re.sub("~@~Y", "'", text)
    text = re.sub( "(@\w*\\b\s?|#\w*\\b\s?|&\w*\\b\s?|\n\s?|\\\\|\<|\>|\||\*)", "", text)
    text = re.sub("\/", "", text)
    text = re.sub("l'", "le ", text)
    text = re.sub("d'", "de ", text)
    text = re.sub("j'", "je ", text)
    text = re.sub("qu'", "que ", text)
    text = re.sub("t'", "te ", text)
    text = re.sub("c'", "ce ", text)
    text = text.lower()
    text = text.strip()
    text = re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub( ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return (text)

In [21]:
df['cleaned'] = df.text.apply(cleaning)
df['tokenized'] = df.cleaned.apply(tokenizer.tokenize)

In [22]:
df_proust = df[df.label == 1]
df_news = df[df.label == 0]

In [78]:
def to_freq_table(dataframe):
    out = pd.DataFrame.from_dict(Counter(dataframe.tokenized.explode().to_list()), orient='index')
    out = out.rename(columns={'index':'token', 0:"freq"}).sort_values('freq', ascending=False)
    return out

In [58]:
proust = to_freq_table(df_proust)
news = to_freq_table(df_news)

In [25]:
def get_pos(word):
    pass

def get_right_syns(word):
    pass

def get_most_common_corpus_syn(word):
    pass

def transfer_style_sentence(sentence):
    decomposition = nlp_token_class(sentence)
    for word in decomposition:
        

IndentationError: expected an indented block (4003056971.py, line 14)

In [49]:
# TODO lemmatize then delemmatize ?
from nltk.corpus import wordnet as wn

In [47]:
def get_corres_pos(camembert_pos):
    if camembert_pos == 'NC':
        return wn.NOUN
    if camembert_pos[0] == 'V':
        return wn.VERB
    if camembert_pos[:3] == 'ADJ':
        return wn.ADJ
    if camembert_pos[:3] == 'ADV':
        return wn.ADV
    return None

In [71]:
news

Unnamed: 0,freq
▁livres,1
▁he,1
b,1
do,1
▁oct,1
...,...
icité,1
▁vigueur,1
▁secteur,1
▁bien,1


In [95]:
sentence = df.iloc[0].cleaned
tagged = nlp_token_class(sentence)
new_sentence = []
for word in tagged:
    pos = get_corres_pos(word['entity_group'])
    if pos is None:
        new_sentence.append(word['word'])
    else:
        syns = [synset.lemma_names('fra') for synset in wordnet.synsets(word['word'], lang='fra', pos=pos)]
        if len(syns) == 0:
            new_sentence.append(word['word'])
        else:
            syns = list(set([word for syn in syns for word in syn]))
            syns = {''.join(tokenizer.tokenize(word)): word for word in syns}
            good_tokens = news.index.intersection([syn for syn in syns.keys()])
            if len(good_tokens) == 0:
                new_sentence.append(word['word'])
            else:
                most_common_syn = news.loc[good_tokens].iloc[0].name
                new_sentence.append(syns[most_common_syn])
print(sentence)
print(' '.join(new_sentence))

 livres hebdo october des pseudonymes et du second degré les contributeurs dont le crayon porte dordinaire plutôt à gauche ont tous retourné leurs vestes pour loccasion et se sont mis dans la peau de fervents supporters de nicolas sarkozy à quelques semaines des primaires
livres hebdo october des pseudonymes et du second degré les contributeurs dont le crayon porte dordinaire plutôt à gauche ont tous retourné leurs vestes pour loccasion et se sont mis dans la peau de fervents supporters de nico las sarkozy à quelques semaines des primaires


In [None]:
.index.item()

In [43]:
wordnet.synsets?

In [None]:
wn.VERB)
[Synset('chase.v.01')]

The other parts of speech are NOUN, ADJ and ADV