# In[2]:
import string
import spacy
from gensim.models import keyedvectors
import json
import numpy as np

# In[6]:
class Word2Vec:
    """Rank the configured categories against a text using word vectors.

    A text is lemmatised with spaCy, stop words and punctuation are dropped,
    the remaining lemmas are averaged into a single vector, and that vector
    is compared to each category's word vector by cosine similarity.

    Attributes:
        path_model: Path of the word2vec text file read by ``train``.
        nlp: spaCy pipeline loaded from ``fr_core_news_sm``.
        categories: Value of the ``"categories"`` key of the categories JSON
            file; iterated as words that are looked up in the model.
        model_trained: Vectors loaded by ``train``; ``None`` until then.
    """

    def __init__(self, size):
        """Load the spaCy pipeline and the category list.

        Args:
            size: Value substituted into the model file name
                (``frwiki_20180420_{size}d.txt.bz2``).
        """
        # Raw f-string: in a plain string "\f" is a form-feed escape, which
        # silently corrupted the "\frwiki..." part of the path.
        self.path_model = rf"..\models\frwiki_20180420_{size}d.txt.bz2"
        self.model_trained = None
        self.nlp = spacy.load("fr_core_news_sm")
        path_categories = r"..\categories.json"
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(path_categories, 'r', encoding="utf-8") as file:
            categories_json = json.load(file)
        self.categories = categories_json['categories']

    @property
    def trained(self):
        """Alias of ``model_trained``.

        The methods historically referred to the model as ``self.trained``
        while ``train`` stored it in ``model_trained``; both names now point
        at the same object.
        """
        return getattr(self, "model_trained", None)

    @trained.setter
    def trained(self, model):
        self.model_trained = model

    def _require_model(self):
        """Return the loaded vectors, or raise if ``train`` has not run."""
        model = getattr(self, "model_trained", None)
        if model is None:
            raise RuntimeError("Model not loaded: call train() first.")
        return model

    def train(self):
        """Load the word vectors from ``path_model`` (word2vec text format)."""
        self.model_trained = keyedvectors.load_word2vec_format(
            self.path_model, binary=False
        )

    def my_doc_2_vec(self, mots, trained=None):
        """Return the mean vector of the words of ``mots`` known to the model.

        Args:
            mots: Iterable of words.
            trained: Vector store supporting ``trained[word]`` and exposing
                ``vectors.shape[1]``. Defaults to the model loaded by
                ``train``.

        Returns:
            The average of the vectors of the words found in the model, or a
            zero vector when none of the words is found.
        """
        model = trained if trained is not None else self._require_model()
        dim = model.vectors.shape[1]
        total = np.zeros(dim)
        found = 0
        for mot in mots:
            try:
                values = model[mot]
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                continue
            total = total + values
            found += 1
        if found > 0:
            total = total / found
        return total

    def get_vect(self, texte):
        """Lemmatise ``texte`` and return the mean vector of its lemmas.

        Lemmas that are spaCy stop words or single punctuation characters
        are discarded before averaging.
        """
        model = self._require_model()
        doc = self.nlp(texte)
        stop_words = self.nlp.Defaults.stop_words
        # Built once instead of once per token.
        punctuation = set(string.punctuation)
        mots_filtres = [
            token.lemma_
            for token in doc
            if token.lemma_ not in stop_words
            and token.lemma_ not in punctuation
        ]
        return self.my_doc_2_vec(mots_filtres, model)

    def dist(self, word_vector1, word_vector2):
        """Return the cosine similarity of two vectors.

        Despite the name this is a similarity: larger means closer. When
        either vector has zero norm the similarity is undefined; ``0.0`` is
        returned instead of NaN so that results stay sortable.
        """
        norm_product = np.linalg.norm(word_vector1) * np.linalg.norm(word_vector2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(word_vector1, word_vector2) / norm_product)

    def get_sorted_cats(self, texte):
        """Return the categories sorted from most to least similar to ``texte``.

        Returns:
            A list of ``(category, similarity)`` tuples in descending order
            of similarity; ties keep the order of ``self.categories``.

        Raises:
            KeyError: If a category is not a key of the model.
            RuntimeError: If ``train`` has not been called.
        """
        model = self._require_model()
        vec = self.get_vect(texte)
        scored = [(cat, self.dist(model[cat], vec)) for cat in self.categories]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored