In [35]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np

In [21]:
from tqdm import tqdm
# Register tqdm's pandas integration; the later text-cleaning cell relies on
# the `progress_apply` method this adds to pandas objects.
tqdm.pandas()

In [2]:
# Shell command run from the notebook: download the French spaCy model package.
! python -m spacy download fr_core_news_sm
import fr_core_news_sm

# Load the French pipeline once; `nlp` is the object clean_text() calls on each text.
nlp = fr_core_news_sm.load()

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Google Drive folder holding the notebook's input data (note the trailing "/":
# file names are appended to this string directly).
path = "/content/drive/MyDrive/Paris 0924 - Data Analyst/Live_Coding/s13_nlp/text_similarity/"

In [5]:
# Load the cleaned recipes table from the data folder.
df_ingredients = pd.read_csv(f"{path}ingredients_clean.csv")

In [6]:
# Preview the first rows of the freshly loaded table as a Markdown table.
print(df_ingredients.head().to_markdown())

|    | id                       | name                                              | URL                                             | Description                                                                             |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | ingredients_name                                                                          |
|---:|:-------------------------|:-----------------------

In [7]:
print(df_ingredients.shape) # (number of rows, number of columns)

(1838, 9)


In [8]:
print(df_ingredients.id.nunique()) # number of distinct recipe ids

1838


In [9]:
# Preview the first rows again (same statement as the earlier preview cell).
print(df_ingredients.head().to_markdown())

|    | id                       | name                                              | URL                                             | Description                                                                             |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | ingredients_name                                                                          |
|---:|:-------------------------|:-----------------------

In [10]:
# Count missing values per column; the columns used by the model should have none.
print(df_ingredients.isna().sum())

id                                    0
name                                  0
URL                                   0
Description                           0
Cooking time                        126
Preparation extra time per cover     97
Covers count                          0
ingredients                           0
ingredients_name                      0
dtype: int64


In [11]:
# Build one free-text field per recipe, holding all the text needed for
# keyword-based recommendation. The relevant columns are name, Description
# and ingredients_name; they are joined with " ; " between them.
df_ingredients['all_text'] = df_ingredients['name'].str.cat(
    [df_ingredients['Description'], df_ingredients['ingredients_name']],
    sep=" ; ",
)

In [12]:
# Preview the first rows to check the newly added `all_text` column.
print(df_ingredients.head().to_markdown())

|    | id                       | name                                              | URL                                             | Description                                                                             |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | ingredients_name                                                                          | all_text                                                 

In [18]:


def clean_text(text):
    """Normalize a text into a space-separated string of lemmas.

    The text is lowercased and parsed with the module-level spaCy pipeline
    ``nlp``. A token is kept only if it is purely alphabetic, is not a stop
    word and does not look like a number; each kept token contributes its
    lemma. No minimum token length is enforced.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example the NaN that
        pandas yields for a missing text cell) is treated as empty text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces, without leading or trailing
        whitespace; ``""`` when the input is missing, blank, or nothing is kept.
    """
    # Missing or blank input: nothing to parse (and a float NaN has no .lower()).
    if not isinstance(text, str) or not text.strip():
        return ""

    # Lowercase first so that e.g. "Good" and "good" become equivalent.
    doc = nlp(text.lower())

    # Keep alphabetic, non-stop-word, non-numeric tokens and take their lemma.
    lemmas = [
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop and not token.like_num
    ]

    # Join once instead of growing a string in the loop, then collapse any run
    # of whitespace. The pattern is a raw string: "\s" in a plain string
    # literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", " ".join(lemmas)).strip()


In [20]:
# Try the cleaner on a single recipe (row label 1): print the raw text first,
# then the cleaned version, to compare them by eye.
ex = df_ingredients.loc[1,"all_text"]
print(ex)
text_clean = clean_text(ex)
print(text_clean)

Loaded eggs fries ; Des frites croustillantes recouvertes de fromage fondant, de bacon & d'un œuf au plat ! ; Frites surgelées,Œuf,Lard (tranches),Cheddar (tranches),Ciboulette
loaded eggs frie frite croustillant recouverte fromage fondre bacon œuf plat frit surgeler œuf lard 


In [22]:

# Clean every recipe text with clean_text(); progress_apply shows a tqdm
# progress bar and requires tqdm.pandas() to have been called beforehand.
df_ingredients['all_text_clean'] = df_ingredients['all_text'].progress_apply(clean_text)

100%|██████████| 1838/1838 [00:35<00:00, 51.07it/s]


In [23]:
# Preview the first rows to check the newly added `all_text_clean` column.
print(df_ingredients.head().to_markdown())

|    | id                       | name                                              | URL                                             | Description                                                                             |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | ingredients_name                                                                          | all_text                                                 

In [28]:
# Same preview once more (this cell repeats the previous preview statement).
print(df_ingredients.head().to_markdown())

|    | id                       | name                                              | URL                                             | Description                                                                             |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | ingredients_name                                                                          | all_text                                                 

In [25]:
# TF-IDF vectorizer created with its default settings (no arguments passed).
tfidf = TfidfVectorizer()

In [26]:
# Learn the vocabulary on the cleaned texts and turn each recipe into a TF-IDF vector.
X_tfidf = tfidf.fit_transform(df_ingredients["all_text_clean"])

In [29]:
# Pairwise cosine similarity between all recipes (Y omitted: compares X with itself).
cos_sim = cosine_similarity(X_tfidf)

In [30]:
# Display the similarity matrix (row i / column j = similarity of recipes i and j).
cos_sim

array([[1.        , 0.04957372, 0.08244531, ..., 0.04768244, 0.02957583,
        0.02988163],
       [0.04957372, 1.        , 0.02797276, ..., 0.01932518, 0.0195348 ,
        0.        ],
       [0.08244531, 0.02797276, 1.        , ..., 0.01100637, 0.01112575,
        0.01124079],
       ...,
       [0.04768244, 0.01932518, 0.01100637, ..., 1.        , 0.04401421,
        0.02649913],
       [0.02957583, 0.0195348 , 0.01112575, ..., 0.04401421, 1.        ,
        0.25199356],
       [0.02988163, 0.        , 0.01124079, ..., 0.02649913, 0.25199356,
        1.        ]])

In [34]:
# Which recommendations should be made for the recipe
# "Rolls saumon & avocat"?
input_ = "Rolls saumon & avocat"
# Row positions (not index labels) whose name matches exactly: the value is
# meant to address a row of the similarity matrix, which is positional.
positions_input = np.flatnonzero((df_ingredients["name"] == input_).to_numpy())
if positions_input.size == 0:
    # Fail with an explicit message instead of an opaque IndexError on [0].
    raise ValueError(f"No recipe named {input_!r} in df_ingredients")
info_input = df_ingredients.iloc[positions_input]
# If several recipes share this name, the first one is used.
indice_input = int(positions_input[0])
print(indice_input)

2


In [42]:
# Similarity of the query recipe to every recipe (one row of the matrix).
scores = cos_sim[indice_input]
max_reco = 5
# Rank all recipes from most to least similar, then remove the query recipe
# explicitly. Simply skipping the first ranked entry is wrong whenever another
# recipe ties with the query at the top (e.g. identical cleaned text): the
# query itself could then be recommended and the true match dropped.
ranked = np.argsort(scores)[::-1]
indices_reco = ranked[ranked != indice_input][:max_reco]
print(indices_reco)
# Positions -> rows. Show all max_reco rows (a bare head() would cap at 5).
df_reco = df_ingredients.iloc[indices_reco]
print(df_reco.head(max_reco).to_markdown())

[1127 1102 1179 1280  831]
|      | id                       | name                        | URL                                             | Description                                                                        |   Cooking time |   Preparation extra time per cover |   Covers count | ingredients                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | ingredients_name                                                                                             | all_text                       

# embedding models

In [None]:
# TODO (original note: "créer en fonction") — presumably: wrap this in a function.
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")