In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
nltk.download('wordnet')
from sklearn.neighbors import NearestNeighbors
import nltk

In [3]:
df = pd.read_csv('Ordered_NLP_preprocessed_df.csv')

In [5]:
df.columns

Index(['Id', 'Processed_nom_festival', 'Processed_Type', 'Processed_Region',
       'Processed_Ville', 'Annee', 'Procced_musique',
       'Processed_Spectacle_vivant', 'Processed_Cinema_audiovisuel',
       'Processed_Livre_litterature', 'Geocode', 'Site_internet'],
      dtype='object')

In [6]:
# Nettoyaer les colonnes et créer la colonne 'Soop' qui contient les parametres souhaités pour notre entrainement

# Fonction pour nettoyer les valeurs en excluant les chiffres et les virgules
def clean_value(value):
    if pd.isna(value):  # Vérifie si la valeur est NaN
        return ""
    cleaned_value = re.sub(r'[\d,]', '', str(value))  # Supprime les chiffres et les virgules
    return cleaned_value.strip()  # Supprime les espaces en début et fin de chaîne

# Colonnes à utiliser pour créer la colonne 'Soop'
colonnes_utilisees = ['Processed_nom_festival', 'Processed_Type', 'Processed_Region',
                      'Processed_Ville', 'Procced_musique', 'Processed_Spectacle_vivant',
                      'Processed_Cinema_audiovisuel', 'Processed_Livre_litterature']

# Créer des colonnes temporaires nettoyées
for col in colonnes_utilisees:
    df[f'{col}_clean'] = df[col].apply(clean_value)

# Créer la nouvelle colonne 'Soop' en combinant les valeurs des colonnes temporaires nettoyées
df['Soop'] = df[[f'{col}_clean' for col in colonnes_utilisees]].apply(lambda row: ' '.join(row.values), axis=1)

# Supprimer les colonnes temporaires nettoyées 
df.drop(columns=[f'{col}_clean' for col in colonnes_utilisees], inplace=True)


In [28]:
df['Geocode'].nunique()

1399

In [7]:
df.head(5)

Unnamed: 0,Id,Processed_nom_festival,Processed_Type,Processed_Region,Processed_Ville,Annee,Procced_musique,Processed_Spectacle_vivant,Processed_Cinema_audiovisuel,Processed_Livre_litterature,Geocode,Site_internet,Soop
0,FEST_37011_550,avoine zone groove,musique,centre-val loire,avoine,2000,"jazz , blues",,,,"47.2237767737, 0.18754463005",www.avoinezonegroove.fr,avoine zone groove musique centre-val loire av...
1,FEST_78124_1288,macki music festival,musique,île-de-france,carrières-sur-seine,2014,"musiques monde , musiques traditionnelles , rn...",,,,"48.9119335064, 2.17838235323",https://www.mackimusicfestival.fr/,macki music festival musique île-de-france car...
2,FEST_76410_3674,d'la samba épinards,musique,normandie,maromme,2019,,,,,"49.4769434634, 1.03230514083",https://www.dlasambadanslesepinards.com,d'la samba épinards musique normandie maromme
3,FEST_13055_3705,mots étoiles,spectacle vivant,provence-alpes-côte d'azur,marseille,2003,,,,,"43.296346, 5.369889",https://desmotsdesetoiles.fr,mots étoiles spectacle vivant provence-alpes-c...
4,FEST_87107_4416,imaginieul,spectacle vivant,nouvelle-aquitaine,nieul,2010,,théâtre,,,"45.9214559183, 1.18018067279",http://www.aurora-illusia.com/,imaginieul spectacle vivant nouvelle-aquitaine...


# Vectorisation

In [8]:
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

X = df["Soop"]

bow_vectorizer.fit(X)
tfidf_vectorizer.fit(X)

X_bow = bow_vectorizer.transform(X)
X_tfidf = tfidf_vectorizer.transform(X)

In [11]:
pd.DataFrame(X_bow.toarray(), columns = bow_vectorizer.get_feature_names_out())

Unnamed: 0,abbaye,abbayes,aborigène,abracadagrasses,abrazo,abriès,academy,acadienne,académie,accordéon,...,éte,étienne,étoiles,étrangers,été,évette,évron,île,îlots,ïle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1957,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
pd.DataFrame(X_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,abbaye,abbayes,aborigène,abracadagrasses,abrazo,abriès,academy,acadienne,académie,accordéon,...,éte,étienne,étoiles,étrangers,été,évette,évron,île,îlots,ïle
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.176729,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.487860,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.298269,0.0,0.22064,0.0,0.0,0.000000,0.0,0.0
1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0


# Machine Learning


In [13]:
modelNN_b = NearestNeighbors(n_neighbors=3, metric= 'cosine')

modelNN_t = NearestNeighbors(n_neighbors=3, metric= 'cosine')

modelNN_b.fit(X_bow)
modelNN_t.fit(X_tfidf)

In [14]:
stop_word = stopwords.words('french')
stop_word += stopwords.words('english')
lemmatizer = WordNetLemmatizer()


In [15]:
def tok(sentence):
    return nltk.word_tokenize(sentence.lower())

def no_stop(tokens):
    return [token for token in tokens if (token not in stop_word)]

def stem(tokens, language= 'french'):
    stemmizer = SnowballStemmer(language=language)
    return [stemmizer.stem(token) for token in tokens]

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess(sentence, stemm=True, lemm=True, stop=True):
    tokens = tok(sentence)
    if stop:
        tokens = no_stop(tokens)
    if lemm:
        tokens = lemmatize(tokens)
    if stemm and not lemm:  # On ne fait pas le stemming si la lemmatisation est activée
        tokens = stem(tokens)
    return ' '.join(tokens)


In [16]:
tfidf_vectorizer.get_feature_names_out()

array(['abbaye', 'abbayes', 'aborigène', ..., 'île', 'îlots', 'ïle'],
      dtype=object)

In [17]:
N = pd.Series(preprocess("solidays"))
N_bow = bow_vectorizer.transform(N).toarray()
pd.DataFrame(N_bow, columns = bow_vectorizer.get_feature_names_out())

Unnamed: 0,abbaye,abbayes,aborigène,abracadagrasses,abrazo,abriès,academy,acadienne,académie,accordéon,...,éte,étienne,étoiles,étrangers,été,évette,évron,île,îlots,ïle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
modelNN_t.kneighbors(N_bow)

In [19]:
modelNN_b.kneighbors(X_bow)[0]

array([[0.        , 0.3907282 , 0.41666667],
       [0.        , 0.18307827, 0.22218254],
       [0.        , 0.43804851, 0.48968964],
       ...,
       [0.        , 0.48968964, 0.52371033],
       [0.        , 0.18181818, 0.21665055],
       [0.        , 0.19935923, 0.21227364]])

In [20]:
modelNN_t.kneighbors(X_tfidf)[0]

array([[0.        , 0.75738782, 0.77309245],
       [0.        , 0.45423946, 0.49644288],
       [0.        , 0.74534819, 0.76954803],
       ...,
       [0.        , 0.79006008, 0.79487034],
       [0.        , 0.54899278, 0.55039444],
       [0.        , 0.58195991, 0.5837756 ]])

In [21]:
# Prétraitement (assurez-vous que les fonctions de prétraitement sont définies)
df['Cleaned_Soop'] = df['Soop'].apply(lambda x: preprocess(x, stemm=True, lemm=False))

In [26]:
# Initialisation du vectorizer et transformation des données
X_tfidf = tfidf_vectorizer.transform(df['Cleaned_Soop'])
# Initialisation et entraînement du modèle KNN
modelNN_t = NearestNeighbors(n_neighbors=3, metric= 'cosine')
modelNN_t.fit(X_tfidf)
# Définir la fonction pour trouver les entrées les plus proches
def find_closest_entries(query, vectorizer, knn_model, df, n_neighbors=5):
    # Prétraiter la requête
    query_cleaned = preprocess(query, stemm=True, lemm=False)
    
    # Transformer la requête en sac de mots
    query_tfidf = vectorizer.transform([query_cleaned]).toarray()
    
    # Trouver les entrées les plus proches
    distances, indices = knn_model.kneighbors(query_tfidf, n_neighbors=n_neighbors)
    
    # Extraire les entrées similaires
    similar_entries = df.iloc[indices[0]]
    return similar_entries

query = input("Entrez une description du festival : ")
closest_entries = find_closest_entries(query, tfidf_vectorizer, modelNN_t, df, n_neighbors=5)
print("Les entrées les plus proches :")
closest_entries

Les entrées les plus proches :


Unnamed: 0,Id,Processed_nom_festival,Processed_Type,Processed_Region,Processed_Ville,Annee,Procced_musique,Processed_Spectacle_vivant,Processed_Cinema_audiovisuel,Processed_Livre_litterature,Geocode,Site_internet,Soop,Cleaned_Soop
1306,FEST_82169_5617,festival voix lieux ... mondes,musique,occitanie,saint-nicolas-de-la-grave,1996,,,,,"44.0658425315, 1.02023020877",https://www.facebook.com/festivaldesvoix/,festival voix lieux ... mondes musique occitan...,festival voix lieux ... mond musiqu occitan sa...
1304,FEST_83115_5786,magie orgues,musique,provence-alpes-côte d'azur,sainte-maxime,2010,musique savante,,,,"43.3564730986, 6.61172790208",www.ville-sainte-maxime.fr,magie orgues musique provence-alpes-côte d'azu...,mag orgu musiqu provence-alpes-côt d'azur sain...
1308,FEST_64499_6197,fête sottises,spectacle vivant,nouvelle-aquitaine,salies-de-béarn,2009,,arts rue,,,"43.4683239252, -0.917789447505",http://www.lacaze-aux-sottises.org/,fête sottises spectacle vivant nouvelle-aquita...,fêt sottis spectacl viv nouvelle-aquitain sali...
1305,FEST_87154_5954,muse scène,spectacle vivant,nouvelle-aquitaine,saint-junien,2018,,spectacle vivant pluridisciplinaire,,,"45.9024520048, 0.887855844778",http://www.facebook.com/pontlevis87/,muse scène spectacle vivant nouvelle-aquitaine...,mus scen spectacl viv nouvelle-aquitain saint-...
1310,FEST_40304_6389,latinossegor,spectacle vivant,nouvelle-aquitaine,soorts-hossegor,2001,,spectacle vivant pluridisciplinaire,,,"43.6675688674, -1.41245896656",www.facebook.com/latinossegor,latinossegor spectacle vivant nouvelle-aquitai...,latinossegor spectacl viv nouvelle-aquitain so...


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.neighbors import NearestNeighbors

# Téléchargement des ressources nécessaires
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Définition des stopwords et lemmatizer
stop_words = set(stopwords.words('french'))
lemmatizer = WordNetLemmatizer()

# Fonction pour tokeniser la phrase
def tok(sentence):
    return nltk.word_tokenize(sentence.lower())

# Fonction pour supprimer les stopwords
def no_stop(tokens):
    return [token for token in tokens if (token not in stop_words)]

# Fonction pour stemmer les tokens
def stem(tokens, language='french'):
    stemmer = SnowballStemmer(language=language)
    return [stemmer.stem(token) for token in tokens]

# Fonction pour lemmatiser les tokens
def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# Fonction de prétraitement qui peut stemmer, lemmatiser et supprimer les stopwords
def preprocess(sentence, stemm=True, lemm=False, stop=True):
    tokens = tok(sentence)
    if stop:
        tokens = no_stop(tokens)
    if lemm:
        tokens = lemmatize(tokens)
    if stemm and not lemm:  # On ne fait pas le stemming si la lemmatisation est activée
        tokens = stem(tokens)
    return ' '.join(tokens)



[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


cec exempl phras trait stemming lemmatis .
ceci exemple phrase traiter stemming lemmatisation .


In [17]:
import folium

In [25]:
# Appliquer le nettoyage avec le prétraitement avancé
df['Cleaned_Soop'] = df['Soop'].apply(lambda x: preprocess(x, stemm=True, lemm=False))

# Vectoriser le texte
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df['Cleaned_Soop'])

# Initialiser et entraîner le modèle KNN
knn = NearestNeighbors(n_neighbors=3, metric= 'cosine')
knn.fit(X)

# Fonction pour trouver des entrées similaires
def find_similar_entries(query, df, vectorizer, knn_model, n_neighbors=4):
    query_cleaned = preprocess(query, stemm=True, lemm=False)
    query_vec = vectorizer.transform([query_cleaned])
    distances, indices = knn_model.kneighbors(query_vec, n_neighbors=n_neighbors)
    similar_entries = df.iloc[indices[0]]
    return similar_entries

# Demander à l'utilisateur d'entrer une requête
query = input("Entrez une description du festival : ")

# Trouver les entrées similaires
similar_entries = find_similar_entries(query, df, vectorizer, knn, n_neighbors=4)

# Afficher les résultats
print("Les 4 festivals les plus pertinents :")
print(similar_entries)

carte = folium.Map(location=[46.603354, 1.888334], zoom_start=6)

# Ajouter des marqueurs pour chaque position géographique valide
for index, row in similar_entries.iterrows():
    geocode = row['Geocode'].split(',')
    if len(geocode) == 2:  # Vérifiez que vous avez exactement deux parties après split
        lat = float(geocode[0].strip())
        lon = float(geocode[1].strip())
        nom_festival = row['Processed_nom_festival']
        folium.Marker(location=[lat, lon], popup=nom_festival).add_to(carte)

# Enregistrer et afficher la carte
carte.save("carte_france.html")  # Enregistre la carte au format HTML
carte

Les 4 festivals les plus pertinents :
                   Id Processed_nom_festival        Processed_Type  \
39    FEST_44109_4308      scènes vagabondes               musique   
1212  FEST_44109_4307         rendez l'erdre               musique   
833   FEST_44109_4322      sofilm summercamp  cinéma , audiovisuel   
764   FEST_44109_4288           heures d'été               musique   

     Processed_Region Processed_Ville Annee Procced_musique  \
39         pays loire          nantes  2015             NaN   
1212       pays loire          nantes  1987             NaN   
833        pays loire          nantes  2015             NaN   
764        pays loire          nantes  2005  musiques monde   

     Processed_Spectacle_vivant             Processed_Cinema_audiovisuel  \
39                          NaN                                      NaN   
1212                        NaN                                      NaN   
833                         NaN  fiction long métrage , film d'anim