In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
# Fetch every NLTK resource this notebook relies on before first use, so a
# fresh "Restart & Run All" does not fail with a LookupError further down:
# 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words(...),
# 'wordnet' backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the preprocessed festival table from a CSV file in the working directory.
df = pd.read_csv('Ordered_NLP_preprocessed_df.csv')

In [3]:
# Show the column names of the loaded frame.
df.columns

Index(['Id', 'Processed_nom_festival', 'Processed_Type', 'Processed_Region',
       'Processed_Ville', 'Annee', 'Procced_musique',
       'Processed_Spectacle_vivant', 'Processed_Cinema_audiovisuel',
       'Processed_Livre_litterature', 'Geocode', 'Site_internet'],
      dtype='object')

In [3]:
# Nettoyer les colonnes et créer la colonne 'Soop' qui contient les paramètres souhaités pour notre entraînement

def clean_value(value):
    """Return ``value`` as text with every digit and comma removed.

    Missing values (anything ``pd.isna`` reports as NA) become the empty
    string. Any other value is converted with ``str``, stripped of digits and
    commas, and trimmed of leading/trailing whitespace. Whitespace inside the
    text is left as is, so removing a comma can leave two adjacent spaces.
    """
    if not pd.isna(value):
        text = str(value)
        without_digits_or_commas = re.sub(r'[\d,]', '', text)
        return without_digits_or_commas.strip()
    return ""

# Columns whose text is merged into the 'Soop' column used for training
colonnes_utilisees = ['Processed_nom_festival', 'Processed_Type', 'Processed_Region',
                      'Processed_Ville', 'Procced_musique', 'Processed_Spectacle_vivant',
                      'Processed_Cinema_audiovisuel', 'Processed_Livre_litterature']

# Clean each selected column into a separate frame. `df` itself never receives
# temporary helper columns, so no existing column can be overwritten or dropped
# by accident and re-running the cell is harmless.
cleaned_columns = pd.DataFrame(
    {col: df[col].apply(clean_value) for col in colonnes_utilisees}
)

# Build 'Soop' by joining each row's cleaned values with single spaces.
# Missing values were cleaned to "", so they contribute only a separator.
df['Soop'] = cleaned_columns.apply(lambda row: ' '.join(row.values), axis=1)


In [5]:
# Preview the first five rows, including the new 'Soop' column.
df.head(5)

Unnamed: 0,Id,Processed_nom_festival,Processed_Type,Processed_Region,Processed_Ville,Annee,Procced_musique,Processed_Spectacle_vivant,Processed_Cinema_audiovisuel,Processed_Livre_litterature,Geocode,Site_internet,Soop
0,FEST_37011_550,avoine zone groove,musique,centre-val loire,avoine,2000,"jazz , blues",,,,"47.2237767737, 0.18754463005",www.avoinezonegroove.fr,avoine zone groove musique centre-val loire av...
1,FEST_78124_1288,macki music festival,musique,île-de-france,carrières-sur-seine,2014,"musiques monde , musiques traditionnelles , rn...",,,,"48.9119335064, 2.17838235323",https://www.mackimusicfestival.fr/,macki music festival musique île-de-france car...
2,FEST_76410_3674,d'la samba épinards,musique,normandie,maromme,2019,,,,,"49.4769434634, 1.03230514083",https://www.dlasambadanslesepinards.com,d'la samba épinards musique normandie maromme
3,FEST_13055_3705,mots étoiles,spectacle vivant,provence-alpes-côte d'azur,marseille,2003,,,,,"43.296346, 5.369889",https://desmotsdesetoiles.fr,mots étoiles spectacle vivant provence-alpes-c...
4,FEST_87107_4416,imaginieul,spectacle vivant,nouvelle-aquitaine,nieul,2010,,théâtre,,,"45.9214559183, 1.18018067279",http://www.aurora-illusia.com/,imaginieul spectacle vivant nouvelle-aquitaine...


# Vectorisation

In [99]:
# Corpus to vectorise: one combined text per festival
X = df["Soop"]

# Bag-of-words representation (raw term counts)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF representation of the same corpus
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [7]:
# Vocabulary learned by the TF-IDF vectorizer (one entry per output column).
tfidf_vectorizer.get_feature_names_out()

array(['abbaye', 'abbayes', 'aborigène', ..., 'île', 'îlots', 'ïle'],
      dtype=object)

In [8]:
# Display the bag-of-words matrix as a frame with one column per vocabulary term.
# `.toarray()` materialises the whole matrix as a dense array for the display.
pd.DataFrame(X_bow.toarray(), columns = bow_vectorizer.get_feature_names_out())

Unnamed: 0,abbaye,abbayes,aborigène,abracadagrasses,abrazo,abriès,academy,acadienne,académie,accordéon,...,éte,étienne,étoiles,étrangers,été,évette,évron,île,îlots,ïle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1957,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Display the TF-IDF matrix as a frame with one column per vocabulary term.
# `.toarray()` materialises the whole matrix as a dense array for the display.
pd.DataFrame(X_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,abbaye,abbayes,aborigène,abracadagrasses,abrazo,abriès,academy,acadienne,académie,accordéon,...,éte,étienne,étoiles,étrangers,été,évette,évron,île,îlots,ïle
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.176729,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.487860,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.298269,0.0,0.22064,0.0,0.0,0.000000,0.0,0.0
1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0
1959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0


# Machine Learning


In [4]:
from sklearn.neighbors import NearestNeighbors

In [100]:
# Two cosine-distance nearest-neighbour indexes with identical settings,
# one fitted on the bag-of-words matrix and one on the TF-IDF matrix.
# `fit` returns the estimator itself, so construction and fitting are chained.
modelNN_b = NearestNeighbors(metric='cosine', n_neighbors=3).fit(X_bow)
modelNN_t = NearestNeighbors(metric='cosine', n_neighbors=3).fit(X_tfidf)

# Last expression of the cell: show the fitted TF-IDF model
modelNN_t

In [103]:
# WordNet-based lemmatizer shared by `lemmatize`
lemmatizer = WordNetLemmatizer()
# French stop words followed by English stop words, kept as a single list
stop_word = stopwords.words('french') + stopwords.words('english')


In [124]:
def tok(sentence):
    """Lower-case ``sentence`` and split it into tokens with ``nltk.word_tokenize``."""
    lowered = sentence.lower()
    return nltk.word_tokenize(lowered)

def no_stop(tokens):
    """Return, in order, the tokens that are absent from the module-level ``stop_word`` list."""
    kept = []
    for token in tokens:
        if token not in stop_word:
            kept.append(token)
    return kept

def stem(tokens, language= 'french'):
    """Stem every token with a Snowball stemmer for ``language``.

    A new ``SnowballStemmer`` is created on each call. Returns a list with one
    stem per input token, in the same order.
    """
    snowball = SnowballStemmer(language=language)
    return list(map(snowball.stem, tokens))

def lemmatize(tokens):
    """Lemmatize every token with the module-level ``lemmatizer``; returns a list."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess(sentence, stemm=True, lemm=True, stop=True):
    """Tokenize ``sentence`` and return the processed tokens joined by spaces.

    Args:
        sentence: text to process.
        stemm: stem the tokens, but only when ``lemm`` is falsy.
        lemm: lemmatize the tokens; takes precedence over ``stemm``.
        stop: drop stop words before lemmatizing/stemming.

    Note:
        With the defaults (``stemm=True``, ``lemm=True``) only lemmatization
        runs, because stemming is skipped whenever ``lemm`` is truthy.
    """
    words = tok(sentence)
    if stop:
        words = no_stop(words)
    if lemm:
        words = lemmatize(words)
    elif stemm:
        # Reached only when lemmatization is disabled
        words = stem(words)
    return ' '.join(words)
    


In [None]:
# Encode one free-text query with the bag-of-words vocabulary and show the
# resulting single-row count vector.
N = pd.Series([preprocess("solidays")])
N_bow = bow_vectorizer.transform(N).toarray()
pd.DataFrame(data=N_bow, columns=bow_vectorizer.get_feature_names_out())

In [None]:
# `modelNN_t` was fitted on TF-IDF vectors, so the query must be encoded with
# the TF-IDF vectorizer as well; raw bag-of-words counts live in a differently
# weighted space. Returns the (distances, indices) of the nearest festivals.
modelNN_t.kneighbors(tfidf_vectorizer.transform(N))

In [None]:
# Query the bag-of-words index with its own training matrix and keep element 0
# of the (distances, indices) pair, i.e. the neighbour distances for every row.
modelNN_b.kneighbors(X_bow)[0]

In [None]:
# Query the TF-IDF index with its own training matrix and keep element 0
# of the (distances, indices) pair, i.e. the neighbour distances for every row.
modelNN_t.kneighbors(X_tfidf)[0]

In [None]:
# NOTE: the folium map code below is disabled. The whole cell is a single
# triple-quoted string literal, so none of it is executed; the cell only
# evaluates to that string.
'''# Créer une carte centrée sur la France
import folium
# Assurer que toutes les données de 'Geocode' sont des chaînes
df['Geocode'] = df['Geocode'].astype(str)

# Filtrer les entrées mal formatées ou avec 'nan' (généré par None en str)
df = df[df['Geocode'].str.contains(',')]

# Créer une carte centrée sur la France
carte = folium.Map(location=[46.603354, 1.888334], zoom_start=6)

# Ajouter des marqueurs pour chaque position géographique valide
for index, row in df.iterrows():
    geocode = row['Geocode'].split(',')
    if len(geocode) == 2:  # Vérifiez que vous avez exactement deux parties après split
        lat = float(geocode[0].strip())
        lon = float(geocode[1].strip())
        nom_festival = row['Processed_nom_festival']
        folium.Marker(location=[lat, lon], popup=nom_festival).add_to(carte)

# Afficher la carte
carte.save("carte_france.html")  # Enregistre la carte au format HTML
carte  # Affiche la carte si vous êtes dans un environnement Jupyter Notebook'''

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Download the NLTK resources required by the helpers below
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lemmatizer shared by `lemmatize`
lemmatizer = WordNetLemmatizer()
# French stop words only, stored as a set for membership tests in `no_stop`
stop_words = set(stopwords.words('french'))

def tok(sentence):
    """Split the lower-cased ``sentence`` into tokens using ``nltk.word_tokenize``."""
    lowercase_sentence = sentence.lower()
    tokens = nltk.word_tokenize(lowercase_sentence)
    return tokens

def no_stop(tokens):
    """Drop every token found in the module-level ``stop_words`` set, keeping order."""
    return list(filter(lambda token: token not in stop_words, tokens))

def stem(tokens, language='french'):
    """Return the Snowball stem of each token for ``language`` (default French).

    A new ``SnowballStemmer`` is instantiated on every call.
    """
    stem_one = SnowballStemmer(language=language).stem
    stems = []
    for token in tokens:
        stems.append(stem_one(token))
    return stems

def lemmatize(tokens):
    """Return the lemma of each token, computed by the module-level ``lemmatizer``."""
    return list(map(lambda token: lemmatizer.lemmatize(token), tokens))

def preprocess(sentence, stemm=True, lemm=False, stop=True):
    """Tokenize ``sentence``, optionally filter and normalise, and re-join with spaces.

    Args:
        sentence: text to process.
        stemm: stem the tokens; ignored when ``lemm`` is truthy.
        lemm: lemmatize the tokens instead of stemming them.
        stop: remove stop words before normalising.

    Returns:
        The processed tokens joined by single spaces.
    """
    words = tok(sentence)
    if stop:
        words = no_stop(words)
    if lemm:
        words = lemmatize(words)
    elif stemm:
        # Stemming is applied only when lemmatization is off
        words = stem(words)
    return ' '.join(words)

# Usage example: run the same sentence through stemming, then through lemmatization
sentence = "Ceci est un exemple de phrase à traiter avec le stemming et la lemmatisation."
for use_stemming, use_lemmatization in ((True, False), (False, True)):
    print(preprocess(sentence, stemm=use_stemming, lemm=use_lemmatization))


[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


cec exempl phras trait stemming lemmatis .
ceci exemple phrase traiter stemming lemmatisation .


In [16]:
# Stem every 'Soop' text (stop words removed, no lemmatization)
df['Cleaned_Soop'] = [preprocess(text, stemm=True, lemm=False) for text in df['Soop']]

# TF-IDF over unigrams and bigrams of the cleaned text
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df['Cleaned_Soop'])

# Cosine-distance nearest-neighbour index over the TF-IDF matrix
knn = NearestNeighbors(metric='cosine', n_neighbors=3)
knn.fit(X)

def find_similar_entries(query, df, vectorizer, knn_model, n_neighbors=4):
    """Return the rows of ``df`` that are nearest to a free-text ``query``.

    The query is preprocessed (stemming, no lemmatization), encoded with
    ``vectorizer`` and looked up in ``knn_model``.

    Args:
        query: free-text search string.
        df: frame to select rows from by position.
            NOTE(review): assumes its row order matches the matrix
            ``knn_model`` was fitted on; confirm at the call site.
        vectorizer: fitted vectorizer used to encode the query.
        knn_model: fitted nearest-neighbours model.
        n_neighbors: number of neighbours to retrieve.

    Returns:
        The ``df`` rows at the positions reported by ``knn_model.kneighbors``.
    """
    cleaned_query = preprocess(query, stemm=True, lemm=False)
    query_matrix = vectorizer.transform([cleaned_query])
    _, neighbor_positions = knn_model.kneighbors(query_matrix, n_neighbors=n_neighbors)
    # Single query, so the first (only) row holds the neighbour positions
    return df.iloc[neighbor_positions[0]]

# Usage example: look up the festivals closest to a free-text query
query = "nantes électronique"
similar_entries = find_similar_entries(
    query=query,
    df=df,
    vectorizer=vectorizer,
    knn_model=knn,
)
print(similar_entries)


                   Id Processed_nom_festival Processed_Type Processed_Region  \
39    FEST_44109_4308      scènes vagabondes        musique       pays loire   
1212  FEST_44109_4307         rendez l'erdre        musique       pays loire   
764   FEST_44109_4288           heures d'été        musique       pays loire   
1733  FEST_44109_4314             paco tyson        musique       pays loire   

     Processed_Ville Annee                                    Procced_musique  \
39            nantes  2015                                                NaN   
1212          nantes  1987                                                NaN   
764           nantes  2005                                     musiques monde   
1733          nantes  2017  hip-hop , rap , slam , musiques électroniques ...   

     Processed_Spectacle_vivant Processed_Cinema_audiovisuel  \
39                          NaN                          NaN   
1212                        NaN                          NaN   
7