## IMPORTATION DES BIBLIOTHEQUES

In [132]:
import re
import nltk
import pandas as pd
from pprint import pprint
from neattext import functions
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

## TRAITEMENT DES DONNEES

In [133]:
# Load the reviews dataset from the CSV file located next to the notebook
dataset = pd.read_csv(filepath_or_buffer="./dataset_police_nationale.csv")

In [134]:
# Keep only the reviews that are at most two years old
def select_dates(data=None):
    """Drop, in place, every review whose relative date is older than two years.

    The ``Dates`` column is matched against French relative-date wording
    (such as "il y a un an" or "il y a 3 mois"). The column is first
    normalised by replacing the standalone words "un" / "une" with "1",
    then a row is kept when its date mentions minutes, hours, days, weeks
    or months, or is exactly "1 an" / "2 ans".

    Whole-word matching is used so that "11 ans" or "12 ans" are not
    mistaken for "1 an" / "2 ans", and so that the singular "1 jour" is
    kept just like "2 jours".

    Args:
        data: DataFrame with a ``Dates`` column. Defaults to the
            module-level ``dataset``.

    Returns:
        The same DataFrame object, filtered in place and re-indexed
        from 0.
    """
    frame = dataset if data is None else data

    # astype(str) keeps missing / non-string dates from raising; such rows
    # match nothing below and are therefore dropped.
    dates = frame["Dates"].astype(str).str.replace(r"\bune?\b", "1", regex=True)
    frame["Dates"] = dates

    recent_pattern = (
        r"\b(?:minutes?|heures?|jours?|semaines?|mois)\b"
        r"|\b[12]\s+ans?\b"
    )
    is_recent = dates.str.contains(recent_pattern, regex=True).to_numpy(dtype=bool)

    frame.drop(index=frame.index[~is_recent], inplace=True)
    frame.reset_index(drop=True, inplace=True)

    return frame

In [135]:
# Review clean-up helper
def clean_avis(avis):
    """Return the review text normalised for tokenisation.

    Emojis are removed, the text is lower-cased, runs of non-word
    characters become a single space, digits are deleted and French stop
    words are filtered out. The remaining words are joined with single
    spaces.
    """
    french_stop_words = set(stopwords.words('french'))

    text = functions.remove_emojis(avis)
    text = text.lower()
    # Collapse every run of non-word characters into one space
    text = re.sub(r'\W+', ' ', text)
    # Digits count as word characters, so they are stripped separately
    text = re.sub(r'\d+', '', text)

    kept_words = (token for token in text.split() if token not in french_stop_words)
    return ' '.join(kept_words)

In [136]:
# Restrict the dataset to the reviews that are at most two years old
dataset = select_dates()

# Clean every review, then split the cleaned text into a list of tokens
cleaned_reviews = dataset["Avis"].apply(clean_avis)
dataset["Processing"] = cleaned_reviews.apply(nltk.word_tokenize)

In [137]:
# Build the negative / neutral / positive word short lists
def create_short_lists(reviews=None, analyzer=None):
    """Sort every distinct word of the tokenised reviews into a sentiment list.

    Each unique word is scored once with ``analyzer.polarity_scores``:

    * a ``'neu'`` score above 0 puts the word in the neutral list;
    * a ``'neu'`` score of exactly 0 puts it in the negative list when its
      ``'neg'`` score is above 0, otherwise in the positive list.

    Args:
        reviews: Iterable of token lists. Defaults to the ``Processing``
            column of the module-level ``dataset``.
        analyzer: Object exposing ``polarity_scores(word)`` that returns a
            mapping with ``'neu'`` and ``'neg'`` keys. Defaults to a new
            ``SentimentIntensityAnalyzer``.

    Returns:
        A ``(negative, neutral, positive)`` tuple of lists. Each list holds
        unique words in sorted order so that results are reproducible from
        one run to the next.

    NOTE(review): NLTK's analyzer presumably relies on an English lexicon,
    so French words missing from it would all land in the neutral list.
    Confirm this is intended, or pass a French-aware ``analyzer``.
    """
    token_lists = dataset["Processing"] if reviews is None else reviews
    sia = SentimentIntensityAnalyzer() if analyzer is None else analyzer

    # Score each distinct word once instead of once per occurrence
    vocabulary = {word for tokens in token_lists for word in tokens}

    negative_short_list = []
    neutral_short_list = []
    positive_short_list = []

    for word in sorted(vocabulary):
        scores = sia.polarity_scores(word)
        neutral_polarity = scores['neu']
        if neutral_polarity > 0:
            neutral_short_list.append(word)
        elif neutral_polarity == 0:
            if scores['neg'] > 0:
                negative_short_list.append(word)
            else:
                positive_short_list.append(word)

    return negative_short_list, neutral_short_list, positive_short_list

# Tuple of word short lists in the order (negative, neutral, positive)
words_short_list = create_short_lists()

In [139]:
# Prune the negative short list: discard every two-letter word and "tout".
# The list is rebuilt through slice assignment because removing items from a
# list while iterating over it skips the element following each removal, and
# because the tuple holding the list cannot be reassigned.
words_short_list[0][:] = [
    word
    for word in words_short_list[0]
    if len(word) != 2 and word != "tout"
]

In [140]:
# Assign a label to every review.
# A review is "Négatif" when it contains at least one word of the negative
# short list, otherwise "Neutre" when it contains at least one word of the
# neutral short list, otherwise "Positif".
#
# Whole-token set membership replaces the former `\b(?:w1|w2|...)\b` regex:
# it needs no escaping, is built once instead of once per review, and an
# empty short list matches nothing (the empty regex alternative matched at
# every word boundary, labelling every non-empty review).
negative_words = set(words_short_list[0])
neutral_words = set(words_short_list[1])

labels_list = []
for tokens in dataset["Processing"]:
    if not negative_words.isdisjoint(tokens):
        labels_list.append("Négatif")
    elif not neutral_words.isdisjoint(tokens):
        labels_list.append("Neutre")
    else:
        labels_list.append("Positif")

In [142]:
# Store the computed labels in a "Labels" column (one label per review, in row order)
dataset["Labels"] = labels_list

In [144]:
# Row numbers of every review labelled "Positif"
indice_to_remove = [
    position
    for position, label in enumerate(dataset["Labels"])
    if label == "Positif"
]

# Drop those reviews (noted by the author as the emoji-only comments)
# and renumber the remaining rows from 0
dataset.drop(index=indice_to_remove, inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [145]:
# Keep only the reviews labelled "Négatif".
# A boolean mask splits the row labels in a single pass; the former
# `index not in indices_to_keep` test scanned a list for every row.
is_negative = (dataset["Labels"] == "Négatif").to_numpy(dtype=bool)

indices_to_keep = dataset.index[is_negative].tolist()
indices_to_remove = dataset.index[~is_negative].tolist()

# Drop everything else and renumber the remaining rows from 0
dataset.drop(index=indices_to_remove, inplace=True)
dataset.reset_index(drop=True, inplace=True)