In [2]:
import pandas as pd 

df = pd.read_csv('Emotion_final.csv')

In [3]:
import pickle

# Charger le modèle TF-IDF entraîné
with open('model_bow.pkl', 'rb') as f:
    model_bow = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [48]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import FreqDist
import string

# Télécharger les ressources nécessaires pour NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Prétraitement du corpus de texte
all_text_corpus = ' '.join(df['Text'])
tokens = word_tokenize(all_text_corpus)

# Conversion en minuscules et suppression de la ponctuation
tokens = [token.lower() for token in tokens if token not in string.punctuation]

# Lemmatization
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Filtrage des stopwords
stopwords_list = stopwords.words('english')
tokens = [token for token in tokens if token not in stopwords_list]

word_freq = FreqDist(tokens)

# Sélectionner les mots à exclure qui apparaissent plus de 100 fois dans le corpus
common_words_to_exclude = [word for word, count in word_freq.items() if count > 100]


# Étape de prétraitement : tokenization, gestion de la ponctuation, des émojis, des stopwords, lemmatisation et streaming

def preprocess_text(text):
    # Tokenization : divise le texte en unités linguistiques
    tokens = word_tokenize(text)
    
    # Gestion de la ponctuation et des émojis
    tokens = [token for token in tokens if token.isalnum()]
    
    # Gestion des stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # tokens = [word for word in words if word.lower() not in common_words_to_exclude]
    
    # Streaming pour marquer les négations (ex : "not happy" -> "not_happy")
    tokens = mark_negation(tokens)
    
    # Rejoindre les tokens en une chaîne de texte
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


[nltk_data] Downloading package punkt to /home/apprenant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/apprenant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/apprenant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
from elasticsearch import Elasticsearch
from faker import Faker
import random

# Créer une instance du client Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])

# Définir le mapping pour l'index "notes"
mapping = {
  "properties": {
    "patient_lastname": {
      "type": "keyword"
    },
    "patient_firstname": {
      "type": "keyword"
    },
    "text": {
      "type": "text",
      "analyzer": "standard"
    },
    "date": {
      "type": "date"
    },
    "patient_left": {
      "type": "boolean"
    },
    "emotion": {
      "type": "keyword"
    },
    "confidence": {
      "type": "float"
    }
  }
}

# Créer l'index "notes" avec le mapping
es.indices.create(index='notes', body={"mappings": mapping})

# Prétraitement du corpus de texte
preprocessed_text = df['Text'].apply(preprocess_text)

# Création du modèle Bag of Words
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(preprocessed_text)

# Générer les données fictives avec Faker et les insérer dans l'index "notes"
fake = Faker()
for index,row in df.iterrows():
  # Obtenir la représentation vectorielle de la ligne actuelle
  row_vector = bow_features[index]
    
  # Prédire l'émotion pour la représentation vectorielle de la ligne actuelle
  emotion = model_bow.predict(row_vector)[0]  # Utilisation de [0] pour obtenir la prédiction unique
  
  # Obtenir le score de précision du modèle pour la représentation vectorielle de la ligne actuelle
  confidence = model_bow.predict_proba(row_vector).max()
  
  doc = {
        "patient_lastname": fake.last_name(),
        "patient_firstname": fake.first_name(),
        "text": row['Text'],
        "date": fake.date_this_decade(),
        "patient_left": random.choice([True, False]),
        "emotion": emotion,
        "confidence": confidence
    }
  es.index(index='notes', body=doc)

# Rafraîchir l'index pour rendre les données disponibles pour la recherche
es.indices.refresh(index='notes')


  es.indices.create(index='notes', body={"mappings": mapping})
  es.indices.create(index='notes', body={"mappings": mapping})
  es.index(index='notes', body=doc)
  es.index(index='notes', body=doc)
  es.indices.refresh(index='notes')


ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [47]:
from elasticsearch import Elasticsearch

# Créer une instance du client Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

# Effectuer une recherche sur l'index "notes"
search_results = es.search(index='notes', size=10)  # Récupérer les 10 premiers documents

# Parcourir les résultats de la recherche
for hit in search_results['hits']['hits']:
    # Récupérer le document
    document = hit['_source']
    
    # Afficher les informations du document
    print("Patient: {} {}".format(document['patient_firstname'], document['patient_lastname']))
    print("Date: {}".format(document['date']))
    print("Text: {}".format(document['text']))
    print("Emotion: {}".format(document['emotion']))
    print("Confidence: {}".format(document['confidence']))
    print("-----")


Patient: Sean Lynn
Date: 2022-12-11
Text: i feel that at shows and around show horses people are trusting and relaxed because most show horses are safe and quiet and are handled frequently
Emotion: happy
Confidence: 0.9953355012559577
-----
Patient: Justin Alvarado
Date: 2023-05-02
Text: i only have a couple of things left to make and at the start of december i am done and feeling smug
Emotion: happy
Confidence: 0.9585413633790166
-----
Patient: Debra Mckenzie
Date: 2020-05-25
Text: i think about how u could make me feel and realize that everything will be ok
Emotion: happy
Confidence: 0.9396892251834823
-----
Patient: Kyle Martinez
Date: 2021-03-24
Text: i feel so worthless during those times i was struggling finding work
Emotion: sadness
Confidence: 0.8353849868281406
-----
Patient: Matthew Hines
Date: 2022-01-25
Text: i will be able to lay on my bed in the dark and not feel terrified at least for a while
Emotion: fear
Confidence: 0.9646014603162067
-----
Patient: Dawn Webb
Date: 202

  search_results = es.search(index='notes', size=10)  # Récupérer les 10 premiers documents


In [None]:
from elasticsearch import Elasticsearch

# Se connecter à Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])

# Supprimer l'index "notes"
es.indices.delete(index='notes', ignore=[400, 404])

print("Index 'notes' supprimé avec succès.")


Index 'notes' supprimé avec succès.


  es.indices.delete(index='notes', ignore=[400, 404])
  es.indices.delete(index='notes', ignore=[400, 404])


In [52]:
from elasticsearch import Elasticsearch

# Créez une instance de connexion Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])

# Exemple de requête de recherche
query = {
  "query": {
    "match": {
      "patient_lastname": "Hines"
    }
  }
}

# Envoyer la requête de recherche à Elasticsearch
response = es.search(index='notes', body=query)

# Traiter la réponse renvoyée par Elasticsearch
for hit in response['hits']['hits']:
    print(hit['_source'])

{'patient_lastname': 'Hines', 'patient_firstname': 'Matthew', 'text': 'i will be able to lay on my bed in the dark and not feel terrified at least for a while', 'date': '2022-01-25', 'patient_left': False, 'emotion': 'fear', 'confidence': 0.9646014603162067}
{'patient_lastname': 'Hines', 'patient_firstname': 'Debbie', 'text': 'i like to finish on a positive note that whenever i feel a bit fearful or down i can just remember something nice about me and rich and it cheers me up', 'date': '2022-10-21', 'patient_left': True, 'emotion': 'fear', 'confidence': 0.5474559342331309}
{'patient_lastname': 'Hines', 'patient_firstname': 'Leslie', 'text': 'i tell the people closest to me things that i am feeling and its as if they arent surprised because theyd known it all along', 'date': '2020-11-01', 'patient_left': True, 'emotion': 'surprise', 'confidence': 0.7333758621951966}
{'patient_lastname': 'Hines', 'patient_firstname': 'Jenna', 'text': 'i was feeling helpless as i could not explain it to h

  response = es.search(index='notes', body=query)
  response = es.search(index='notes', body=query)


In [53]:
from elasticsearch import Elasticsearch
import pandas as pd

# Créez une instance de connexion Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])

# Exemple de requête de recherche
query = {
  "query": {
    "match_all": {}  # Correspondance avec tous les documents
  }
}

# Envoyer la requête de recherche à Elasticsearch
response = es.search(index='notes', body=query)

# Liste pour stocker les données des patients et des émotions
patient_data = []

# Traiter la réponse renvoyée par Elasticsearch
for hit in response['hits']['hits']:
    source = hit['_source']
    patient = source['patient_lastname'] + ' ' + source['patient_firstname']
    emotion = source['emotion']
    patient_data.append({'Patient': patient, 'Emotion': emotion})

# Créer un DataFrame à partir des données des patients
df = pd.DataFrame(patient_data)

# Afficher le DataFrame
print(df)


             Patient  Emotion
0          Lynn Sean    happy
1    Alvarado Justin    happy
2     Mckenzie Debra    happy
3      Martinez Kyle  sadness
4      Hines Matthew     fear
5          Webb Dawn     love
6  Quinn Christopher    anger
7       Grant Martha  sadness
8     Conway Matthew    anger
9       Hudson Jerry     love


  response = es.search(index='notes', body=query)
  response = es.search(index='notes', body=query)
