# Imports

In [52]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [30]:
data = pd.read_csv('Emotion_final.csv')

In [37]:
# nltk.download('stopwords') Charger les StopWords dans NLTK
# nltk.download('punkt') Charger la punctuation dans NLTK

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/apprenant/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Consignes

A partir du jeu de données et à l'aide la librairie NLTK ou de la librairie Spacy, vous pouvez même essayez texthero:
- étudier la répartition des textes par émotions
- identifiez quels mots sont susceptibles d'être des stopword
- pour chaque sentiment, identifiez les 30 mots les plus courants pour chaque sentiment en dehors des stopwords
- A partir de ces 30 mots, définissez une métrique de proximité entre les sentiments et affichez-la sur une matrice type heatmap.
- Créer deux premiers modèles (Bag of words et TF IDF) en gérant les étapes de tokenisation, de gestion de la ponctuation, des émojis, des stopwords, de lemmatisation ou de streaming.

Le rendu du premier de temps se fera dans un notebook.


# Appercu

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21459 non-null  object
 1   Emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [14]:
data.describe()

Unnamed: 0,Text,Emotion
count,21459,21459
unique,21405,6
top,i feel so tortured by it,happy
freq,2,7029


In [6]:
data.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [7]:
data.nunique()

Text       21405
Emotion        6
dtype: int64

In [10]:
data['Emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'happy'],
      dtype=object)

# Analyse

### Etudier la répartition des textes par émotions

In [15]:
grouped = data.groupby("Emotion")

In [21]:
counts = grouped.count()

In [27]:
percentage = counts / len(data)

In [28]:
percentage.round(2)

Unnamed: 0_level_0,Text
Emotion,Unnamed: 1_level_1
anger,0.14
fear,0.12
happy,0.33
love,0.08
sadness,0.29
surprise,0.04


### Identifiez quels mots sont susceptibles d'être des stopword

In [31]:
all_text = ' '.join(data['Text'].tolist())


In [35]:
tokens = word_tokenize(all_text)


In [38]:
stopwords_list = set(stopwords.words('english'))  

# Remplacez 'langue' par la langue appropriée (par exemple, 'english' pour l'anglais)


In [39]:
suspected_stopwords = [word.lower() for word in tokens if word.lower() in stopwords_list]


In [46]:
word_counts = pd.Series(suspected_stopwords).value_counts()

word_counts

i             32529
and           12721
to            11835
the           11808
a              8333
              ...  
ain               3
theirs            3
ours              2
weren             2
yourselves        2
Name: count, Length: 149, dtype: int64

### Pour chaque sentiment : 
 - identifiez les 30 mots les plus courants en dehors des stopwords

In [75]:
def get_top_words(text, n=30):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stopwords_list]
    word_counts = pd.Series(filtered_tokens).value_counts().rename_axis('Mots').reset_index(name='Occurrences')
    top_words = word_counts.head(n)
    return top_words

# Créer un dictionnaire pour stocker les données des émotions
emotion_data = {}

# Itérer sur chaque émotion et obtenir les mots les plus courants
for emotion in data['Emotion'].unique():
    group = data[data['Emotion'] == emotion]
    top_words = get_top_words(' '.join(group['Text'].tolist()))
    emotion_data[emotion] = top_words['Mots']

# Créer un DataFrame à partir du dictionnaire des données d'émotion
text_top30_words = pd.DataFrame(emotion_data)

# Transposer le DataFrame pour échanger les index et les colonnes
text_top30_words = text_top30_words.transpose()

# Renommer les colonnes de 1 à 30 avec les numéros correspondants
text_top30_words.columns = range(1, 31)

# Afficher le DataFrame
text_top30_words

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
sadness,feel,feeling,like,im,",",really,know,get,would,time,...,things,much,dont,make,day,something,back,going,way,could
anger,feel,feeling,like,im,",",really,get,people,know,time,...,way,offended,dont,resentful,cold,something,still,irritable,cant,going
love,feel,feeling,like,im,love,really,know,sweet,time,loving,...,little,feelings,tender,longing,lovely,loved,would,need,accepted,horny
surprise,feel,feeling,",",like,im,amazed,curious,impressed,overwhelmed,surprised,...,stunned,bit,people,know,would,think,one,feels,much,could
fear,feel,feeling,im,like,",",little,bit,know,really,anxious,...,going,terrified,afraid,ive,agitated,scared,frightened,go,weird,things
happy,feel,feeling,like,im,really,time,know,get,",",make,...,life,love,even,going,ive,happy,day,could,still,well


In [77]:
def get_top_words(text, n=30):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stopwords_list]
    word_counts = pd.Series(filtered_tokens).value_counts().rename_axis('Mots').reset_index(name='Occurrences')
    top_words = word_counts.head(n)
    return top_words

emotion_data = {}

# Itérer sur chaque émotion et obtenir les mots les plus courants
for emotion in data['Emotion'].unique():
    group = data[data['Emotion'] == emotion]
    top_words = get_top_words(' '.join(group['Text'].tolist()))
    emotion_data[emotion] = top_words['Occurrences']

# Créer un DataFrame à partir du dictionnaire des données d'émotion
number_top30_words = pd.DataFrame(emotion_data)

number_top30_words = number_top30_words.transpose()

# Afficher le DataFrame
number_top30_words 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
sadness,4095,1924,1078,875,447,352,344,289,273,270,...,194,191,190,183,177,170,167,164,162,159
anger,1803,923,479,426,248,171,154,150,149,146,...,102,92,91,87,86,85,85,84,83,81
love,1180,471,393,251,119,112,101,84,82,82,...,67,66,64,63,63,62,62,61,60,59
surprise,464,261,138,121,113,83,72,72,69,68,...,43,40,37,35,35,33,30,30,29,28
fear,1483,926,383,322,230,194,143,139,129,126,...,89,89,85,85,84,83,82,81,79,77
happy,4948,1956,1266,1007,381,370,323,311,276,276,...,235,231,223,219,216,212,206,206,201,187
