In [46]:
import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import keras
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [47]:
print(tf.__version__)
print(keras.__version__)

2.18.0
3.7.0


## Dataset

In [48]:
train = pd.read_table('train.txt', delimiter=';', header=None)
test = pd.read_table('test.txt', delimiter=';', header=None)
val = pd.read_table('val.txt', delimiter=';', header=None)

In [49]:
data = pd.concat([train, val, test])

In [50]:
data.head()

Unnamed: 0,0,1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [51]:
data.columns=["text", "label"]

In [52]:
data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [53]:
data.label.unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [54]:
data.describe()

Unnamed: 0,text,label
count,20000,20000
unique,19948,6
top,i have chose for myself that makes me feel ama...,joy
freq,2,6761


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20000 non-null  object
 1   label   20000 non-null  object
dtypes: object(2)
memory usage: 468.8+ KB


In [56]:
data.isna().any(axis=1).sum()

np.int64(0)

In [57]:
data.drop_duplicates(inplace=True)

In [58]:
data.text[0]

0                              i didnt feel humiliated
0    im feeling quite sad and sorry for myself but ...
0    im feeling rather rotten so im not very ambiti...
Name: text, dtype: object

In [59]:
stemmer = PorterStemmer()

In [60]:
def preprocess(line):
    # Enlever les caractères spéciaux et les chiffres
    line = re.sub('[^a-zA-Z]', ' ', line)
    
    # Convertir en miniscule
    line = line.lower()
    
    # Tokenisation
    tokenized_line = word_tokenize(line, language='english')
    
    data = [stemmer.stem(word) for word in tokenized_line if not word in stopwords.words('english')]
    return " ".join(data)

In [61]:
data.text=data['text'].apply(lambda x: preprocess(x))

In [62]:
data.head(4)

Unnamed: 0,text,label
0,didnt feel humili,sadness
1,go feel hopeless damn hope around someon care ...,sadness
2,im grab minut post feel greedi wrong,anger
3,ever feel nostalg fireplac know still properti,love


In [63]:
lemmatizer = WordNetLemmatizer()

In [64]:
def preprocess2(line):
    # Enlever les caractères spéciaux et les chiffres
    line = re.sub('[^a-zA-Z]', ' ', line)
    
    # Convertir en miniscule
    line = line.lower()
    
    # Tokenisation
    tokenized_line = word_tokenize(line, language='english')
    
    # Lemmatisation
    data = [lemmatizer.lemmatize(word) for word in tokenized_line if not word in stopwords.words('english')]
    return " ".join(data)

### Algorithme de classification

In [65]:
from sklearn import preprocessing

In [66]:
label_encoder = preprocessing.LabelEncoder()
data['enco_lable'] = label_encoder.fit_transform(data['label'])

In [67]:
data.head(50)

Unnamed: 0,text,label,enco_lable
0,didnt feel humili,sadness,4
1,go feel hopeless damn hope around someon care ...,sadness,4
2,im grab minut post feel greedi wrong,anger,0
3,ever feel nostalg fireplac know still properti,love,3
4,feel grouchi,anger,0
5,ive feel littl burden late wasnt sure,sadness,4
6,ive take milligram time recommend amount ive f...,surprise,5
7,feel confus life teenag jade year old man,fear,1
8,petrona year feel petrona perform well made hu...,joy,2
9,feel romant,love,3


### Create corpus

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
vectorizer = CountVectorizer(max_features=5000)
data_vector = vectorizer.fit_transform(data['text']).toarray()

In [70]:
data_vector[0]

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data_vector, data['enco_lable'], test_size=0.2, random_state=42)

In [72]:
X_train.shape

(15999, 5000)

## Deep Learning (NN)

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

In [74]:
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(12,activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(6, activation="softmax"))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=5, batch_size=5)

Epoch 1/5
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.5090 - loss: 1.2756
Epoch 2/5
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8911 - loss: 0.3279
Epoch 3/5
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9375 - loss: 0.1886
Epoch 4/5
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9556 - loss: 0.1373
Epoch 5/5
[1m3200/3200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9660 - loss: 0.0977


<keras.src.callbacks.history.History at 0x1a5f40f8080>

In [75]:
_,accuracy = model.evaluate(X_test, Y_test)
print(accuracy)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8494 - loss: 0.5292
0.8360000252723694


In [76]:
text = "i'm bad"
text = preprocess(text)
text1 = vectorizer.transform([text]).toarray()
a = model.predict(text1)
a = np.argmax(a, axis=1)
label_encoder.inverse_transform(a)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


'sadness'