In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import keras
import tensorflow as tf

In [2]:
import spacy

In [32]:
import pickle

In [3]:
df = pd.read_csv('text.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [4]:
df = df.drop('Unnamed: 0', axis = 1)
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
df['label'].unique()

array([4, 0, 2, 1, 5, 3], dtype=int64)

In [6]:
df.isna().sum()

text     0
label    0
dtype: int64

In [7]:
df.duplicated().sum()

686

In [8]:
df = df.drop_duplicates()

In [9]:
emotion_map = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

df['label'] = df['label'].map(emotion_map)

In [10]:
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sadness
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sadness
4,i am a kindergarten teacher and i am thoroughl...,fear


In [11]:
nlp = spacy.load('en_core_web_sm')
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

print('hello')

hello


In [12]:
df['text'] = df['text'].apply(preprocess_text)
df = pd.get_dummies(df, columns=['label'], dtype=int)
df

Unnamed: 0,text,label_anger,label_fear,label_joy,label_love,label_sadness,label_surprise
0,feel helpless heavy hearted,0,1,0,0,0,0
1,ve enjoy able slouch relax unwind frankly need...,0,0,0,0,1,0
2,give internship dmrg feel distraught,0,1,0,0,0,0
3,not know feel lost,0,0,0,0,1,0
4,kindergarten teacher thoroughly weary job havi...,0,1,0,0,0,0
...,...,...,...,...,...,...,...
416804,feel like tell horny devil find site suited so...,0,0,0,1,0,0
416805,begin realize feel agitated restless thought dish,1,0,0,0,0,0
416806,feel curious previous early dawn time don t se...,0,0,0,0,0,1
416807,feel becuase tyranical nature government el sa...,1,0,0,0,0,0


In [13]:
print('hello')

hello


In [16]:
df.duplicated().sum()

0

In [15]:
df = df.drop_duplicates()

In [17]:
df.shape

(394107, 7)

In [22]:
df[df['text'].apply(lambda x: len(x.split()) == 0)]

Unnamed: 0,text,label_anger,label_fear,label_joy,label_love,label_sadness,label_surprise


In [21]:
df = df.drop(df[df['text'].apply(lambda x: len(x.split()) == 0)].index)

In [24]:
df.iloc[:, 1:].shape

(394103, 6)

In [26]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_valid, y_train, y_valid = train_test_split(df['text'], df.iloc[:, 1:], test_size = 0.1, random_state = 42)

In [30]:
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=42)

In [33]:
tokenizer_five = tf.keras.preprocessing.text.Tokenizer()
tokenizer_five.fit_on_texts(X_train)

In [34]:
with open('tokenizer_five.pkl', 'wb') as handle:
    pickle.dump(tokenizer_five, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
train_sequences = tokenizer_five.texts_to_sequences(X_train)
valid_sequences = tokenizer_five.texts_to_sequences(X_valid)
test_sequences = tokenizer_five.texts_to_sequences(X_test)
word_index = tokenizer_five.word_index
vocab_size = len(word_index) + 1

In [36]:
max_length = 40
padded_train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length, padding='post')
padded_valid_sequences = tf.keras.preprocessing.sequence.pad_sequences(valid_sequences, maxlen=max_length, padding='post')
padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [37]:
embedding_dim = 512
embedding_matrix = tf.keras.initializers.GlorotUniform()(shape=(vocab_size, embedding_dim))

In [43]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, LSTM, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Model

input_text = Input(shape=(max_length,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True)(input_text)
bidirectional_gru_1 = Bidirectional(GRU(64, return_sequences=True))(embedding)
bidirectional_lstm_1 = Bidirectional(LSTM(64, return_sequences=True))(bidirectional_gru_1)
bidirectional_gru_2 = Bidirectional(GRU(32, return_sequences=True))(bidirectional_lstm_1)
bidirectional_lstm_2 = Bidirectional(LSTM(32, return_sequences=True))(bidirectional_gru_2)
pooling = GlobalMaxPooling1D()(bidirectional_lstm_2)
dense_1 = Dense(64, activation='relu')(pooling)
dropout_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(32, activation='relu')(dense_1)
dropout_2 = Dropout(0.25)(dense_2)
output = Dense(6, activation='softmax')(dropout_2)

model = Model(inputs=input_text, outputs=output)

model.compile(optimizer=tf.keras.optimizers.AdamW(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()



In [44]:
def build_tf_dataset(features, labels, BATCH_SIZE):
    reviews = tf.convert_to_tensor(features)
    sentiments = tf.constant(labels)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))  
    dataset = dataset.cache()  
    dataset = dataset.shuffle(len(features)) 
    dataset = dataset.batch(BATCH_SIZE)  
    dataset = dataset.prefetch(tf.data.AUTOTUNE) 
    return dataset


In [45]:
train_dataset = build_tf_dataset(padded_train_sequences, y_train, 512)
valid_dataset = build_tf_dataset(padded_valid_sequences, y_valid, 512)
test_dataset = build_tf_dataset(padded_test_sequences, y_test, 512)

In [46]:
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    mode='min',
    min_delta=0.01,
    verbose=1,
    restore_best_weights=True
)

In [47]:
history = model.fit(train_dataset, validation_data = valid_dataset, epochs = 10, callbacks = [earlystopping])

Epoch 1/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m551s[0m 778ms/step - accuracy: 0.6487 - loss: 0.8772 - val_accuracy: 0.9248 - val_loss: 0.1558
Epoch 2/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 722ms/step - accuracy: 0.9284 - loss: 0.1463 - val_accuracy: 0.9243 - val_loss: 0.1422
Epoch 3/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 722ms/step - accuracy: 0.9327 - loss: 0.1232 - val_accuracy: 0.9282 - val_loss: 0.1404
Epoch 3: early stopping
Restoring model weights from the end of the best epoch: 2.


In [49]:
loss, accuracy = model.evaluate(test_dataset)
print(f"The model loss on test dataset is {round(loss, 4)}")
print(f"The model accuracy on test dataset is {round(accuracy, 2)*100}%")

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 160ms/step - accuracy: 0.9309 - loss: 0.1337
The model loss on test dataset is 0.1325
The model accuracy on test dataset is 93.0%


In [50]:
model.save("my_model_five.h5")



In [53]:
def classify():
    def preprocess(text, tokenizer_five, max_length):
        sequences = tokenizer_five.texts_to_sequences([text])
        padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')
        return padded_sequences
        
    custom_text = input("How do you feel today?: ")
    text = custom_text
    #sentiment_scores = analyzer.polarity_scores(text)
    #print(sentiment_scores)
    #sentiment = 'positive' if sentiment_scores['compound'] >= 0.05 else 'negative' if sentiment_scores['compound'] <= -0.05 else 'neutral'
    #print(sentiment)

    padded_custom_text = preprocess(custom_text, tokenizer_five, max_length)

    predictions = model.predict(padded_custom_text)
    predicted_class = tf.argmax(predictions, axis=1).numpy()

    print(f"Predicted class: {predicted_class}")

    for idx, prob in enumerate(predictions[0]):
        print(f"Class {idx}: {prob:.4f}")

In [54]:
classify()

How do you feel today?:  i feel very angry


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 945ms/step
Predicted class: [0]
Class 0: 0.9995
Class 1: 0.0000
Class 2: 0.0000
Class 3: 0.0000
Class 4: 0.0004
Class 5: 0.0000
