#### 3. Обробка текстових даних 
Завантажте набір текстових даних (з мітками класів). Проведіть передобробку даних (видаліть стоп-слова, пунктуацію), за допомогою wordcloud зробіть візуалізацію найбільш поширених слів або n-gram у кожному класі. Векторизуйте тексти (наприклад за допомогою sklearn.feature_extraction.text. TfidfVectorizer). Проведіть класифікацію текстових даних, зробіть оцінку якості.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# supress warnings
import warnings 
warnings.filterwarnings('ignore')

In [4]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

In [5]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
go_emotions = pd.read_parquet(r"train-00000-of-00001.parquet")
go_emotions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float32
 7   rater_id              211225 non-null  int32  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int32  
 10  amusement             211225 non-null  int32  
 11  anger                 211225 non-null  int32  
 12  annoyance             211225 non-null  int32  
 13  approval              211225 non-null  int32  
 14  caring                211225 non-null  int32  
 15  

In [7]:
# Preprocess text: remove stopwords, punctuation, and lowercase
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

go_emotions['clean_text'] = go_emotions['text'].apply(preprocess_text)

In [8]:

# # Create word clouds for each emotion
# emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

# # Create word clouds for each emotion
# for emotion in emotions[:5]:
#     text = " ".join(go_emotions[go_emotions[emotion] == 1]['clean_text'])
#     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
#     plt.figure(figsize=(10, 5))
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis('off')
#     plt.title(f"Word Cloud for {emotion}")
#     plt.show()


In [14]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(go_emotions['clean_text'])

# Dictionary to store F1-scores for each emotion
f1_scores = {}

# Loop through each emotion and perform classification
for emotion in emotions:
    print(f"Classifying emotion: {emotion}")
    
    # Define the target variable
    y = go_emotions[emotion]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a classifier
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = classifier.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores[emotion] = f1
    print(f"F1-score for emotion '{emotion}': {f1:.4f}")
    
    # Print classification report
    print(f"Classification report for emotion '{emotion}':")
    print(classification_report(y_test, y_pred))
    
    # Print confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {emotion}:")
    print(cm)
    
    print("\n" + "="*80 + "\n")

# Sort emotions by F1-score
sorted_f1_scores = sorted(f1_scores.items(), key=lambda item: item[1], reverse=True)

# Print summary
print("Summary of Emotion Classification:")
print("Best Classified Emotions:")
for emotion, score in sorted_f1_scores[:5]:
    print(f"{emotion}: {score:.4f}")

print("\nWorst Classified Emotions:")
for emotion, score in sorted_f1_scores[-5:]:
    print(f"{emotion}: {score:.4f}")

# Analysis of why some emotions are classified better or worse
print("\nAnalysis:")
print("Emotions with higher F1-scores likely have more distinct and consistent patterns in the text data.")
print("Emotions with lower F1-scores may have overlapping features with other emotions or fewer examples in the dataset.")


Classifying emotion: admiration
F1-score for emotion 'admiration': 0.9147
Classification report for emotion 'admiration':
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     38789
           1       0.66      0.27      0.38      3456

    accuracy                           0.93     42245
   macro avg       0.80      0.63      0.67     42245
weighted avg       0.92      0.93      0.91     42245

Confusion Matrix for admiration:
[[38324   465]
 [ 2535   921]]


Classifying emotion: amusement
F1-score for emotion 'amusement': 0.9531
Classification report for emotion 'amusement':
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     40354
           1       0.59      0.30      0.40      1891

    accuracy                           0.96     42245
   macro avg       0.78      0.65      0.69     42245
weighted avg       0.95      0.96      0.95     42245

Confusion Matrix for amusement:
[[39968

### 3. Рекурентні нейронні мережі
Вирішіть задачу класифікації текстів (з якими ви працювали в лабораторній № 2) за допомогою рекурентної нейромережі двома способами: <br>
а) навчить мережу і embedding шар з нуля (from scratch) <br>
б) використовуючи pretrained word embeddings <br>

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder


Decided to train model to classify neutral and non-neutral emotions

In [None]:

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(go_emotions['neutral'])

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(go_emotions['clean_text'])
sequences = tokenizer.texts_to_sequences(go_emotions['clean_text'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# Split data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y_encoded, test_size=0.2, random_state=42)


In [22]:

# Model 1: Train embedding layer from scratch
model_scratch = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model_scratch.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model_scratch.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 59ms/step - accuracy: 0.7379 - loss: 0.5766 - val_accuracy: 0.7352 - val_loss: 0.5780
Epoch 2/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 64ms/step - accuracy: 0.7415 - loss: 0.5718 - val_accuracy: 0.7352 - val_loss: 0.5782
Epoch 3/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 64ms/step - accuracy: 0.7373 - loss: 0.5763 - val_accuracy: 0.7352 - val_loss: 0.5780
Epoch 4/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 69ms/step - accuracy: 0.7402 - loss: 0.5731 - val_accuracy: 0.7352 - val_loss: 0.5781
Epoch 5/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 68ms/step - accuracy: 0.7377 - loss: 0.5756 - val_accuracy: 0.7352 - val_loss: 0.5781


<keras.src.callbacks.history.History at 0x255175cce50>

In [23]:

# Model 2: Use pretrained word embeddings (e.g., GloVe)
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_matrix = np.zeros((10000, 100))
for word, i in tokenizer.word_index.items():
    if i < 10000:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

model_pretrained = Sequential([
    Embedding(input_dim=10000, output_dim=100, weights=[embedding_matrix], input_length=100, trainable=False),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model_pretrained.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model_pretrained.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 51ms/step - accuracy: 0.7394 - loss: 0.5788 - val_accuracy: 0.7352 - val_loss: 0.5781
Epoch 2/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 52ms/step - accuracy: 0.7379 - loss: 0.5757 - val_accuracy: 0.7352 - val_loss: 0.5785
Epoch 3/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 53ms/step - accuracy: 0.7407 - loss: 0.5726 - val_accuracy: 0.7352 - val_loss: 0.5780
Epoch 4/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 62ms/step - accuracy: 0.7393 - loss: 0.5740 - val_accuracy: 0.7352 - val_loss: 0.5782
Epoch 5/5
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 56ms/step - accuracy: 0.7400 - loss: 0.5733 - val_accuracy: 0.7352 - val_loss: 0.5780


<keras.src.callbacks.history.History at 0x255182973d0>

In [24]:

# Compare results
loss_scratch, accuracy_scratch = model_scratch.evaluate(X_test, y_test)
loss_pretrained, accuracy_pretrained = model_pretrained.evaluate(X_test, y_test)

print(f"Model trained from scratch - Loss: {loss_scratch}, Accuracy: {accuracy_scratch}")
print(f"Model with pretrained embeddings - Loss: {loss_pretrained}, Accuracy: {accuracy_pretrained}")

[1m1321/1321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - accuracy: 0.7328 - loss: 0.5807
[1m1321/1321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - accuracy: 0.7328 - loss: 0.5805
Model trained from scratch - Loss: 0.5780812501907349, Accuracy: 0.7352349162101746
Model with pretrained embeddings - Loss: 0.5780116319656372, Accuracy: 0.7352349162101746
