In [18]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler


In [8]:
df = pd.read_csv('../Data/CleanedData.csv')

In [9]:
label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])

In [10]:
X = df['transformed text']
y = df['subject']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
max_words = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)


In [13]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
max_sequence_length = 100  # You can adjust this to your desired sequence length
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

In [19]:
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [20]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))  # 6 output classes


2023-09-26 13:14:47.852755: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-26 13:14:48.046054: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-26 13:14:48.046330: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-26 13:14:48.046772: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [22]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [23]:
model.fit(X_train_resampled, y_train_resampled, validation_data=(X_test, y_test), epochs=20, batch_size=32, callbacks=[early_stopping])

Epoch 1/20


2023-09-26 13:15:32.605932: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 33804000 exceeds 10% of free system memory.
2023-09-26 13:15:34.169293: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-09-26 13:15:34.321881: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f938c0374d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-26 13:15:34.321914: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2023-09-26 13:15:34.325873: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-26 13:15:34.392009: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or dir

Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.History at 0x7f94bdc79b90>

In [25]:
y_pred_probabilities = model.predict(X_test)
y_pred = np.argmax(y_pred_probabilities, axis=1)
f1 = f1_score(y_test, y_pred, average='weighted')
classification_rep = classification_report(y_test, y_pred)

print(f'Weighted F1 Score: {f1}')
print(classification_rep)

Weighted F1 Score: 0.6281729071323268
              precision    recall  f1-score   support

           0       0.03      0.05      0.04       287
           1       0.79      0.74      0.77      1876
           2       0.79      0.52      0.62       165
           3       0.20      0.20      0.20       835
           4       0.66      0.55      0.60      3475
           5       0.71      0.96      0.82      1987

    accuracy                           0.63      8625
   macro avg       0.53      0.50      0.51      8625
weighted avg       0.64      0.63      0.63      8625

