In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, GRU, LSTM, Dense
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [2]:
p_df = pd.read_csv('preprocessed_dataset.csv')

In [3]:
X = p_df['preprocessed_text'].values
y = p_df['score'].values

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [6]:
def create_rnn_model(input_length, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))
    model.add(SimpleRNN(100))
    model.add(Dense(1))  # No activation for regression
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mae'])
    return model

In [7]:
def create_bi_rnn_model(input_length, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))
    model.add(Bidirectional(SimpleRNN(100)))
    model.add(Dense(1))  # No activation for regression
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mae'])
    return model

In [8]:
def create_gru_model(input_length, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))
    model.add(GRU(100))
    model.add(Dense(1))  # No activation for regression
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mae'])
    return model

In [9]:
def create_lstm_model(input_length, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))
    model.add(LSTM(100))
    model.add(Dense(1))  # No activation for regression
    model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mae'])
    return model

In [10]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, callbacks=[early_stopping])
    loss, mae = model.evaluate(X_test, y_test)
    predictions = model.predict(X_test)
    return predictions, loss, mae

In [11]:
input_length = X_train.shape[1]
vocab_size = len(tokenizer.word_index) + 1

# Create models
models = {
    'RNN': create_rnn_model(input_length, vocab_size),
    'Bidirectional RNN': create_bi_rnn_model(input_length, vocab_size),
    'GRU': create_gru_model(input_length, vocab_size),
    'LSTM': create_lstm_model(input_length, vocab_size)
}



In [13]:
results = {}
for name, model in models.items():
    print(f'Training {name} model...')
    predictions, loss, mae = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    results[name] = {
        'predictions': predictions,
        'loss': loss,
        'mae': mae
    }
    # Calculate BLEU scores (assuming each target value is a sequence)
    bleu_scores = []
    for i in range(len(y_test)):
        reference = [str(y_test[i]).split()]  # Reference sequence
        candidate = str(predictions[i][0]).split()  # Predicted sequence
        bleu_score = sentence_bleu(reference, candidate)
        bleu_scores.append(bleu_score)
    results[name]['bleu'] = np.mean(bleu_scores)

Training RNN model...
Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.8907 - mae: 0.6162 - val_loss: 10.8043 - val_mae: 2.7430
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 0.3559 - mae: 0.3985 - val_loss: 10.6414 - val_mae: 2.7641
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - loss: 0.3632 - mae: 0.3461 - val_loss: 10.5004 - val_mae: 2.7934
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.1482 - mae: 0.2439 - val_loss: 10.3484 - val_mae: 2.8076
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - loss: 0.1269 - mae: 0.2372 - val_loss: 10.2520 - val_mae: 2.7975
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - loss: 0.1634 - mae: 0.2763 - val_loss: 10.2521 - val_mae: 2.7935
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms

In [14]:
for name, result in results.items():
    print(f'\n{name} Model:')
    print(f'Loss: {result["loss"]}, MAE: {result["mae"]}, BLEU: {result["bleu"]}')
    print(f'Sample Predictions: {result["predictions"][:5].flatten()}, Actual: {y_test[:5]}')


RNN Model:
Loss: 10.241813659667969, MAE: 2.7883059978485107, BLEU: 0.0
Sample Predictions: [5.0885963 4.219697  5.0478888 7.044147  5.5901785], Actual: [7 0 0 7 6]

Bidirectional RNN Model:
Loss: 12.84733772277832, MAE: 3.1678125858306885, BLEU: 0.0
Sample Predictions: [5.07799   5.079793  4.9224935 5.9642425 3.2273867], Actual: [7 0 0 7 6]

GRU Model:
Loss: 6.0418500900268555, MAE: 1.9935212135314941, BLEU: 0.0
Sample Predictions: [7.581923  3.1084423 1.3054769 7.3645463 3.7163413], Actual: [7 0 0 7 6]

LSTM Model:
Loss: 5.387274265289307, MAE: 1.8389149904251099, BLEU: 0.0
Sample Predictions: [7.2656846 2.9552796 0.4695965 8.298201  4.043827 ], Actual: [7 0 0 7 6]
