# Imports

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the essay dataset

In [3]:
import pandas as pd

# Read the essay CSV; the path is relative to the notebook's working directory.
file_path = 'ielts_writing_dataset.csv'  # Make sure the path is correct
dataset = pd.read_csv(file_path)

# Keep just the two columns the model needs: the essay text and its overall score.
essays, scores = dataset['Essay'], dataset['Overall']


# Data Preprocessing

In [4]:
# Tokenization / padding hyper-parameters
vocab_size = 10000     # upper bound on the vocabulary the tokenizer keeps; adjust as needed
max_length = 300       # every essay is padded or cut to this many tokens
padding_type = 'post'  # pad at the end of short sequences
trunc_type = 'post'    # cut from the end of long sequences

# Fit the vocabulary on the essay texts; out-of-vocabulary words map to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(essays)
word_index = tokenizer.word_index

# Turn each essay into a list of integer ids, then into a fixed-width matrix.
sequences = tokenizer.texts_to_sequences(essays)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Build the model

In [5]:
# Regression network: token embedding -> bidirectional LSTM encoder -> dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))  # single linear output unit for regression

# Optimize mean squared error; also report mean absolute error as a metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 64)           640000    
                                                                 
 bidirectional (Bidirection  (None, 128)               66048     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 714369 (2.73 MB)
Trainable params: 714369 (2.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training

In [8]:
# Convert the score column to a NumPy array so it can be split and fed to Keras.
scores = np.array(scores)

# Split data into training and testing sets (80% / 20%).
# A fixed random_state makes the shuffle deterministic, so the held-out set --
# and therefore the metrics reported below -- are the same on every
# Restart & Run All instead of changing with each execution.
from sklearn.model_selection import train_test_split
train_padded, test_padded, train_scores, test_scores = train_test_split(
    padded, scores, test_size=0.2, random_state=42
)

# Train the model.
# NOTE(review): the held-out split is also passed as validation_data, so it is
# used for per-epoch monitoring during training; consider carving out a separate
# validation split if the test metrics need to be fully independent.
num_epochs = 10  # Adjust as needed
model.fit(train_padded, train_scores, epochs=num_epochs, validation_data=(test_padded, test_scores))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28d58fc40>

# Evaluation

In [9]:
# model.evaluate returns the loss followed by the compiled metrics, in order.
# The model was compiled with loss='mean_squared_error' and
# metrics=['mean_absolute_error'], so the second value is the mean absolute
# error of a regression model -- not a classification accuracy.
loss, mae = model.evaluate(test_padded, test_scores)
print(f'Loss: {loss}, MAE: {mae}')

Loss: 1.0347354412078857, Accuracy: 0.7541486620903015
