# Imports

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Import essays

In [3]:
import pandas as pd

# Load the dataset
file_path = 'ielts_writing_dataset.csv'  # Make sure the path is correct
dataset = pd.read_csv(file_path)

# Select only the 'Essay' and 'Overall' columns
essays = dataset['Essay']
scores = dataset['Overall']


# Data Preprocessing

In [4]:
# Parameters
vocab_size = 10000  # Adjust as needed
max_length = 300    # Adjust based on your essay length
padding_type = 'post'
trunc_type = 'post'

# Tokenizing the essays
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(essays)
word_index = tokenizer.word_index

# Converting essays to sequences
sequences = tokenizer.texts_to_sequences(essays)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Build

In [5]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Single output node for regression
])

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 64)           640000    
                                                                 
 bidirectional (Bidirection  (None, 128)               66048     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 714369 (2.73 MB)
Trainable params: 714369 (2.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Training

In [14]:
# Convert scores to numpy array
scores = np.array(scores)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
train_padded, test_padded, train_scores, test_scores = train_test_split(padded, scores, test_size=0.2)

# Train the model
num_epochs = 5  # Adjust as needed
model.fit(train_padded, train_scores, epochs=num_epochs, validation_data=(test_padded, test_scores))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x290d22790>

# Evaluation

In [15]:
loss, accuracy = model.evaluate(test_padded, test_scores)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 0.37680962681770325, Accuracy: 0.3808192312717438


# Save the model

In [17]:
model.save('model.h5')

  saving_api.save_model(


# Load and test the model

In [18]:
model = tf.keras.models.load_model('model.h5')

# Predict the rating of essays

In [19]:
def preprocess_new_essays(new_essays, tokenizer, max_length):
    """
    Preprocess new essays using the same tokenizer and parameters used for training data.
    """
    # Convert essays to sequences
    sequences = tokenizer.texts_to_sequences(new_essays)
    # Pad the sequences to ensure uniform length
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded

def predict_scores(new_essays, tokenizer, model, max_length):
    """
    Predict scores for new essays.
    """
    # Preprocess essays
    preprocessed_essays = preprocess_new_essays(new_essays, tokenizer, max_length)
    # Predict scores
    predicted_scores = model.predict(preprocessed_essays)
    return predicted_scores

# Example usage
new_essays = ["Essay text goes here", "Essay 2 text goes here"]
predicted_scores = predict_scores(new_essays, tokenizer, model, max_length)

# Displaying predicted scores
for essay, score in zip(new_essays, predicted_scores):
    print(f"Essay: {essay}\nPredicted Score: {score}\n")


Essay: Between 1995 and 2010, a study was conducted representing the percentages of people born in Australia, versus people born outside Australia, living in urban, rural, and town. First, in 1995, cities represented the major percentage of habitat by roughly 50 percent, followed by rural areas and towns came in last, among people born in Australia. On the other hand, people born outside Australia, cities showed the most percentages of 6o percent, followed by rural areas and towns. In 2010, among people born in Australia, cities had an increase more than 20 percent increase in the total representation and a major decrease in towns and rural areas. Conversely, people born outside Australia, cities had the most percentage among both studies, followed by rural areas and towns.
Predicted Score: [5.4486704]

Essay: International sports events require the most well-trained athletes for each country, in order to achieve this goal countries make an effort to build infrastructure designed to tr