# Model to rate essays based on the text provided

In [73]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [74]:
# Load the IELTS writing dataset from the CSV file that sits next to the notebook
DATA_PATH = 'ielts_writing_dataset.csv'
dataset = pd.read_csv(DATA_PATH)

# Preview the first five rows
dataset.head(5)

Unnamed: 0,Task_Type,Question,Essay,Examiner_Commen,Task_Response,Coherence_Cohesion,Lexical_Resource,Range_Accuracy,Overall
0,1,The bar chart below describes some changes abo...,"Between 1995 and 2010, a study was conducted r...",,,,,,5.5
1,2,Rich countries often give money to poorer coun...,Poverty represents a worldwide crisis. It is t...,,,,,,6.5
2,1,The bar chart below describes some changes abo...,The left chart shows the population change hap...,,,,,,5.0
3,2,Rich countries often give money to poorer coun...,Human beings are facing many challenges nowada...,,,,,,5.5
4,1,The graph below shows the number of overseas v...,Information about the thousands of visits from...,,,,,,7.0


In [75]:
# Number of distinct values found in the 'Overall' column
# (dropna=False keeps a missing value counted as its own entry, like .unique() does)
l = dataset['Overall'].nunique(dropna=False)

# Average essay length, measured in whitespace-separated words
word_counts = dataset['Essay'].str.split().str.len()
a = word_counts.mean()

l, a

(14, 256.7324041811847)

# Data Preprocessing

We need to remove unnecessary columns from our dataset. We will only keep the "Essay" and "Overall" columns. We will use 80% of the data for training and 20% for testing.

In [76]:
# Data Preprocessing

# Columns that are not used by the model; dropping them leaves the essay text
# and its overall score
unused_columns = [
    'Task_Type',
    'Question',
    'Task_Response',
    'Examiner_Commen',
    'Coherence_Cohesion',
    'Lexical_Resource',
    'Range_Accuracy',
]

processed = dataset.drop(columns=unused_columns)

processed.head()

Unnamed: 0,Essay,Overall
0,"Between 1995 and 2010, a study was conducted r...",5.5
1,Poverty represents a worldwide crisis. It is t...,6.5
2,The left chart shows the population change hap...,5.0
3,Human beings are facing many challenges nowada...,5.5
4,Information about the thousands of visits from...,7.0


In [80]:
# Separate the dataset into training and testing sets. We will use 80% of the data for training and 20% for testing.
# The column we want to predict is the 'Overall' column that contains the IELTS score.
# The column 'Essay' contains the essay text.
# We will use essay text as a feature to predict the score.

processed = processed.dropna()
X = processed['Essay']
y = processed['Overall']

# Split the dataset into training and testing sets using tensorflow
from sklearn.model_selection import train_test_split

(X_train, y_train, X_test, y_test) = train_test_split(X, y, test_size=0.2, random_state=0)

X_train.head()

1322    Raising the price of car fuel is believed by s...
609     Nowadays, with all the technology and studies ...
511     Change is considered a positive thing by some ...
877     The increasing number of criminals are appeare...
1274    The three pie graphs give information on the s...
Name: Essay, dtype: object

In [91]:
X_train_dataset = tf.data.Dataset.from_tensor_slices(X_train)
y_train_dataset = tf.data.Dataset.from_tensor_slices(y_train)

X_test_dataset = tf.data.Dataset.from_tensor_slices(X_test)
y_test_dataset = tf.data.Dataset.from_tensor_slices(y_test)

In [84]:
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=20000,
    output_mode='int',
    output_sequence_length=200)

vectorize_layer.adapt(X_train_dataset)

# Lets see how the vectorization layer works

vectorize_layer.get_vocabulary()[0:10]

['', '[UNK]', 'the', 'to', 'of', 'in', 'and', 'a', 'is', 'that']

In [85]:
# Pad the sequences to have the same length
X_train_dataset = X_train_dataset.map(lambda x: vectorize_layer(x))
y_train_dataset = y_train_dataset.map(lambda x: vectorize_layer(x))

X_train_dataset = X_train_dataset.padded_batch(32)
y_train_dataset = y_train_dataset.padded_batch(32)


In [114]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    tf.keras.layers.Embedding(20000, 256),
    tf.keras.layers.LSTM(1024),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(14, activation='softmax')
])

In [115]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_26 (Tex  (None, 200)               0         
 tVectorization)                                                 
                                                                 
 embedding_5 (Embedding)     (None, 200, 256)          5120000   
                                                                 
 lstm_5 (LSTM)               (None, 1024)              5246976   
                                                                 
 dense_11 (Dense)            (None, 128)               131200    
                                                                 
 dense_12 (Dense)            (None, 14)                1806      
                                                                 
Total params: 10499982 (40.05 MB)
Trainable params: 10499982 (40.05 MB)
Non-trainable params: 0 (0.00 Byte)
____________

In [116]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
train_dataset = tf.data.Dataset.zip((X_train_dataset, X_test_dataset)).batch(32)
# Train the model
model.fit(train_dataset, epochs=3)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2f4f95310>

In [117]:
test_dataset = tf.data.Dataset.zip((y_train_dataset, y_test_dataset)).batch(32)
# Evaluate the model
model.evaluate(test_dataset)




[1.6045989990234375, 0.22648084163665771]