# ***1 _ Mounting Drive***

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write files under that path
drive.mount('/content/drive')

Mounted at /content/drive


# ***2 _ Unzipping the IELTS Dataset***

In [3]:
import zipfile
import os

# Where the zipped dataset lives on Drive, and where it is unpacked locally
zip_path = '/content/drive/MyDrive/AES_datasets/IELTS_Dataset.zip'
extract_path = '/content/IELTS_Dataset'

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("✅ Dataset extracted to:", extract_path)


✅ Dataset extracted to: /content/IELTS_Dataset


# ***3 _ Loading the Dataset***

In [4]:
import os

# Show what the dataset folder contains
dataset_path = '/content/IELTS_Dataset'
dataset_files = os.listdir(dataset_path)
print(dataset_files)

['ielts_writing_dataset.csv']


In [5]:
import pandas as pd

# Read the extracted CSV into a DataFrame
csv_path = '/content/IELTS_Dataset/ielts_writing_dataset.csv'
df = pd.read_csv(csv_path)

# Print (rows, columns), then preview the first rows as the cell output
print(df.shape)
df.head()


(1435, 9)


Unnamed: 0,Task_Type,Question,Essay,Examiner_Commen,Task_Response,Coherence_Cohesion,Lexical_Resource,Range_Accuracy,Overall
0,1,The bar chart below describes some changes abo...,"Between 1995 and 2010, a study was conducted r...",,,,,,5.5
1,2,Rich countries often give money to poorer coun...,Poverty represents a worldwide crisis. It is t...,,,,,,6.5
2,1,The bar chart below describes some changes abo...,The left chart shows the population change hap...,,,,,,5.0
3,2,Rich countries often give money to poorer coun...,Human beings are facing many challenges nowada...,,,,,,5.5
4,1,The graph below shows the number of overseas v...,Information about the thousands of visits from...,,,,,,7.0


In [6]:
# List the column labels of the loaded DataFrame
print(df.columns)

Index(['Task_Type', 'Question', 'Essay', 'Examiner_Commen', 'Task_Response',
       'Coherence_Cohesion', 'Lexical_Resource', 'Range_Accuracy', 'Overall'],
      dtype='object')


# ***4 _ Cleaning the Text***

In [7]:
import re

def clean_text(text):
    """Normalize an essay to lowercase ASCII letters separated by single spaces.

    Parameters
    ----------
    text : object
        The raw essay. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as "no essay".

    Returns
    -------
    str
        The text lowercased, with every character other than ``a-z`` and
        whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped. Missing input gives ``''``.
    """
    # A missing value would otherwise be stringified to "none"/"nan" and
    # survive cleaning as if it were a real word in the essay.
    # `text != text` is only true for NaN, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                            # lowercase
    text = re.sub(r'[^a-z\s]', '', text)                # drop digits, punctuation and non-ASCII characters
    text = re.sub(r'\s+', ' ', text).strip()            # collapse whitespace runs and trim the ends
    return text

# Keep the original essays and store their cleaned form in a new column
raw_essays = df['Essay']
df['clean_essay'] = raw_essays.apply(clean_text)

# Side-by-side preview of raw and cleaned text
df.loc[:, ['Essay', 'clean_essay']].head()


Unnamed: 0,Essay,clean_essay
0,"Between 1995 and 2010, a study was conducted r...",between and a study was conducted representing...
1,Poverty represents a worldwide crisis. It is t...,poverty represents a worldwide crisis it is th...
2,The left chart shows the population change hap...,the left chart shows the population change hap...
3,Human beings are facing many challenges nowada...,human beings are facing many challenges nowada...
4,Information about the thousands of visits from...,information about the thousands of visits from...


# ***5 _ Extracting Text-Based Features from Dataset***

In [8]:
import numpy as np

def extract_features(df):
    """Add simple length-based features computed from the ``clean_essay`` column.

    The columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean_essay`` column of strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` with three added columns:
        ``essay_len`` (number of characters), ``word_count`` (number of
        whitespace-separated tokens) and ``avg_word_len`` (mean token
        length, ``0`` when the essay has no tokens).
    """
    essays = df['clean_essay']

    # Tokenize each essay once and reuse the result for both word features
    tokens = essays.apply(lambda text: text.split())

    df['essay_len'] = essays.apply(len)
    df['word_count'] = tokens.apply(len)
    # Guard the empty case: np.mean of an empty list would be NaN with a warning
    df['avg_word_len'] = tokens.apply(
        lambda words: np.mean([len(word) for word in words]) if words else 0
    )
    return df

df = extract_features(df)

# Show the cleaned text next to the features derived from it
preview_cols = ['clean_essay', 'essay_len', 'word_count', 'avg_word_len']
df[preview_cols].head()


Unnamed: 0,clean_essay,essay_len,word_count,avg_word_len
0,between and a study was conducted representing...,728,119,5.12605
1,poverty represents a worldwide crisis it is th...,1537,258,4.96124
2,the left chart shows the population change hap...,867,150,4.786667
3,human beings are facing many challenges nowada...,1883,298,5.322148
4,information about the thousands of visits from...,874,146,4.993151


# ***6 _ Splitting the Data into Training and Testing Sets***

In [9]:
from sklearn.model_selection import train_test_split

# Inputs: the cleaned essay text plus the numeric features; target: the overall band
numeric_cols = ['essay_len', 'word_count', 'avg_word_len']
X_text = df['clean_essay']
X_numeric = df[numeric_cols]
y = df['Overall']  # <-- Make sure this is the correct name of your target column

# One call splits all three objects with the same row assignment (80/20, fixed seed)
(X_text_train, X_text_test,
 X_num_train, X_num_test,
 y_train, y_test) = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42)

# Re-attach the numeric columns to the text column; both halves share the same index
X_train_combined = X_text_train.to_frame(name='text').join(X_num_train)
X_test_combined = X_text_test.to_frame(name='text').join(X_num_test)


# ***7 _ Building & Training the Model***

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Text branch: TF-IDF over unigrams and bigrams, capped at 5000 features
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
])

# Numeric branch: scale the length-based features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Route the 'text' column to the text branch and the numeric columns to the numeric branch
numeric_columns = ['essay_len', 'word_count', 'avg_word_len']
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, 'text'),
    ('num', numeric_transformer, numeric_columns),
])

# Full model: preprocessing followed by a seeded random forest regressor
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# Fit preprocessing and regressor together on the training split
pipeline.fit(X_train_combined, y_train)


# ***8 _ Evaluating the Model***

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out essays
y_pred = pipeline.predict(X_test_combined)

# Regression metrics on the test split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Assemble the report and emit it in one go
summary_lines = [
    "📊 Evaluation Results:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R² Score: {r2:.4f}",
]
print("\n".join(summary_lines))


📊 Evaluation Results:
Mean Absolute Error (MAE): 0.6478
Mean Squared Error (MSE): 0.7024
R² Score: 0.3232


# ***9 _ Saving the Model***

In [12]:
import joblib

# Persist the whole fitted pipeline (preprocessing + model) as a single file
model_filename = 'ielts_aes_model.pkl'
joblib.dump(pipeline, model_filename)


['ielts_aes_model.pkl']

In [13]:
# Copy the saved model file into the AES_datasets folder on the mounted Drive
!cp ielts_aes_model.pkl /content/drive/MyDrive/AES_datasets/

In [14]:
# Trigger a download of the saved model file from the Colab runtime
from google.colab import files
files.download('ielts_aes_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ***10 _ Making Your Own Predictions***

In [16]:
# a) Load the saved pipeline back from the Drive copy
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files you created yourself or otherwise trust.
model = joblib.load('/content/drive/MyDrive/AES_datasets/ielts_aes_model.pkl')

In [44]:
# b) Essay to score; paste the raw text between the triple quotes
# (it is cleaned in the next cell, so punctuation and casing can be left as written)
essay = """ There is no doubt that having some leisure time during studying reenergizes the brain to continue working efficiently. However, students in some countries are under extreme pressure to study hard and therefore, they have minimal leisure time. The possible reasons for this trend as well as suggested solutions will be discussed in details.

One possible reason for students to face a lot of pressure to work hard on their education with no time off would be the high cost of education. For instance, expensive courses put a financial burden on families and students which forces the students to try hard to complete these courses successfully and quickly. As a result, these students ignore the need for some spare time and focus on their study work. Another possible reason would be the amount of study materials which is becoming extensive for a short semester. Consequently, this pressure leaves no choice for students except to study as hard as possible to be able to finish this material on time. Thus, it is obvious that these students have no time left to have some leisure activities.

However, some solutions could be suggested to help solve this problem. One possible solution would be reducing the cost of educational courses in these countries by government fundings. By doing this, both the students and their families would have less financial pressure and therefore the students could be less stressed during their studies which might enable them to have some free time. Another solution would be study groups, if students study in groups, then each one of the group members could summarize part of the curriculum and shares it with the rest of the group. This would save a lot of time for all of the students in the group and as a result the amount of pressure would be reduced. These suggestions could help the students to have some leisure time which is important for them to stay focused.

In conclusion, there are many reasons that put the students in some countries under stress and pressure to study hard and leave them no time for leisure activities, however, the above suggested solutions could tackle this problem and allow the students to have some study free time which is essential for them to recharge their energy. """

In [55]:
# c) Score the essay using the same preprocessing that was applied to the training data

# Clean the raw essay exactly as the training essays were cleaned
cleaned_essay = clean_text(essay)

# Derive the numeric features with the same helper used for training, instead of
# re-implementing the formulas here, so the two code paths cannot drift apart
single_df = extract_features(pd.DataFrame({'clean_essay': [cleaned_essay]}))

# The pipeline was fitted on a frame whose essay column is called 'text'
single_df = single_df.rename(columns={'clean_essay': 'text'})

# Select the columns the pipeline was trained on
X_single = single_df[['text', 'essay_len', 'word_count', 'avg_word_len']]

# Predict; the model returns one value per row, so take the only element
predicted_score = model.predict(X_single)[0]

# Show result
print(f"Predicted Score: {round(predicted_score, 2)}")


Predicted Score: 7.18




---



# ***REQUIREMENTS FILE***


In [46]:
# Write the output of `pip freeze` (the packages installed in the current environment) to requirements.txt
!pip freeze > requirements.txt

In [47]:
from google.colab import files
# Trigger a download of the generated requirements.txt from the Colab runtime
files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>