In [1]:
#Importing libraries and loading dataset

import pandas as pd
import numpy as np
from datasets import load_dataset

ds = load_dataset("jatinmehra/Automated-Essay-Scoring-2.0")
ds

README.md:   0%|          | 0.00/161 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17307 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['essay_id', 'full_text', 'score'],
        num_rows: 17307
    })
})

In [2]:
train_ds = ds["train"]

In [3]:
df = train_ds.to_pandas()
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [4]:
pd.set_option("display.max_colwidth",300)
df.head(10)

Unnamed: 0,essay_id,full_text,score
0,000d118,"Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's fam...",3
1,000fe60,"I am a scientist at NASA that is discussing the ""face"" on mars. I will be explaining how the ""face"" is a land form. By sharing my information about this isue i will tell you just that.\n\nFirst off, how could it be a martions drawing. There is no plant life on mars as of rite now that we know of...",3
2,001ab80,"People always wish they had the same technology that they have seen in movies, or the best new piece of technology that is all over social media. However, nobody seems to think of the risks that these kinds of new technologies may have. Cars have been around for many decades, and now manufacture...",4
3,001bdc0,"We all heard about Venus, the planet without almost oxygen with earthquakes, erupting volcanoes and temperatures average over 800 degrees Fahrenheit but what if scientist project the futur into this planet ? Through this article, the author uses evidences appealing to reason and concession to ma...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to argue in favor of keeping the Electoral College.""There are many reasons to keep the Electoral College"" one reason is because it is widely regarded as an anachronism, a dispute over the outcome of an Electoral College vote is possible, but it is less lik...",3
5,0030e86,"If I were to choose between keeping the electoral college or abolishing it, I would chose for abolishing it. The electoral college has a system that can be considered confusing to most americans. This system indirectly transfers citizen's votes to congress. This allows for loopholes and sabaotag...",4
6,0033037,The posibilty of a face reconizing computer would be very helpful for children or even adults. Its would be helpful in a ways of satifying our needs and why showing us thing that would make going on the internet better.\n\nI think that the face reconizing computer would be very reliable; because...,2
7,0033bf4,"What is the Seagoing Cowboys progam?\n\nIt was to help many countries that had been scarred by World War II.\n\nThese people helped with cattle and other animals that were shipped to Europe to help them.\n\nPeople were asked to help by the United Nations Relief and Rehabilitation Administration,...",3
8,0036253,The challenge of exploring Venus\n\nThis storie is about the challeng of exploring Venus. The auhor talked how venus is closest planet in earth. The author support the idea by showing us how studying venus is a worthy pursuit despite the dangers.\n\nin the story the author talked about how study...,2
9,0040e27,There are many reasons why you should join seagoing cowboys program. You would be helping your country. You'd be traveling around the world. Last you'd have lots of fun over seas.\n\nYou'd be helping our country because when Luke was in seagoing cowboy program it was 1945 and world war 2 was hap...,3


In [5]:
#Column cleanup and renaming
df = df.rename(columns={"full_text":"essay"})
df = df[['essay', 'score']]
df.columns.tolist()

['essay', 'score']

In [6]:
#dataset info
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   essay   17307 non-null  object
 1   score   17307 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 270.6+ KB


essay    0
score    0
dtype: int64

In [7]:
df.isnull().sum()

essay    0
score    0
dtype: int64

In [8]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
#Text preprocessing
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\d+","",text)
    text = text.translate(str.maketrans("","",string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [10]:
#Applying preprocessing
df["essay"] = df["essay"].apply(preprocess_text)
df[["essay"]].head(2)

Unnamed: 0,essay
0,many people car live thing dont know use car alot thing happen like get accidet smoke car bad breath someone walk vaubangermany dont proble percent vaubans families carsand percent sold car move street parkig driveways home garages forbidden outskirts freiburd near french swiss borders probaly w...
1,scientist nasa discussing face mars explaining face land form sharing information isue tell first could martions drawing plant life mars rite know means far know possible type life explains could made martians also would martion build face big make since martian next landform many landforms weir...


In [11]:
#feature engineering
df["word_count"] = df["essay"].apply(lambda x: len(x.split()))
df["sentence_count"] = df["essay"].apply(lambda x: len(sent_tokenize(x)) )
df["avg_word_length"] = df["essay"].apply(lambda x: np.mean([len(word) for word in x.split()]))

In [12]:
#TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features = 5000,
    ngram_range = (1,2)
)
X_tfidf = tfidf.fit_transform(df["essay"])

In [13]:
#combining text with features
from scipy.sparse import hstack
features_ = df[["word_count","sentence_count","avg_word_length"]]

X = hstack([X_tfidf,features_])
y = df["score"]

In [14]:
#train_test_split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print(f"X_train:{X_train.shape}\nX_test:{X_test.shape}\nY_train:{y_train.shape}\nY_test:{y_test.shape}")

X_train:(13845, 5003)
X_test:(3462, 5003)
Y_train:(13845,)
Y_test:(3462,)


In [15]:
#Model training
from xgboost import XGBRegressor


xgb = XGBRegressor(
    n_estimators=300,
    max_depth = 6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree =0.8,
    objective="reg:squarederror",
    random_state = 42,
    n_jobs=-1
)


In [24]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)


In [16]:
xgb.fit(X_train,y_train)

In [25]:
y_pred_xgb = xgb.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [26]:
#Model evaluation

from sklearn.metrics import mean_squared_error,r2_score

def eval_model(name,y_true,y_pred):
    print(f"{name} Evaluation....")
    print(f"RMSE Value : {mean_squared_error(y_true,y_pred):3f}")
    print(f"R² Value : {r2_score(y_true,y_pred):.3f} ")

In [21]:
eval_model("XGBRegressor",y_test,y_pred_xgb)

XGBRegressor Evaluation....
RMSE Value : 0.373961
R² Value : 0.661 


In [27]:
eval_model("RandomForestRegressor",y_test,y_pred_rf)

RandomForestRegressor Evaluation....
RMSE Value : 0.418651
R² Value : 0.621 


In [29]:
#model comparison

comparison = pd.DataFrame({
    "Model": ["Random Forest", "XGBoost"],
    "RMSE": [mean_squared_error(y_test,y_pred_rf), mean_squared_error(y_test,y_pred_xgb)],
    "R2 Score": [r2_score(y_test,y_pred_rf), r2_score(y_test,y_pred_xgb)]
})

comparison


Unnamed: 0,Model,RMSE,R2 Score
0,Random Forest,0.418651,0.62055
1,XGBoost,0.373961,0.661055


In [54]:
#sample essay prediction

In [43]:
sample_row = df.iloc[2]

essay_text = sample_row["essay"]
true_score = sample_row["score"]



In [45]:
clean_essay = preprocess_text(essay_text)

essay_tfidf = tfidf.transform([clean_essay])

wc = len(essay_text.split())
sc = len(sent_tokenize(essay_text))
awl = np.mean([len(w) for w in essay_text.split()])

features_essay = np.array([wc,sc,awl]).reshape(1,-1)

In [47]:
min_score = df["score"].min()
max_score = df["score"].max()

In [48]:
final_input = hstack([essay_tfidf,features_essay])

# Predict
raw_pred = xgb.predict(final_input)[0]
clipped_pred = np.clip(raw_pred, min_score, max_score)

print("Model Predicted Score:", round(clipped_pred, 2))


Model Predicted Score: 4.11


In [49]:
#Prediction on a newly added essay
essay_1 = """
Education plays a vital role in shaping an individual's future.
It not only provides knowledge but also helps in developing critical thinking,
problem-solving skills, and moral values. A well-educated society is more likely
to achieve economic growth and social harmony.
"""

cleaned_essay_1 = preprocess_text(essay_1)

In [50]:
tfidf_essay_1 = tfidf.transform([cleaned_essay_1])

In [51]:
new_word_count = len(essay_1.split())
new_sentence_count = len(sent_tokenize(essay_1))
new_avg_word_length = np.mean([len(word) for word in essay_1.split()])

new_features_ = np.array([
    new_word_count,
    new_sentence_count,
    new_avg_word_length
]).reshape(1,-1)

In [52]:
#combining all features 
final_input = hstack([tfidf_essay_1,new_features_])

raw_pred = xgb.predict(final_input)[0]

clipped_pred = np.clip(raw_pred, min_score, max_score)

print("Raw Predicted Score:", round(raw_pred, 2))
print("Clipped Predicted Score:", round(clipped_pred, 2))


Raw Predicted Score: 1.94
Clipped Predicted Score: 1.94
