<a href="https://colab.research.google.com/github/Ayushee-Seeburrun/ASAG-with-Data-Augmentation/blob/main/originaldataset_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the dataset path used later in
# this notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
# Load the pre-split train and test CSV files from the Drive folder.
# Both files follow the same "<split>_set.csv" naming inside dataset_path.
dataset_path = "/content/drive/MyDrive/Data Augmentation/Dataset Splits"
train_df, test_df = (
    pd.read_csv(f"{dataset_path}/{split_name}_set.csv")
    for split_name in ("train", "test")
)

In [None]:
# Keep only the text (EssayText) and label (Score2) columns, drop rows with a
# missing value in either, then cast text to str and labels to int.
column_types = {"EssayText": str, "Score2": int}
kept_columns = list(column_types)

train_df = train_df[kept_columns].dropna().astype(column_types)
test_df = test_df[kept_columns].dropna().astype(column_types)

# Quick sanity check: split sizes and per-class label counts (sorted by label).
train_class_counts = train_df["Score2"].value_counts().sort_index()
test_class_counts = test_df["Score2"].value_counts().sort_index()

print("Train samples: ", len(train_df))
print("Test samples: ", len(test_df))
print("Train class distribution: ", train_class_counts)
print("Test class distribution: ", test_class_counts)

# Build the classifier: TF-IDF features over unigrams and bigrams (vocabulary
# capped at 20000 terms) feeding a logistic regression with default settings.
tfidf_step = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
lr_step = LogisticRegression()

model = Pipeline(steps=[
    ("tfidf", tfidf_step),
    ("lr", lr_step),
])

# Train the pipeline on the training split and record how long fitting takes.
x_train = train_df["EssayText"]
y_train = train_df["Score2"]

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time and can jump if
# the system clock is adjusted, which would distort the reported duration.
startTime = time.perf_counter()

model.fit(x_train, y_train)

endTime = time.perf_counter()
trainTime = endTime - startTime  # elapsed fitting time in seconds

# Evaluate the fitted pipeline on the held-out test split.
x_test = test_df["EssayText"]
y_test = test_df["Score2"]
y_predict = model.predict(x_test)

# Overall accuracy plus F1 under both macro and support-weighted averaging.
acc = accuracy_score(y_test, y_predict)
macro_f1, weighted_f1 = (
    f1_score(y_test, y_predict, average=averaging)
    for averaging in ("macro", "weighted")
)

# Emit the summary one line per print() call so spacing matches exactly.
summary_lines = [
    "\n\n-----Logistic Regression on original dataset-----\n",
    f"\nTraining Time: {trainTime:.4f}seconds",
    f"Accuracy:  {acc:.4f}",
    f"Macro F1:  {macro_f1:.4f}",
    f"Weighted F1:  {weighted_f1:.4f}",
    "\nClassification Report: \n",
    classification_report(y_test, y_predict, digits=4),
]
for summary_line in summary_lines:
    print(summary_line)

Train samples:  13765
Test samples:  3442
Train class distribution:  Score2
0    5424
1    4518
2    3225
3     598
Name: count, dtype: int64
Test class distribution:  Score2
0    1356
1    1130
2     807
3     149
Name: count, dtype: int64


-----Logistic Regression on original dataset-----


Training Time: 6.7611seconds
Accuracy:  0.6726
Macro F1:  0.6134
Weighted F1:  0.6707

Classification Report: 

              precision    recall  f1-score   support

           0     0.7709    0.7891    0.7799      1356
           1     0.6102    0.6274    0.6187      1130
           2     0.6081    0.5923    0.6001       807
           3     0.5472    0.3893    0.4549       149

    accuracy                         0.6726      3442
   macro avg     0.6341    0.5995    0.6134      3442
weighted avg     0.6703    0.6726    0.6707      3442

