In [11]:
%pip install chardet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset

In [13]:
# Load dataset with detected encoding
df = pd.read_csv('all-data-engineering-quiz-utf.csv')

In [14]:
# Combine relevant text fields for input, joining reasoning fields if they are arrays
df['input_text'] = (
    df['Question'] + " " +
    df['Student Answer'] + " " +
    df['Ideal Answer'] + 
    " Grammar Reasoning: " + df['Grammar Reasoning'].apply(lambda x: ' | '.join(eval(x)) if isinstance(x, str) and x.startswith('[') else str(x)) + 
    " Structure Reasoning: " + df['Structure Reasoning'].apply(lambda x: ' | '.join(eval(x)) if isinstance(x, str) and x.startswith('[') else str(x))
)

# Define target columns
df['content_relevancy_score'] = df['Content Relevancy Score'].astype(float)
df['grammar_score'] = df['Grammar Score'].astype(float)
df['structure_score'] = df['Structure Score'].astype(float)

# Scale scores to range from 0 to 1 (optional for normalized regression)
df['content_relevancy_score'] /= 3  # Normalize content relevancy score to 0-1
df['grammar_score'] /= 5            # Normalize grammar score to 0-1
df['structure_score'] /= 5          # Normalize structure score to 0-1

# Train-test split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)



In [15]:
from transformers import BertConfig

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased', hidden_dropout_prob=0.3)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Custom Dataset Class
class EssayDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['input_text']
        content_score = self.dataframe.iloc[idx]['content_relevancy_score']
        grammar_score = self.dataframe.iloc[idx]['grammar_score']
        
        # Tokenize input text
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Combine scores into a single tensor for regression
        labels = torch.tensor([content_score, grammar_score], dtype=torch.float)
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels
        }

# Create datasets
max_length = 512
train_dataset = EssayDataset(train_df, tokenizer, max_length)
val_dataset = EssayDataset(val_df, tokenizer, max_length)


In [17]:
# Define custom metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    mse = mean_squared_error(labels, predictions, squared=False)
    return {'rmse': mse}

from transformers import AdamW, Trainer, TrainingArguments

# Adjusted Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,              # Lower learning rate
    per_device_train_batch_size=8,   # Reduce batch size if needed for memory
    per_device_eval_batch_size=8,
    num_train_epochs=50,             # Increase epochs
    weight_decay=0.02,               # Increased weight decay
    load_best_model_at_end=True,
    metric_for_best_model="rmse",
    greater_is_better=False,
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=2    # Simulate larger batch size
)

# Initialize Trainer with custom optimizer if necessary
optimizer = AdamW(model.parameters(), lr=5e-6, weight_decay=0.02)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)     # Custom optimizer if needed
)

# Fine-tune the model
trainer.train()


  2%|▏         | 2/100 [01:21<55:36, 34.05s/it]

{'eval_loss': 0.7782740592956543, 'eval_rmse': 1.0445607900619507, 'eval_runtime': 5.2576, 'eval_samples_per_second': 1.902, 'eval_steps_per_second': 0.38, 'epoch': 0.8}



  5%|▌         | 5/100 [02:46<48:42, 30.76s/it]

{'eval_loss': 0.7293118238449097, 'eval_rmse': 0.896094799041748, 'eval_runtime': 5.7719, 'eval_samples_per_second': 1.733, 'eval_steps_per_second': 0.347, 'epoch': 2.0}



  7%|▋         | 7/100 [04:07<52:28, 33.86s/it]

{'eval_loss': 0.7028959393501282, 'eval_rmse': 0.8080422282218933, 'eval_runtime': 5.4589, 'eval_samples_per_second': 1.832, 'eval_steps_per_second': 0.366, 'epoch': 2.8}


 10%|█         | 10/100 [05:24<46:00, 30.67s/it]

{'loss': 1.3912, 'grad_norm': 5.091360569000244, 'learning_rate': 4.5e-06, 'epoch': 4.0}



 10%|█         | 10/100 [05:30<46:00, 30.67s/it]

{'eval_loss': 0.6714636087417603, 'eval_rmse': 0.6924247145652771, 'eval_runtime': 5.7646, 'eval_samples_per_second': 1.735, 'eval_steps_per_second': 0.347, 'epoch': 4.0}



 12%|█▏        | 12/100 [06:53<49:24, 33.69s/it]

{'eval_loss': 0.654833197593689, 'eval_rmse': 0.6280641555786133, 'eval_runtime': 6.1381, 'eval_samples_per_second': 1.629, 'eval_steps_per_second': 0.326, 'epoch': 4.8}



 15%|█▌        | 15/100 [08:46<52:48, 37.28s/it]

{'eval_loss': 0.6286916136741638, 'eval_rmse': 0.5241686701774597, 'eval_runtime': 12.7961, 'eval_samples_per_second': 0.781, 'eval_steps_per_second': 0.156, 'epoch': 6.0}



 17%|█▋        | 17/100 [10:51<1:05:43, 47.52s/it]

{'eval_loss': 0.6152233481407166, 'eval_rmse': 0.46928638219833374, 'eval_runtime': 6.6901, 'eval_samples_per_second': 1.495, 'eval_steps_per_second': 0.299, 'epoch': 6.8}


 20%|██        | 20/100 [12:25<53:02, 39.78s/it]  

{'loss': 1.2758, 'grad_norm': 4.329855918884277, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}



 20%|██        | 20/100 [12:32<53:02, 39.78s/it]

{'eval_loss': 0.6072980165481567, 'eval_rmse': 0.4408215880393982, 'eval_runtime': 6.816, 'eval_samples_per_second': 1.467, 'eval_steps_per_second': 0.293, 'epoch': 8.0}



 22%|██▏       | 22/100 [14:17<56:03, 43.12s/it]

{'eval_loss': 0.6059160232543945, 'eval_rmse': 0.4378731846809387, 'eval_runtime': 7.2815, 'eval_samples_per_second': 1.373, 'eval_steps_per_second': 0.275, 'epoch': 8.8}



 25%|██▌       | 25/100 [16:02<48:48, 39.04s/it]

{'eval_loss': 0.6032838821411133, 'eval_rmse': 0.42936092615127563, 'eval_runtime': 6.9952, 'eval_samples_per_second': 1.43, 'eval_steps_per_second': 0.286, 'epoch': 10.0}



 27%|██▋       | 27/100 [17:40<50:36, 41.60s/it]

{'eval_loss': 0.5988081097602844, 'eval_rmse': 0.40932101011276245, 'eval_runtime': 6.4148, 'eval_samples_per_second': 1.559, 'eval_steps_per_second': 0.312, 'epoch': 10.8}


 30%|███       | 30/100 [19:22<46:01, 39.46s/it]

{'loss': 1.1973, 'grad_norm': 2.948061466217041, 'learning_rate': 3.5e-06, 'epoch': 12.0}



 30%|███       | 30/100 [19:29<46:01, 39.46s/it]

{'eval_loss': 0.5895353555679321, 'eval_rmse': 0.36469489336013794, 'eval_runtime': 7.6403, 'eval_samples_per_second': 1.309, 'eval_steps_per_second': 0.262, 'epoch': 12.0}



 32%|███▏      | 32/100 [21:25<50:52, 44.89s/it]

{'eval_loss': 0.5840943455696106, 'eval_rmse': 0.3387032747268677, 'eval_runtime': 8.708, 'eval_samples_per_second': 1.148, 'eval_steps_per_second': 0.23, 'epoch': 12.8}



 35%|███▌      | 35/100 [23:26<47:47, 44.11s/it]

{'eval_loss': 0.5745851397514343, 'eval_rmse': 0.2942020297050476, 'eval_runtime': 7.4701, 'eval_samples_per_second': 1.339, 'eval_steps_per_second': 0.268, 'epoch': 14.0}



 37%|███▋      | 37/100 [25:08<47:24, 45.15s/it]

{'eval_loss': 0.5675152540206909, 'eval_rmse': 0.2673319876194, 'eval_runtime': 5.6649, 'eval_samples_per_second': 1.765, 'eval_steps_per_second': 0.353, 'epoch': 14.8}


 40%|████      | 40/100 [26:41<38:28, 38.48s/it]

{'loss': 1.2004, 'grad_norm': 3.042447090148926, 'learning_rate': 3e-06, 'epoch': 16.0}



 40%|████      | 40/100 [26:47<38:28, 38.48s/it]

{'eval_loss': 0.5578422546386719, 'eval_rmse': 0.25804734230041504, 'eval_runtime': 6.2116, 'eval_samples_per_second': 1.61, 'eval_steps_per_second': 0.322, 'epoch': 16.0}



 42%|████▏     | 42/100 [28:34<41:18, 42.73s/it]

{'eval_loss': 0.5547385811805725, 'eval_rmse': 0.26784685254096985, 'eval_runtime': 6.956, 'eval_samples_per_second': 1.438, 'eval_steps_per_second': 0.288, 'epoch': 16.8}



 45%|████▌     | 45/100 [30:05<32:45, 35.74s/it]

{'eval_loss': 0.5530937314033508, 'eval_rmse': 0.2913801372051239, 'eval_runtime': 5.8496, 'eval_samples_per_second': 1.71, 'eval_steps_per_second': 0.342, 'epoch': 18.0}



 47%|████▋     | 47/100 [31:29<32:22, 36.66s/it]

{'eval_loss': 0.5534512400627136, 'eval_rmse': 0.30321669578552246, 'eval_runtime': 5.6187, 'eval_samples_per_second': 1.78, 'eval_steps_per_second': 0.356, 'epoch': 18.8}


 50%|█████     | 50/100 [33:12<32:00, 38.42s/it]

{'loss': 1.18, 'grad_norm': 2.661003351211548, 'learning_rate': 2.5e-06, 'epoch': 20.0}



 50%|█████     | 50/100 [33:20<32:00, 38.42s/it]

{'eval_loss': 0.5552095174789429, 'eval_rmse': 0.31340864300727844, 'eval_runtime': 8.1155, 'eval_samples_per_second': 1.232, 'eval_steps_per_second': 0.246, 'epoch': 20.0}



 52%|█████▏    | 52/100 [35:26<37:42, 47.14s/it]

{'eval_loss': 0.5561641454696655, 'eval_rmse': 0.3193681836128235, 'eval_runtime': 8.0552, 'eval_samples_per_second': 1.241, 'eval_steps_per_second': 0.248, 'epoch': 20.8}



 55%|█████▌    | 55/100 [37:28<33:30, 44.69s/it]

{'eval_loss': 0.5582365989685059, 'eval_rmse': 0.32197147607803345, 'eval_runtime': 8.0098, 'eval_samples_per_second': 1.248, 'eval_steps_per_second': 0.25, 'epoch': 22.0}



 57%|█████▋    | 57/100 [39:36<36:02, 50.28s/it]

{'eval_loss': 0.5590425133705139, 'eval_rmse': 0.32564079761505127, 'eval_runtime': 8.5038, 'eval_samples_per_second': 1.176, 'eval_steps_per_second': 0.235, 'epoch': 22.8}


 60%|██████    | 60/100 [41:31<30:53, 46.34s/it]

{'loss': 1.1721, 'grad_norm': 3.5921783447265625, 'learning_rate': 2.0000000000000003e-06, 'epoch': 24.0}



 60%|██████    | 60/100 [41:39<30:53, 46.34s/it]

{'eval_loss': 0.5576098561286926, 'eval_rmse': 0.33413106203079224, 'eval_runtime': 7.9169, 'eval_samples_per_second': 1.263, 'eval_steps_per_second': 0.253, 'epoch': 24.0}



 62%|██████▏   | 62/100 [43:40<31:46, 50.16s/it]

{'eval_loss': 0.5556241273880005, 'eval_rmse': 0.3422885835170746, 'eval_runtime': 8.3014, 'eval_samples_per_second': 1.205, 'eval_steps_per_second': 0.241, 'epoch': 24.8}



 65%|██████▌   | 65/100 [45:43<26:39, 45.69s/it]

{'eval_loss': 0.552049458026886, 'eval_rmse': 0.35995781421661377, 'eval_runtime': 8.3077, 'eval_samples_per_second': 1.204, 'eval_steps_per_second': 0.241, 'epoch': 26.0}



 67%|██████▋   | 67/100 [47:44<27:30, 50.00s/it]

{'eval_loss': 0.5504186749458313, 'eval_rmse': 0.3675069212913513, 'eval_runtime': 7.9966, 'eval_samples_per_second': 1.251, 'eval_steps_per_second': 0.25, 'epoch': 26.8}


 70%|███████   | 70/100 [49:38<22:43, 45.46s/it]

{'loss': 1.1503, 'grad_norm': 2.6539595127105713, 'learning_rate': 1.5e-06, 'epoch': 28.0}



 70%|███████   | 70/100 [49:46<22:43, 45.46s/it]

{'eval_loss': 0.5485647320747375, 'eval_rmse': 0.3714125156402588, 'eval_runtime': 8.0654, 'eval_samples_per_second': 1.24, 'eval_steps_per_second': 0.248, 'epoch': 28.0}



 72%|███████▏  | 72/100 [51:47<23:08, 49.58s/it]

{'eval_loss': 0.5476928353309631, 'eval_rmse': 0.3738178610801697, 'eval_runtime': 7.9169, 'eval_samples_per_second': 1.263, 'eval_steps_per_second': 0.253, 'epoch': 28.8}



 75%|███████▌  | 75/100 [53:52<19:06, 45.86s/it]

{'eval_loss': 0.5467650890350342, 'eval_rmse': 0.3703385591506958, 'eval_runtime': 9.5282, 'eval_samples_per_second': 1.05, 'eval_steps_per_second': 0.21, 'epoch': 30.0}



 77%|███████▋  | 77/100 [56:05<20:08, 52.53s/it]

{'eval_loss': 0.5467864274978638, 'eval_rmse': 0.36528655886650085, 'eval_runtime': 8.436, 'eval_samples_per_second': 1.185, 'eval_steps_per_second': 0.237, 'epoch': 30.8}


 80%|████████  | 80/100 [58:01<15:28, 46.45s/it]

{'loss': 1.1362, 'grad_norm': 4.264032363891602, 'learning_rate': 1.0000000000000002e-06, 'epoch': 32.0}



 80%|████████  | 80/100 [58:08<15:28, 46.45s/it]

{'eval_loss': 0.5471011400222778, 'eval_rmse': 0.3577156662940979, 'eval_runtime': 7.3525, 'eval_samples_per_second': 1.36, 'eval_steps_per_second': 0.272, 'epoch': 32.0}



 82%|████████▏ | 82/100 [59:53<14:01, 46.75s/it]

{'eval_loss': 0.5474886298179626, 'eval_rmse': 0.3544919490814209, 'eval_runtime': 7.0909, 'eval_samples_per_second': 1.41, 'eval_steps_per_second': 0.282, 'epoch': 32.8}



 85%|████████▌ | 85/100 [1:01:49<10:35, 42.37s/it]

{'eval_loss': 0.5481659173965454, 'eval_rmse': 0.3505815863609314, 'eval_runtime': 11.0684, 'eval_samples_per_second': 0.903, 'eval_steps_per_second': 0.181, 'epoch': 34.0}



 87%|████████▋ | 87/100 [1:03:54<10:51, 50.15s/it]

{'eval_loss': 0.5488213300704956, 'eval_rmse': 0.34832507371902466, 'eval_runtime': 8.3119, 'eval_samples_per_second': 1.203, 'eval_steps_per_second': 0.241, 'epoch': 34.8}


 90%|█████████ | 90/100 [1:06:01<08:14, 49.47s/it]

{'loss': 1.1446, 'grad_norm': 2.742483615875244, 'learning_rate': 5.000000000000001e-07, 'epoch': 36.0}



 90%|█████████ | 90/100 [1:06:09<08:14, 49.47s/it]

{'eval_loss': 0.5495004653930664, 'eval_rmse': 0.3465659022331238, 'eval_runtime': 8.0924, 'eval_samples_per_second': 1.236, 'eval_steps_per_second': 0.247, 'epoch': 36.0}



 92%|█████████▏| 92/100 [1:07:58<06:28, 48.57s/it]

{'eval_loss': 0.5498220920562744, 'eval_rmse': 0.34593766927719116, 'eval_runtime': 7.9367, 'eval_samples_per_second': 1.26, 'eval_steps_per_second': 0.252, 'epoch': 36.8}



 95%|█████████▌| 95/100 [1:10:05<03:50, 46.14s/it]

{'eval_loss': 0.5502162575721741, 'eval_rmse': 0.3449614346027374, 'eval_runtime': 8.1641, 'eval_samples_per_second': 1.225, 'eval_steps_per_second': 0.245, 'epoch': 38.0}



 97%|█████████▋| 97/100 [1:12:10<02:32, 50.96s/it]

{'eval_loss': 0.5502774119377136, 'eval_rmse': 0.345205694437027, 'eval_runtime': 8.0467, 'eval_samples_per_second': 1.243, 'eval_steps_per_second': 0.249, 'epoch': 38.8}


100%|██████████| 100/100 [1:14:10<00:00, 47.24s/it]

{'loss': 1.1482, 'grad_norm': 2.0123512744903564, 'learning_rate': 0.0, 'epoch': 40.0}



100%|██████████| 100/100 [1:14:22<00:00, 47.24s/it]

{'eval_loss': 0.5502617359161377, 'eval_rmse': 0.34550225734710693, 'eval_runtime': 8.9528, 'eval_samples_per_second': 1.117, 'eval_steps_per_second': 0.223, 'epoch': 40.0}


100%|██████████| 100/100 [1:14:25<00:00, 44.65s/it]

{'train_runtime': 4465.3285, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.022, 'train_loss': 1.1996148586273194, 'epoch': 40.0}





TrainOutput(global_step=100, training_loss=1.1996148586273194, metrics={'train_runtime': 4465.3285, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.022, 'total_flos': 378879919718400.0, 'train_loss': 1.1996148586273194, 'epoch': 40.0})

In [20]:
# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

100%|██████████| 2/2 [00:01<00:00,  1.16it/s]

{'eval_loss': 0.5578422546386719, 'eval_rmse': 0.25804734230041504, 'eval_runtime': 8.6532, 'eval_samples_per_second': 1.156, 'eval_steps_per_second': 0.231, 'epoch': 40.0}





In [19]:
# Save the fine-tuned model
trainer.save_model('./essay_scoring_model')
tokenizer.save_pretrained('./essay_scoring_model')

('./essay_scoring_model\\tokenizer_config.json',
 './essay_scoring_model\\special_tokens_map.json',
 './essay_scoring_model\\vocab.txt',
 './essay_scoring_model\\added_tokens.json')