In [1]:
# Install required packages
!pip install transformers torch pandas gradio


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import LongformerForSequenceClassification, LongformerTokenizer, Trainer, TrainingArguments
import gradio as gr



In [3]:
# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:

# Read the dataset from CSV
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/data/Ultimate grading 1.csv')

# Remove rows with all empty cells
df.dropna(how='all', axis=0, inplace=True)

# Remove columns with all empty cells 
df.dropna(how='all', axis=1, inplace=True)

# Save the cleaned CSV 
df.to_csv('output.csv', index=False)



In [5]:
df

Unnamed: 0,Name,Text,Label
0,Essay_1,"""Stay hungry, stay foolish."" by Steve Jobs was...",2.0
1,Essay_2,Personal Statement\n\nAs I write this from my ...,1.0
2,Essay_3,"Personal Statement\n\nBack in February 2020, I...",2.0
3,Essay_4,My name is Chanyada Ruangdej. I was born and r...,1.0
4,Essay_5,Globalization and advanced technology have pla...,2.0
...,...,...,...
100,Essay_101,I feel fortunate to be applying to the Interna...,1.0
101,Essay_102,The pursuit of knowledge and excellence in the...,0.0
102,Essay_103,The study of human resources and their impact ...,0.0
103,Essay_104,As a student with a passion for the biological...,0.0


In [6]:
# Define the dataset class
class PersonalStatementDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [7]:

# Load the tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')



In [8]:
from sklearn.model_selection import train_test_split


In [9]:

# Split the dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)



In [10]:
# Create train and validation datasets
train_dataset = PersonalStatementDataset(
    texts=train_df['Text'].to_numpy(),
    labels=train_df['Label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=1024
)

val_dataset = PersonalStatementDataset(
    texts=val_df['Text'].to_numpy(),
    labels=val_df['Label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=1024
)

# Load the pre-trained model
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=3)



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weig

In [11]:
pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
pip install accelerate -U`

/bin/bash: -c: line 0: unexpected EOF while looking for matching ``'
/bin/bash: -c: line 1: syntax error: unexpected end of file


In [13]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)



In [14]:
# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



In [15]:
# Train the model
trainer.train()





Epoch,Training Loss,Validation Loss
1,1.0863,1.078473
2,1.0591,1.054624
3,0.9541,1.020588
4,0.945,0.960777
5,0.6651,0.789785
6,0.5895,0.730402


TrainOutput(global_step=126, training_loss=0.9006345972182259, metrics={'train_runtime': 66.7483, 'train_samples_per_second': 7.551, 'train_steps_per_second': 1.888, 'total_flos': 331055593537536.0, 'train_loss': 0.9006345972182259, 'epoch': 6.0})

In [16]:
# Define the prediction function
def predict_personal_statement(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=2048,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities).item()

    return {0: "Not Qualified", 1: "Under Consideration", 2: "Qualified"}[predicted_class]


In [17]:
# Select the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device
model.to(device)

# Define the text variable with your input statement
text = """"""
# Move the input tensors to the selected device
encoding = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=2048,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Perform the forward pass
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities).item()

# Define the class labels
class_labels = {0: "Not Qualified", 1: "Under Consideration", 2: "Qualified"}

# Get the predicted label
predicted_label = class_labels[predicted_class]

# Print the predicted label
print("Predicted Label:", predicted_label)



Predicted Label: Not Qualified


In [22]:
# Create the Gradio interface
iface = gr.Interface(fn=predict_personal_statement, inputs="text", outputs="label")
iface.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [19]:
# Define the output directory
output_dir = "/content/gdrive/MyDrive/Colab Notebooks/model trained"

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('/content/gdrive/MyDrive/Colab Notebooks/model trained/tokenizer_config.json',
 '/content/gdrive/MyDrive/Colab Notebooks/model trained/special_tokens_map.json',
 '/content/gdrive/MyDrive/Colab Notebooks/model trained/vocab.json',
 '/content/gdrive/MyDrive/Colab Notebooks/model trained/merges.txt',
 '/content/gdrive/MyDrive/Colab Notebooks/model trained/added_tokens.json')