<a href="https://colab.research.google.com/github/AmriteshDOT/nlp_/blob/main/src/trainregressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import os
from IPython.display import display
# Turn off pandas' display truncation: None removes the row and column limits.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


In [None]:
# Read the training CSV and keep only its first 500 rows.
essay = pd.read_csv("/data/train.csv").head(500)

In [None]:
# Build the working frame from `essay`, with the 'score' column divided by 5.
df = essay.assign(score=essay['score'] / 5)

In [None]:
sp_token = "[BR]"
def normalize_text(s):
    """Return `s` as a string with every line break replaced by the marker token.

    Missing values (anything `pd.isna` reports as NA) become the empty string.
    Windows ("\\r\\n") and bare "\\r" line endings are first unified to "\\n";
    each "\\n" is then replaced by the marker surrounded by single spaces.
    """
    if pd.isna(s):
        return ""
    unified = str(s).replace("\r\n", "\n").replace("\r", "\n")
    # Joining the newline-separated pieces with the padded marker is the same
    # as replacing each "\n" with it.
    return f" {sp_token} ".join(unified.split("\n"))

# Run every essay through normalize_text and store the result back in place.
df['full_text'] = df['full_text'].map(normalize_text)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train_texts, val_texts, train_scores, val_scores = train_test_split(
    df['full_text'],
    df['score'],
    random_state=42,
    test_size=0.1,
)

In [None]:
# Checkpoint name used for both the tokenizer and the encoder backbone.
modeltype = "distilbert/distilbert-base-uncased"
# !pip install --upgrade pip
# !pip install --upgrade transformers huggingface_hub PyYAML
import transformers
import huggingface_hub
import yaml

# Print the installed library versions, one per line.
print(
    transformers.__version__,
    huggingface_hub.__version__,
    yaml.__version__,
    sep="\n",
)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the chosen checkpoint (fast implementation requested),
# then register the line-break marker as an additional special token unless it
# is already part of the tokenizer's added vocabulary.
tokenizer = AutoTokenizer.from_pretrained(modeltype, use_fast=True)
if sp_token not in tokenizer.get_added_vocab():
    tokenizer.add_special_tokens(dict(additional_special_tokens=[sp_token]))

In [None]:
def tokenize(texts):
    """Tokenize a whole collection of texts in one call to the module-level tokenizer.

    `texts` must expose `.tolist()` (e.g. a pandas Series). Every sequence is
    truncated and padded to exactly 256 tokens and the result is returned as
    PyTorch tensors.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **encode_options)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
class essayds(Dataset):
    """Dataset that pairs each essay text with its score and tokenizes on access.

    `texts` is indexed positionally with `.iloc` and `scores` is read through
    `.values`, so both are expected to be pandas Series (or compatible).
    Tokenization uses the module-level `tokenizer`.
    """

    def __init__(self, texts, scores):
        self.texts = texts
        # Keep the targets as a plain float32 array for cheap positional lookup.
        self.scores = np.array(scores.values, dtype=np.float32)

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized essay at position `idx` plus its score tensor."""
        encoded = tokenizer(
            self.texts.iloc[idx],
            padding='max_length',
            truncation=True,
            max_length=256,
            return_tensors='pt',
        )
        sample = {}
        for name, tensor in encoded.items():
            # Drop the leading size-1 dimension added by return_tensors='pt'.
            sample[name] = tensor.squeeze(0)
        sample['score'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return sample

In [None]:
# Wrap both splits as datasets.
train_dataset, val_dataset = (
    essayds(train_texts, train_scores),
    essayds(val_texts, val_scores),
)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)

In [None]:
from transformers import AutoModel

class regressor(nn.Module):
    """Encoder backbone followed by a single linear unit that predicts one score.

    Args:
        model_name: checkpoint passed to `AutoModel.from_pretrained` when
            `mlm_path` is not given.
        mlm_path: optional alternative checkpoint; when truthy the backbone is
            loaded from here instead of `model_name`.
        tokenizer: optional tokenizer whose length determines the size of the
            token-embedding matrix. When omitted, the module-level `tokenizer`
            is used, which keeps existing `regressor(name)` calls working.

    Raises:
        NameError: if no tokenizer is passed and no module-level `tokenizer`
            exists.
    """

    def __init__(self, model_name, mlm_path=None, tokenizer=None):
        super().__init__()
        if tokenizer is None:
            # Fall back to the notebook-level tokenizer. The parameter shadows
            # the global name, so it has to be looked up explicitly.
            try:
                tokenizer = globals()["tokenizer"]
            except KeyError:
                raise NameError(
                    "regressor needs a tokenizer: pass tokenizer=... or define "
                    "a module-level `tokenizer` first"
                ) from None

        self.model = AutoModel.from_pretrained(mlm_path or model_name)
        # Grow/shrink the embedding matrix to match the tokenizer length
        # (the tokenizer may carry added special tokens).
        self.model.resize_token_embeddings(len(tokenizer))

        self.fc = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one predicted score per input sequence.

        Assumes `attention_mask` is 1 for real tokens and 0 for padding (the
        Hugging Face tokenizer convention). Padded positions are excluded from
        the mean so the pooled vector does not depend on how much padding a
        sequence received.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state  # (batch, length, hidden)
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / token_counts  # masked mean pooling
        return self.fc(pooled).squeeze(-1)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU, then
# build the regressor and move its parameters to that device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = regressor(modeltype)
model.to(device)

In [None]:
#train
from sklearn.metrics import cohen_kappa_score
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()

# Bin edges applied to the x5 rescaled values, and the full set of grade
# labels those bins map to. Shared by predictions and targets.
grade_edges = [-np.inf, 0.833, 1.667, 2.5, 3.333, 4.167, np.inf]
grade_labels = [0, 1, 2, 3, 4, 5]

for epoch in range(2):
    #train
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)
        preds = model(input_ids, attention_mask)
        loss = criterion(preds, scores)
        loss.backward()
        optimizer.step()

    #val
    model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            scores = batch['score'].to(device)
            preds = model(input_ids, attention_mask)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(scores.cpu().numpy())
    # Undo the /5 scaling and bucket both predictions and targets into grades.
    val_preds_cat = pd.cut(np.array(val_preds)*5, grade_edges, labels=grade_labels)
    val_true_cat = pd.cut(np.array(val_true)*5, grade_edges, labels=grade_labels)
    # Pass the full label set explicitly: without it the quadratic weights are
    # built only over grades that happen to appear in this validation set, so
    # a missing grade would shrink the penalty between its neighbours.
    kappa = cohen_kappa_score(
        val_true_cat, val_preds_cat, labels=grade_labels, weights='quadratic'
    )
    print(f"Epoch {epoch+1} - Validation QWK: {kappa:.4f}")

#save
torch.save(model.state_dict(), 'essay_regressor.pth')
tokenizer.save_pretrained('./tokenizer')

In [None]:
#Inference

In [None]:
# Reload the tokenizer from the directory it was saved to ('./tokenizer',
# relative to the working directory - not the filesystem root).
tokenizerr = AutoTokenizer.from_pretrained("./tokenizer")
# Rebuild the architecture, then make sure the embedding matrix matches the
# reloaded tokenizer before the saved weights are loaded into it.
model = regressor(modeltype, mlm_path=None)
model.model.resize_token_embeddings(len(tokenizerr))
# Load on CPU first so the checkpoint can be restored on machines without a GPU.
state = torch.load("essay_regressor.pth", map_location="cpu")
model.load_state_dict(state)
model.to(device)
model.eval()

In [None]:
def val2grade(arr):
    """Convert model outputs into integer grades from 1 to 6.

    The input is flattened, multiplied by 5, and bucketed into six bins whose
    interior boundaries sit at multiples of 5/6. A value exactly on a boundary
    falls into the higher bin. Returns a plain Python list of ints.
    """
    cut_points = (-np.inf, 0.83333333, 1.66666667, 2.5, 3.33333333, 4.16666667, np.inf)
    rescaled = np.array(arr).ravel() * 5.0
    # digitize is 1-based against the leading -inf edge; shift to 0-based and
    # clamp so nothing escapes the 0..5 range.
    bucket = np.clip(np.digitize(rescaled, cut_points) - 1, 0, 5)
    return [index + 1 for index in bucket.tolist()]

In [None]:
def preprocess(texts):
    """Tokenize one text or a list of texts with the reloaded tokenizer.

    Anything that is not a `list` is wrapped in a one-element list first.
    Sequences are truncated and padded to exactly 256 tokens and returned as
    PyTorch tensors.
    """
    batch = texts if isinstance(texts, list) else [texts]
    return tokenizerr(
        batch,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

In [None]:
def predict(texts):
    """Score the given text(s) with the loaded model.

    Returns a pair `(raw, grades)`: the raw model outputs as a list, and the
    corresponding grades produced by `val2grade`.
    """
    encoded = preprocess(texts)
    on_device = {}
    for key, tensor in encoded.items():
        on_device[key] = tensor.to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        raw = model(on_device["input_ids"], on_device["attention_mask"])
        out = raw.cpu().numpy()
    return out.tolist(), val2grade(out)

In [None]:
# Run the inference path once on a single short sample paragraph.
texts = [
    "Trees are life's silent guardians, purifying the air and water we need to survive. They offer shade, food, and building materials, and their roots prevent soil erosion. Observing them can be a peaceful, relaxing experience, but their role extends to climate regulation and supporting countless other species. Their importance makes it our duty to protect and plant more trees for a healthier planet.",
]
predict(texts)