In [None]:
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModel, AdamW, AutoTokenizer
from tqdm import tqdm

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Labelled training data (a `text` column and an integer `label` column);
# the trailing expression renders a preview of the first rows.
df = pd.read_csv(filepath_or_buffer="task1.csv")
df.head(n=5)

Unnamed: 0,text,label
0,"hai, aduku woru seagret irukkintu.",3
1,amar naam hal anant. kono parichampatra prayoj...,0
2,"anjuki, tripul jumplo maji jaatiya champian ma...",4
3,"pildu hackey enpatu oruvit ani vilyaddu, atil ...",3
4,"mam, shunesi ye nirdisht kono jayga naam nathi...",0


In [4]:
# Tokenizer for the same "google/muril-base-cased" checkpoint that the model loads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/muril-base-cased"
)

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text row at a time.

    Args:
        data: DataFrame with a ``'text'`` column and a ``'label'`` column.
            The frame may carry any index (e.g. after filtering or a
            train/test split); rows are addressed by position.
        tokenizer: Object exposing ``encode_plus(text, truncation=...,
            max_length=..., padding=...)`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` (assumed to be a
            Hugging Face-style tokenizer).
        max_len: Length passed to the tokenizer as ``max_length``; used
            together with ``truncation=True`` and ``padding='max_length'``.
    """

    def __init__(self, data, tokenizer, max_len):
        super().__init__()

        self.data = data['text']
        self.target = data['label']
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with ``"ids"``, ``"mask"`` and ``"target"`` entries, each a
            ``torch.long`` tensor.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Positional lookup: plain `series[idx]` resolves `idx` as an index
        # *label*, which breaks (KeyError / wrong row) as soon as the frame's
        # index is not exactly 0..n-1.
        text = self.data.iloc[idx]
        target = self.target.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length'
        )

        return {
            "ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.long)
        }

# Every example is truncated/padded to 512 token ids by the dataset.
train_ds = CustomDataset(data=df, tokenizer=tokenizer, max_len=512)

In [6]:
# Batches of 32 examples, reshuffled on every pass over the data.
train_dl = DataLoader(dataset=train_ds, shuffle=True, batch_size=32)

In [7]:
class Model(torch.nn.Module):
    """Pretrained transformer encoder followed by a linear classification head.

    Args:
        out_dim: Number of output classes (size of the logits vector).
        model_name: Checkpoint name or path handed to
            ``AutoModel.from_pretrained``.
        freeze_bert: When True, gradients are disabled for every encoder
            parameter so only the linear head is trained. Defaults to False.

    Note:
        The earlier version of this class assigned
        ``self.bert.requires_grad = False``. That only creates a plain
        attribute on the module and leaves every parameter trainable, so the
        encoder was in fact fine-tuned. The default ``freeze_bert=False``
        keeps that effective behaviour; pass ``freeze_bert=True`` to really
        freeze the encoder.
    """

    def __init__(self, out_dim, model_name="google/muril-base-cased", freeze_bert=False):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        if freeze_bert:
            # Module.requires_grad_ flips the flag on each parameter in place.
            self.bert.requires_grad_(False)
        # Size the head from the encoder config instead of hard-coding 768.
        self.fc = torch.nn.Linear(self.bert.config.hidden_size, out_dim)

    def forward(self, ids, mask):
        """Return class logits for a batch.

        Args:
            ids: Token-id tensor passed to the encoder as its first argument.
            mask: Attention mask passed as ``attention_mask``.

        Returns:
            Output of the linear head applied to the second element of the
            encoder's tuple output (the pooled representation, assuming a
            BERT-style encoder).
        """
        _, pooled = self.bert(ids, attention_mask=mask, return_dict=False)
        return self.fc(pooled)

# Build the 5-way classifier and place its weights on the selected device.
model = Model(5).to(device)

In [8]:
# Count every scalar weight in the model, whether or not it is trainable.
total_params = sum(map(lambda weight: weight.numel(), model.parameters()))
print(f"{total_params:,} total parameters.")

237,560,069 total parameters.


In [None]:
# Training configuration: number of passes, loss function, optimizer.
epochs = 3

criterion = nn.CrossEntropyLoss()

# NOTE(review): `AdamW` is the one imported from `transformers`; newer
# releases are reported to deprecate it in favour of `torch.optim.AdamW`.
# Confirm against the installed version before upgrading.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [10]:
def get_accuracy(y_pred, y_actual):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        y_pred: Score tensor with classes along dimension 1.
        y_actual: Tensor of integer class indices, one per row of ``y_pred``.

    Returns:
        0-dim float tensor on the same device as the inputs. An empty batch
        yields ``nan``.
    """
    # argmax already yields integer class indices, so no rounding is needed.
    predicted = torch.argmax(y_pred, dim=1)
    return (predicted == y_actual).float().mean()

In [11]:
# Training loop: one optimizer step per batch, with running averages of loss
# and accuracy shown in the progress bar.
for epoch in range(epochs):
    # Make sure dropout etc. are in training mode even if the model was
    # switched to eval mode by an earlier cell run.
    model.train()

    train_running_loss = 0.0
    train_running_acc = 0.0
    tqdm_train_iterator = tqdm(enumerate(train_dl),
                                    desc=f"[train]{epoch+1}/{epochs}",
                                    ascii=True,leave=True,
                                    total=len(train_dl),
                                    colour="green",position=0)
    for batch_idx,data in tqdm_train_iterator:
        ids = data["ids"].to(device)
        mask = data["mask"].to(device)
        target = data["target"].to(device)

        optimizer.zero_grad()
        y_pred = model(ids, mask)
        loss = criterion(y_pred, target)
        loss.backward()
        optimizer.step()

        train_running_loss += loss.item()
        # Convert to a Python float so the running total is not kept as a
        # device tensor.
        train_running_acc += float(get_accuracy(y_pred.detach(),target))

        batches_seen = batch_idx + 1
        tqdm_train_iterator.set_postfix(avg_train_acc=f"{train_running_acc/batches_seen:0.4f}",
                                        avg_train_loss=f"{(train_running_loss/batches_seen):0.4f}")

[train]1/3: 100%|[32m##########[0m| 266/266 [04:12<00:00,  1.05it/s, avg_train_acc=0.7699, avg_train_loss=1.3154]
[train]2/3: 100%|[32m##########[0m| 266/266 [04:11<00:00,  1.06it/s, avg_train_acc=0.9864, avg_train_loss=0.4759]
[train]3/3: 100%|[32m##########[0m| 266/266 [04:11<00:00,  1.06it/s, avg_train_acc=0.9906, avg_train_loss=0.1346]


In [12]:
# Unlabelled texts to classify; the trailing expression previews the first rows.
final_df = pd.read_csv(filepath_or_buffer='classification.csv')
final_df.head(n=5)

Unnamed: 0,text
0,bahya rup hal sei parisheba pradankari maanush...
1,maharashtrer aurangabad jela avasthit ajanta c...
2,gayer raner sang dehrekha mishe gie ghanatber ...
3,yekhane poorvavarti aitihya chhil kath o mati ...
4,"alute masala makhie, fetano basena chubie nie ..."


In [13]:
def get_label(text):
    """Predict the class index for a single text with the notebook's model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Input string to classify.

    Returns:
        The predicted class index as a NumPy integer scalar.
    """
    encoding = tokenizer.encode_plus(
            text,
            truncation=True,
            max_length=512,
            padding='max_length'
        )

    # Wrapping in a list adds the batch dimension (shape (1, seq_len)).
    ids = torch.tensor([encoding["input_ids"]], dtype=torch.long, device=device)
    mask = torch.tensor([encoding["attention_mask"]], dtype=torch.long, device=device)

    # Inference must run in eval mode (dropout off) and without autograd;
    # otherwise predictions are stochastic and a graph is built for every row.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(ids, mask)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return torch.argmax(logits, dim=1).cpu().numpy()[0]

In [14]:
# Classify each text row by row and store the predicted class index.
final_df['label'] = final_df['text'].apply(get_label)
final_df.head(n=5)

Unnamed: 0,text,label
0,bahya rup hal sei parisheba pradankari maanush...,0
1,maharashtrer aurangabad jela avasthit ajanta c...,0
2,gayer raner sang dehrekha mishe gie ghanatber ...,0
3,yekhane poorvavarti aitihya chhil kath o mati ...,0
4,"alute masala makhie, fetano basena chubie nie ...",0


In [16]:
# Persist the predictions without writing the DataFrame index as an extra column.
final_df.to_csv(path_or_buf="task1_output.csv", index=False)