In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

In [None]:

# Directories holding the train and test text files that load_data() reads.
# NOTE(review): the paths assume Google Drive is mounted at /content/drive (Colab) — confirm before running elsewhere.
dir_train= "/content/drive/MyDrive/FOS/Train"
dir_test = "/content/drive/MyDrive/FOS/Test"

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:

import os
import re
import json
import logging
import pandas as pd
from tqdm import tqdm

def text_preprocessing(text):
  """Normalise one raw line of text into lowercase, letters-only words.

  Steps, in order: lowercase, remove URLs, replace every character that is
  not an ASCII letter with a space, collapse runs of spaces, trim both ends.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned string. It is empty when the input contains no ASCII
    letters outside of URLs.
  """
  text = text.lower()
  # Raw strings: the original non-raw URL pattern contained the invalid escape
  # sequences "\(" and "\)", which raise a DeprecationWarning (a SyntaxWarning
  # from Python 3.12 on). The pattern itself is unchanged.
  text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
  text = re.sub(r'[^a-zA-Z]', ' ', text)
  text = re.sub(r' +', ' ', text)
  return text.strip()

def load_data(dir_path):
    """Read every file in ``dir_path`` and return one record per non-empty line.

    Each line is cleaned with ``text_preprocessing``. The label of a record is
    the name of the file it came from, with "class_" and ".txt" removed.

    Args:
        dir_path: Directory whose entries are all read as text files.

    Returns:
        A list of ``{'text': str, 'label': str}`` dicts, ordered by file name
        and then by line position within the file.
    """
    data = []
    # Sorted so the record order is reproducible; os.listdir order is arbitrary.
    for file_name in tqdm(sorted(os.listdir(dir_path))):
        label = file_name.replace("class_", "").replace(".txt", "")
        # Explicit encoding so decoding does not depend on the platform default.
        with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as f:
            for line in f:
                text = text_preprocessing(line)
                # Lines that clean down to nothing (blank, digits only, URL only)
                # would otherwise become labelled samples with empty text.
                if not text:
                    continue
                data.append({
                    'text': text,
                    'label': label
                })
    return data


def create_dataframe(data):
  """Build a DataFrame from record dicts, with 'text' and 'label' cast to str.

  Args:
    data: Records accepted by ``pd.DataFrame``; they must provide the
      'text' and 'label' columns.

  Returns:
    The resulting DataFrame with both columns converted to strings.
  """
  string_columns = {'text': str, 'label': str}
  return pd.DataFrame(data).astype(string_columns)


# Load each split from disk and convert it straight into a DataFrame.
train_data = create_dataframe(load_data(dir_train))
test_data = create_dataframe(load_data(dir_test))


100%|██████████| 8/8 [00:02<00:00,  3.48it/s]
100%|██████████| 8/8 [00:02<00:00,  3.17it/s]


In [None]:

# Labels come from the file names and were stored as strings; cast them to
# integers in both splits.
for split_df in (train_data, test_data):
    split_df['label'] = split_df['label'].astype(int)


In [None]:
class BertDataset(Dataset):
    """Torch dataset that tokenizes text rows and pairs them with integer labels.

    Each item is a dict holding the ``ids``, ``mask``, ``token_type_ids`` and
    ``target`` tensors (all ``torch.long``).

    Args:
        tokenizer: Callable tokenizer whose result provides the keys
            "input_ids", "token_type_ids" and "attention_mask".
        max_length: Length every sequence is padded or truncated to.
        df: DataFrame with the text in column 0 and the integer label in
            column 1. Defaults to the notebook-level ``train_data`` so the
            original two-argument call keeps working.
    """

    def __init__(self, tokenizer, max_length, df=None):
        super(BertDataset, self).__init__()
        # Previously the dataset was hard-wired to the global ``train_data``.
        self.train_csv = train_data if df is None else df
        self.tokenizer = tokenizer
        self.target = self.train_csv.iloc[:, 1]
        self.max_length = max_length

    def __len__(self):
        return len(self.train_csv)

    def __getitem__(self, index):
        text1 = self.train_csv.iloc[index, 0]

        # padding='max_length' replaces the deprecated pad_to_max_length=True, and
        # truncation=True makes explicit the truncation that was only applied
        # implicitly (with a warning) because max_length was given.
        inputs = self.tokenizer(
            text1,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.tensor(inputs["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'target': torch.tensor(int(self.train_csv.iloc[index, 1]), dtype=torch.long)
            }
# Tokenizer loaded from the same "bert-base-uncased" checkpoint as the encoder.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

dataset = BertDataset(tokenizer, max_length=100)

# shuffle=True: load_data() appends rows file by file and each file carries a
# single label, so without shuffling the batches are almost entirely
# single-class and always arrive in the same class order.
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class BERT(nn.Module):
    """Pretrained BERT encoder followed by a single linear classification layer.

    Args:
        num_classes: Number of output logits. Defaults to 8, the value that
            was previously hard-coded.
        pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to "bert-base-uncased", the previously hard-coded value.
    """

    def __init__(self, num_classes=8, pretrained_name="bert-base-uncased"):
        super(BERT, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained(pretrained_name)
        # Take the encoder width from the checkpoint config instead of
        # hard-coding 768, so other checkpoints work too.
        self.out = nn.Linear(self.bert_model.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return the classification-layer output for a batch of token ids."""
        # return_dict=False makes the encoder return a tuple; only its second
        # element is fed to the linear layer.
        _, pooled = self.bert_model(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(pooled)

model = BERT()

# Freeze the pretrained encoder before building the optimizer, so that only
# the linear layer on top of it is trained.
for param in model.bert_model.parameters():
    param.requires_grad = False

loss_fn = nn.CrossEntropyLoss()

# Initialize the optimizer with the trainable parameters only; the frozen
# encoder weights never receive gradients, so they should not be registered.
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=0.0001,
)
from tqdm import tqdm
import torch

def finetune(epochs, dataloader, model, loss_fn, optimizer, device):
    """Train ``model`` on ``dataloader`` for ``epochs`` passes and return it.

    Args:
        epochs: Number of full passes over ``dataloader``.
        dataloader: Sized iterable of batches; each batch is a dict with the
            tensors 'ids', 'mask', 'token_type_ids' and 'target'.
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        loss_fn: Callable ``loss_fn(output, label)`` returning a scalar loss.
        optimizer: Optimizer that updates the model's trainable parameters.
        device: Device the model and every batch are moved to.

    Returns:
        The model after it has been moved to ``device`` and trained.
    """
    model = model.to(device)
    model.train()

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        loop = tqdm(enumerate(dataloader), leave=False, total=len(dataloader))
        for _batch_idx, dl in loop:
            ids = dl['ids'].to(device)
            token_type_ids = dl['token_type_ids'].to(device)
            mask = dl['mask'].to(device)
            label = dl['target'].to(device)

            optimizer.zero_grad()

            output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            # Pass the output through unchanged. The previous output.squeeze()
            # dropped the batch dimension whenever a batch held a single
            # sample, so the output no longer lined up with ``label``.
            loss = loss_fn(output, label)
            loss.backward()

            optimizer.step()

            # Per-batch accuracy, used for the progress bar only.
            predicted_labels = output.argmax(dim=1)
            correct = (predicted_labels == label).sum().item()
            accuracy = correct / label.size(0)

            # Show progress while training
            loop.set_description(f'Epoch {epoch + 1}/{epochs}')
            loop.set_postfix(loss=loss.item(), acc=accuracy)

    return model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Train for one epoch.
# NOTE(review): the saved output below reads "Epoch 1/2" and "Epoch 2/2", so it
# appears to come from an earlier run with epochs=2 — re-run to refresh it.
model=finetune(1, dataloader, model, loss_fn, optimizer, device)

Epoch 1/2


  0%|          | 0/104 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 2/2




In [None]:

def calculate_accuracy(predictions, true_labels):
    """Return the fraction of predictions that equal their true label.

    Args:
        predictions: Sized iterable of predicted labels.
        true_labels: Sized iterable of reference labels, in the same order.

    Returns:
        ``correct / len(true_labels)`` as a float, or 0.0 when both inputs
        are empty.

    Raises:
        ValueError: If the two inputs differ in length. Previously ``zip``
            truncated to the shorter one and silently skewed the result.
    """
    n_predictions, n_labels = len(predictions), len(true_labels)
    if n_predictions != n_labels:
        raise ValueError(
            f"predictions and true_labels differ in length: {n_predictions} != {n_labels}"
        )
    if n_labels == 0:
        # Nothing to score; avoid the ZeroDivisionError the division would raise.
        return 0.0

    correct = sum(1 for p, t in zip(predictions, true_labels) if p == t)
    return correct / n_labels


def predict_on_test_data(model, test_dataloader, device):
    """Collect the arg-max class index the model predicts for every sample.

    Args:
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        test_dataloader: Sized iterable of batches; each batch is a dict with
            the tensors 'ids', 'mask' and 'token_type_ids'.
        device: Device the model and every batch are moved to.

    Returns:
        A flat list of predicted class indices, in dataloader order.
    """
    model.eval()
    model = model.to(device)
    predicted_classes = []

    progress = tqdm(test_dataloader, leave=False, total=len(test_dataloader))
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for features in progress:
            logits = model(
                ids=features['ids'].to(device),
                mask=features['mask'].to(device),
                token_type_ids=features['token_type_ids'].to(device),
            )
            predicted_classes += logits.argmax(dim=1).tolist()

    return predicted_classes


In [None]:
import os
import pandas as pd
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from tqdm import tqdm

class YourTestDataset(Dataset):
    """Torch dataset that tokenizes the text rows of ``df`` for evaluation.

    Each item is a dict holding the ``ids``, ``mask`` and ``token_type_ids``
    tensors; a ``target`` tensor is added unless ``is_test`` is true.

    Args:
        tokenizer: Callable tokenizer whose result provides the keys
            "input_ids", "token_type_ids" and "attention_mask".
        df: DataFrame with the text in column 0 and, when ``is_test`` is
            false, the integer label in column 1.
        max_length: Length every sequence is padded or truncated to.
        is_test: When true, items carry no 'target' entry.
    """

    def __init__(self, tokenizer, df, max_length, is_test=False):
        super(YourTestDataset, self).__init__()
        # Use the frame that was passed in; the ``df`` argument used to be
        # ignored in favour of the notebook-level ``test_data``.
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df.iloc[index, 0]

        # padding='max_length' replaces the deprecated pad_to_max_length=True, and
        # truncation=True makes explicit the truncation that was only applied
        # implicitly (with a warning) because max_length was given.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        item = {
            'ids': torch.tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.tensor(inputs["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long)
        }
        if not self.is_test:
            item['target'] = torch.tensor(int(self.df.iloc[index, 1]), dtype=torch.long)
        return item

# Build the evaluation dataset from the test_data frame loaded earlier;
# is_test=True makes every item omit the 'target' tensor.
test_dataset = YourTestDataset(tokenizer, test_data, max_length=100, is_test=True)
# Batched inference instead of one sample per step; shuffle stays off so the
# predictions keep the row order of true_labels.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

true_labels = test_data['label']

# Make predictions on the test data
predictions = predict_on_test_data(model, test_dataloader, device)
# Calculate and print accuracy
accuracy = calculate_accuracy(predictions, true_labels)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")


                                                 

Accuracy on test data: 15.66%




In [None]:
# Class distribution of the ground-truth test labels.
true_labels.value_counts()

2    57
5    54
0    51
7    51
4    48
6    48
3    41
1    14
Name: label, dtype: int64

In [None]:
# Convert the list of predicted class ids into a pandas Series.

predictions=pd.Series(predictions)


In [None]:
# Distribution of the predicted classes, for comparison with the true-label counts.
predictions.value_counts()

7    349
5     15
dtype: int64

In [None]:
# Class distribution of the training labels (heavily imbalanced in the saved output).
train_data["label"].value_counts()

7    975
5    947
6    624
4    345
2    179
0    151
3     61
1     27
Name: label, dtype: int64