In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
%pip install datasets

In [3]:
from datasets import load_dataset
xdf = load_dataset('csv', data_files="/content/drive/MyDrive/Scripts/30 Days Of Bert/Day06/train.csv")
xdf

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16800
    })
})

In [4]:
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [5]:
def conver2pandas(data):
    data.set_format(type="pandas")
    df = data["train"][:]
    return df


xpandas = conver2pandas(xdf)
xpandas

Unnamed: 0,text,label
0,"rt @user olha quem chegouuuuu, nossos queridin...",0
1,veio umas teorias muito loucas na minha cabeça...,1
2,@user @user 😂😂😂😂mais nao tinha falado ontem qu...,0
3,rt @user quer ser filha da puta logo comigo qu...,1
4,vai besta 😂😂😂😂 casquei com a ultima foto,1
...,...,...
16795,performer da nação caralho,1
16796,"vôlei feminino é foda né, pqp, só vem tóquio",1
16797,"@user cara de pau, quem desrespeita a constitu...",1
16798,duas das grandes atletas do frescobol mundial....,0


In [6]:
xpandas['text'] = xpandas['text'].apply(preprocessor)
xpandas

Unnamed: 0,text,label
0,rt user olha quem chegouuuuu nossos queridinho...,0
1,veio umas teorias muito loucas na minha cabeça...,1
2,user user mais nao tinha falado ontem que não...,0
3,rt user quer ser filha da puta logo comigo que...,1
4,vai besta casquei com a ultima foto,1
...,...,...
16795,performer da nação caralho,1
16796,vôlei feminino é foda né pqp só vem tóquio,1
16797,user cara de pau quem desrespeita a constitui...,1
16798,duas das grandes atletas do frescobol mundial ...,0


In [7]:
import datasets
xpandas = datasets.Dataset.from_pandas(xpandas)

In [8]:
xpandas = xpandas.train_test_split(test_size=0.1)
xpandas

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 15120
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1680
    })
})

In [10]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
!nvidia-smi

Tue Feb 11 23:10:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
from transformers import BertTokenizer
from transformers import AutoTokenizer

model_ckpt="google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")

xpandas = xpandas.map(tokenize, batched=True, batch_size=None)
xpandas

Map:   0%|          | 0/15120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15120
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1680
    })
})

In [14]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel

model_ckpt = "bert-base-uncased"
num_labels = 2

model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device))

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


In [45]:
from transformers import Trainer, TrainingArguments
batch_size = 64

logging_steps = len(xdf["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=5,
                                  learning_rate=1e-10,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.001,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")



In [None]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=xpandas["train"],
                  eval_dataset=xpandas["test"],
                  tokenizer=tokenizer)
trainer.train();

  trainer = Trainer(model=model, args=training_args,


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.621474,0.724405,0.724467


* Huh: Why I'm getting `NO LOG` on the Training Loss?
* Aha: Logging Steps?

In [37]:
trainer.evaluate(eval_dataset=xpandas["test"])

{'eval_loss': 0.621474027633667,
 'eval_accuracy': 0.7244047619047619,
 'eval_f1': 0.7244671452533932,
 'eval_runtime': 11.9971,
 'eval_samples_per_second': 140.034,
 'eval_steps_per_second': 2.251,
 'epoch': 10.0}

In [38]:
from datasets import load_dataset

val = load_dataset('csv', data_files="/content/drive/MyDrive/Scripts/30 Days Of Bert/Day10/test (4).csv")
val = val.remove_columns(["id"])
val

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4200
    })
})

In [39]:
val_enc = val.map(tokenize, batched=True, batch_size=None)
val_enc

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4200
    })
})

In [40]:
y_hat = trainer.predict(val_enc['train'])
y_hat

PredictionOutput(predictions=array([[ 1.103897  , -1.3144125 ],
       [ 1.1091374 , -1.2743248 ],
       [ 0.59915197, -0.8065247 ],
       ...,
       [ 1.7111828 , -1.9566948 ],
       [ 1.5077775 , -1.5601753 ],
       [-0.5577407 ,  0.8461161 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 44.2704, 'test_samples_per_second': 94.872, 'test_steps_per_second': 1.491})

In [41]:
import numpy as np

y_hat_argmax = np.argmax(y_hat.predictions, axis=1)
y_hat_argmax

array([0, 0, 0, ..., 0, 0, 1])

In [43]:
import pandas as pd
final = pd.read_csv("/content/sample_submission.csv")
final['label'] = y_hat_argmax
final

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
4195,4195,0
4196,4196,1
4197,4197,0
4198,4198,0


In [44]:
final.to_csv('out2.csv', index=False)