In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->transformers[torch])
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback


In [None]:
# Read the source table into a DataFrame.
# The path points into /content (presumably the Colab working directory).
file_path = '/content/main_dataframe_extended2.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Encode the BIRADS scores as integer class ids. The fitted encoder keeps the
# original values in `label_encoder.classes_`, which later cells use as names.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time fits the encoder on the already-encoded ids.
label_encoder = LabelEncoder()
df['BIRADS Score'] = label_encoder.fit_transform(df['BIRADS Score'])

# Hold out 12.5% of the rows as the test split (fixed seed for repeatability).
all_texts = df['Content'].tolist()
all_labels = df['BIRADS Score'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.125,
    random_state=42,
)

# Tokenizer that matches the pretrained Turkish BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased')

# Create a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors='pt'`` and must
            return a mapping of tensors with a leading batch axis of size 1.
        max_length: Maximum sequence length passed to the tokenizer.
        padding: Padding strategy passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512, padding='max_length'):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` plus its ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop only the batch axis; a bare squeeze() would also collapse the
        # sequence axis whenever the encoded length happens to be 1.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Class ids are stored as int64.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap both splits so the Trainer can fetch tokenized samples by index.
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
test_dataset = CustomDataset(test_texts, test_labels, tokenizer)

# Store the class-id <-> original-score mapping in the model config so the
# saved / pushed model reports the real score values instead of the generic
# "LABEL_i" names.
id2label = {idx: str(cls) for idx, cls in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder with a freshly initialised classification head.
model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch so the best checkpoint can be
# restored at the end and early stopping has a value to monitor.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
)

data_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): the test split doubles as the evaluation set that drives early
# stopping and best-checkpoint selection, so metrics reported on it afterwards
# are optimistic. Consider carving a separate validation split out of the
# training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# Fine-tune; stops early after 4 evaluations without improvement.
trainer.train()

# Score the restored best checkpoint on the evaluation set.
results = trainer.evaluate()

print("Evaluation results:", results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.535824
2,No log,0.232722
3,No log,0.190564
4,No log,0.241802
5,0.376600,0.235541
6,0.376600,0.149102
7,0.376600,0.142723
8,0.376600,0.136295
9,0.010700,0.13078
10,0.010700,0.133449


Evaluation results: {'eval_loss': 0.13078035414218903, 'eval_runtime': 4.3781, 'eval_samples_per_second': 62.584, 'eval_steps_per_second': 4.111, 'epoch': 13.0}


In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Run inference on the held-out split and take the highest-scoring class.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Display names: the original score values, as strings.
class_labels = [str(label) for label in label_encoder.classes_]

# Pass the full list of class ids explicitly. Without `labels`, scikit-learn
# infers the classes from the data and raises a ValueError when a class is
# absent from both `test_labels` and `preds` (its count no longer matches
# `target_names`).
report = classification_report(
    test_labels,
    preds,
    labels=list(range(len(class_labels))),
    target_names=class_labels,
)
print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        33
           2       0.98      1.00      0.99        46
           3       0.98      1.00      0.99        54
           4       0.97      0.97      0.97        72
           5       1.00      0.97      0.99        69

    accuracy                           0.99       274
   macro avg       0.99      0.99      0.99       274
weighted avg       0.99      0.99      0.99       274



In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with `from_pretrained`.
save_dir = './saved_model_main'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_model_main/tokenizer_config.json',
 './saved_model_main/special_tokens_map.json',
 './saved_model_main/vocab.txt',
 './saved_model_main/added_tokens.json')

In [None]:
pip install transformers huggingface_hub



In [None]:
import os
from getpass import getpass

# Never commit an access token to the notebook. Read it from the HF_TOKEN
# environment variable when present, otherwise prompt without echoing it.
# SECURITY: the write token that used to be hard-coded in this cell has been
# exposed and must be revoked at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the fine-tuned model and its tokenizer to the same Hub repository.
trainer.model.push_to_hub("AAkay/model_son", token=hf_token)
trainer.tokenizer.push_to_hub("AAkay/model_son", token=hf_token)
print("It was published :)")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

It was published :)
