# Dependencies

In [1]:
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from datasets import load_dataset, DatasetDict, Dataset
from transformers import BertForSequenceClassification

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, PeftModelForSequenceClassification
import evaluate
import torch
import numpy as np

import pandas as pd
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras import Model, Input
from keras.callbacks import EarlyStopping,ModelCheckpoint
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from transformers import TextClassificationPipeline
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader
from peft import LoraConfig, TaskType
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import pipeline, AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
import pandas as pd
import numpy as np
import torch.nn.functional as F
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Suppress all Python warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
2024-09-23 11:28:18.797556: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 11:28:19.824671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-23 11:28:20.343895: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-23 11:28:20.419660: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-23 11:28:21.0

Using device: cpu


# Make Data Objects

In [2]:
# Name of the pretrained checkpoint passed to AutoTokenizer / AutoModelForSequenceClassification
# in the cells below; it is also embedded in the output file and directory names.
MODEL_NAME = "bert-base-uncased"
# Checkpoint names kept here for quick switching:
##distilbert-base-uncased
##bert-base-uncased

In [17]:
def tokenize_function(examples):
    """Tokenise the ``"text"`` field of ``examples`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 512 tokens
    (``padding="max_length"``), so all encoded rows have the same length.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)

In [3]:
# Read the cleaned train/test CSV files into a DatasetDict.
dataset = load_dataset(
    'csv',
    data_files={'train': 'Cleaned_Datasets/train.csv', 'test': 'Cleaned_Datasets/test.csv'},
)

# Tokenise every split, drop the raw text column, and expose the "generated"
# column under the name "labels" so the model call can pick it up as the target.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("generated", "labels")
)
tokenized_datasets.set_format("torch")

BATCH_SIZE = 32

# Only the training loader shuffles; the evaluation loader keeps dataset order.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=BATCH_SIZE,
)

In [19]:
# Display the tokenised training split (its feature names and row count).
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 175145
})

In [None]:
# Write the tokenised training split to disk under a name that embeds MODEL_NAME.
# NOTE(review): the "gpu" in the names suggests it is reloaded on a GPU machine — confirm.
for_gpu_train_df = tokenized_datasets["train"]
for_gpu_train_df.save_to_disk(f"train_gpu_{MODEL_NAME}.hf")

# Finetune Model

In [None]:
# Label vocabulary for the two-class (Human vs. LLM) classifier.
id2label = {0: "Human", 1: "LLM"}
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained checkpoint with a sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id
)

# LoRA settings: rank-2 adapters on all linear layers, configured for training.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules='all-linear',
    r=2,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear learning-rate decay over every optimisation step, with no warm-up.
EPOCHS = 2
num_training_steps = EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Move the model to the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Training

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(EPOCHS):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch includes "labels", so the output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update()

# Testing

In [6]:
# Evaluate the model on the test loader: accumulate accuracy batch by batch and
# keep every per-example prediction and class-probability vector in memory.

# Number of per-example results to print. The complete results remain available
# in `all_predictions` / `all_probabilities`; printing one line per test example
# floods the notebook output.
PREVIEW_ROWS = 20

model.eval()
metric = evaluate.load("accuracy")
all_predictions = []
all_probabilities = []

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Softmax over the last dimension turns the logits into class probabilities;
    # the predicted class is the index of the largest logit.
    probabilities = F.softmax(logits, dim=-1)
    predictions = torch.argmax(logits, dim=-1)

    # Add batch to metric
    metric.add_batch(predictions=predictions, references=batch["labels"])

    # Store predictions and probabilities (moved to CPU as NumPy values)
    all_predictions.extend(predictions.cpu().numpy())
    all_probabilities.extend(probabilities.cpu().numpy())

# Compute the final metric
metric_result = metric.compute()
print(metric_result)

# Show only a short preview of the per-example results.
for pred, prob in zip(all_predictions[:PREVIEW_ROWS], all_probabilities[:PREVIEW_ROWS]):
    print(f"Prediction: {pred}, Probability: {prob}")

{'accuracy': 0.9798339264531435}
Prediction: 0, Probability: [0.99483913 0.00516087]
Prediction: 0, Probability: [9.9958640e-01 4.1364584e-04]
Prediction: 1, Probability: [0.00281514 0.9971848 ]
Prediction: 1, Probability: [0.00387453 0.9961255 ]
Prediction: 0, Probability: [9.9987650e-01 1.2352972e-04]
Prediction: 1, Probability: [5.795084e-04 9.994205e-01]
Prediction: 0, Probability: [0.9888416  0.01115844]
Prediction: 0, Probability: [9.9931467e-01 6.8535615e-04]
Prediction: 0, Probability: [9.9962366e-01 3.7635828e-04]
Prediction: 0, Probability: [0.9921071  0.00789291]
Prediction: 0, Probability: [9.9963057e-01 3.6943622e-04]
Prediction: 0, Probability: [0.99561334 0.00438661]
Prediction: 1, Probability: [0.00322308 0.9967769 ]
Prediction: 0, Probability: [9.9920624e-01 7.9379912e-04]
Prediction: 1, Probability: [0.00275625 0.99724376]
Prediction: 0, Probability: [0.9960098  0.00399019]
Prediction: 1, Probability: [0.46144733 0.5385527 ]
Prediction: 0, Probability: [9.9986017e-01 

# Save

In [4]:
# Saving the fine-tuned model is currently disabled (the call below is commented out),
# so this cell only writes the tokenizer files.
#model.save_pretrained(f'Finetuned_Models/{MODEL_NAME}_fine_tuned_model_two', "lora_adapter", save_adapter=True, save_config=True, from_pt=True)
# Load the tokenizer for MODEL_NAME and store its files under Tokenizers/.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(f'Tokenizers/{MODEL_NAME}_tokenizer')

('Tokenizers/bert-base-uncased_tokenizer/tokenizer_config.json',
 'Tokenizers/bert-base-uncased_tokenizer/special_tokens_map.json',
 'Tokenizers/bert-base-uncased_tokenizer/vocab.txt',
 'Tokenizers/bert-base-uncased_tokenizer/added_tokens.json',
 'Tokenizers/bert-base-uncased_tokenizer/tokenizer.json')

# Load

In [7]:
# NOTE: this rebinds MODEL_NAME; every cell executed after this one (including
# earlier cells if they are re-run) will use the DistilBERT checkpoint name.
MODEL_NAME = "distilbert-base-uncased"

In [None]:
# Load the tokenizer for the current MODEL_NAME (this rebinds the notebook-level
# `tokenizer`) and store its files under Tokenizers/.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(f'Tokenizers/{MODEL_NAME}_tokenizer')

In [8]:
# Label vocabulary for the two-class (Human vs. LLM) classifier.
id2label = {0: "Human", 1: "LLM"}
label2id = {label: idx for idx, label in id2label.items()}

# NOTE(review): an explicit LoraConfig is passed when loading the adapter below
# instead of relying on whatever configuration is stored with the adapter —
# confirm these values match the ones used for training.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules='all-linear',
    r=2,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
)

# Base checkpoint with a sequence-classification head configured for two labels.
BASE_MODEL = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id
)

# Attach the saved adapter, merge it into the base model, and save the merged
# result as a standalone model directory.
COMPLETE_MODEL = PeftModel.from_pretrained(
    BASE_MODEL,
    f'Finetuned_Models/GPU_Complete_{MODEL_NAME}_fine_tuned_model',
    config=peft_config,
).merge_and_unload()
COMPLETE_MODEL.save_pretrained(f'Complete_Models/GPU_{MODEL_NAME}_complete')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
