# DistilBERT Pre-trained Model

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the training data and chart each author's share of the rows.
df = pd.read_csv("./train/train.csv")
author_counts = df['author'].value_counts()

plt.figure(figsize=(6, 7))
# autopct prints each slice's percentage with one decimal place.
author_counts.plot.pie(autopct='%1.1f%%')
plt.title("Distribution of Authors in Training Set")
plt.show()

## DistilBERT Pipeline and Import

In [None]:
# import torch
# from transformers import pipeline
# device = 0 if torch.cuda.is_available() else -1
# if torch.cuda.is_available():
#     print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")

# classifier = pipeline(
#     task="text-classification",
#     model="distilbert-base-uncased-finetuned-sst-2-english",
#     dtype=torch.float16,
#     device=device,
# )

# result = classifier("I love using Hugging Face Transformers!")
# print(result)
# # Output: [{'label': 'POSITIVE', 'score': 0.9998}]

In [None]:
from transformers import DistilBertConfig, DistilBertModel

# Build a DistilBERT encoder straight from the library's default configuration.
# No checkpoint is loaded here, so the weights are randomly initialised.
model = DistilBertModel(DistilBertConfig())

# Keep a handle on the configuration object the model carries.
configuration = model.config

In [None]:
# DistilBERT tokenization, dataset prep, Trainer pipeline
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import pipeline
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer

# Load the training CSV and attach an integer class id to every row.
df = pd.read_csv('./train/train.csv')
# Sorting the author names keeps the name -> id assignment stable between runs.
authors = sorted(df['author'].unique())
label_map = dict(zip(authors, range(len(authors))))
df['label'] = df['author'].map(label_map)

# Hold out 10% of the rows for validation, stratified on the class id,
# and renumber each split from zero.
train_df, val_df = (
    split.reset_index(drop=True)
    for split in train_test_split(
        df[['text', 'label']],
        test_size=0.1,
        stratify=df['label'],
        random_state=42,
    )
)

# Wrap both splits as Hugging Face Dataset objects.
train_ds, val_ds = Dataset.from_pandas(train_df), Dataset.from_pandas(val_df)

# Fast tokenizer for the uncased DistilBERT checkpoint
# (fetched from the Hugging Face Hub when it is not cached locally).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_batch(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 256 tokens.

    Args:
        examples: Mapping with a ``'text'`` key; presumably a list of strings,
            since it is applied through ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 256}
    return tokenizer(examples['text'], **encode_options)

# Tokenize datasets (batched)
train_ds = train_ds.map(tokenize_batch, batched=True)
val_ds = val_ds.map(tokenize_batch, batched=True)

# Expose only the columns listed in `cols`, formatted as PyTorch tensors
cols = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=cols)
val_ds.set_format(type='torch', columns=cols)

# Inverse map (class id -> author name) for human-readable predictions.
# Built before the model so it can be registered in the model config below.
inv_label_map = {v: k for k, v in label_map.items()}

# Initialize model for sequence classification.
# Registering id2label / label2id stores the author names in the model config,
# so the checkpoint saved later reports author names instead of generic
# 'LABEL_<n>' strings when it is used for inference.
num_labels = len(label_map)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=inv_label_map,
    label2id=label_map,
)

# Metrics function
from sklearn.metrics import f1_score, accuracy_score
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max is
            taken along axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        dict with ``'accuracy'`` and the weighted-average ``'f1'``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    true_ids = p.label_ids
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': f1_score(true_ids, predicted_ids, average='weighted'),
    }

# Fine-tuning hyper-parameters; tune the epoch count and batch sizes to the hardware.
use_fp16 = torch.cuda.is_available()  # mixed precision only when CUDA is present
training_args = TrainingArguments(
    output_dir='./distilbert-author-classifier',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_steps=50,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best 'f1' when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=use_fp16,
)

# NOTE(review): newer transformers releases may prefer `processing_class=` over
# `tokenizer=` here; confirm against the installed version before changing it.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# Fine-tune the model (a GPU is recommended for speed).
trainer.train()

# After training, save the model and tokenizer to the same directory.
model_path = './distilbert-author-classifier'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Quick inference example that reloads the checkpoint saved above.
# The pipeline takes a CUDA device index, or -1 for CPU.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model_path, tokenizer=tokenizer, device=device)

# Example: print the predicted author for a sample sentence.
preds = classifier("It was a cold, lonely night and I felt a strange dread.")
if isinstance(preds, list) and preds and 'label' in preds[0]:
    # A checkpoint saved without id2label reports generic 'LABEL_<n>' names;
    # translate those back to author names. Any other label is left untouched.
    for pred in preds:
        prefix, _, class_id = pred['label'].partition('_')
        if prefix == 'LABEL' and class_id.isdecimal():
            pred['label'] = inv_label_map.get(int(class_id), pred['label'])
    print(preds)

# Label mapping (for reference):
print(label_map)
print(inv_label_map)


## Load Trained DistilBERT

In [None]:
# Directory the training cell saved the fine-tuned model and tokenizer into.
model_path = './distilbert-author-classifier'

# Rebuild the sequence-classification model from that saved checkpoint.
model = DistilBertForSequenceClassification.from_pretrained(
    model_path,
)
