In [None]:
import os
import torch
import pandas as pd
import evaluate
import numpy as np
import seaborn as sns
import wandb
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer, pipeline
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from dotenv import load_dotenv
from huggingface_hub import login
from wandb.sdk.verify.verify import PROJECT_NAME

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Release memory held by PyTorch's CUDA caching allocator before any model is loaded.
torch.cuda.empty_cache()

In [None]:
# Read a local .env file into os.environ; HF_TOKEN and WANDB_API_KEY are looked up there in later cells.
load_dotenv()

## Fine-Tune BERT for Sentiment Classification


In [None]:
# Multi-class Twitter sentiment CSV, fetched directly from GitHub.
DATA_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/twitter_multi_class_sentiment.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# Run identity: W&B project / run names and the Hugging Face Hub repository id.
# NOTE: this assignment replaces the PROJECT_NAME pulled in by the wandb import at the top of the notebook.
PROJECT_NAME = "TwitterClassification"
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
HF_USER = "CharlesMac"
PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
HUB_MODEL_NAME = "/".join((HF_USER, PROJECT_RUN_NAME))

# Training hyperparameters and the logging switch.
EPOCHS, LEARNING_RATE = 3, 2e-5
LOG_INTO_WANDB = True

### Data Analysis

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of rows per value of the `label` column (class balance check).
df['label'].value_counts()

In [None]:
# Horizontal bar chart of how many tweets each class name has, smallest class first.
label_counts = df['label_name'].value_counts(ascending=True)
ax = label_counts.plot.barh()
ax.set_title("Frequency of Classes")
ax.set_ylabel("Classes")
plt.show()

In [None]:
# Whitespace-token count per tweet, then its distribution for each sentiment class.
# `.str.len()` is the vectorised built-in and keeps a missing tweet as NaN,
# whereas `.apply(len)` raises TypeError on it.
df['Words per Tweet'] = df['text'].str.split().str.len()
df.boxplot('Words per Tweet', by='label_name')

### Tokenization

In [None]:
# Checkpoint name shared by the tokenizer and by both model loads further down.
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Tokenize one sample sentence and print the encoding to see what the tokenizer returns.
text = "I love machine learning! Tokenization is awesome!"
encoded_input = tokenizer(text)
print(encoded_input)

In [None]:
# Sanity-check the tokenizer: number of vocabulary entries, reported vocab size, maximum input length.
len(tokenizer.vocab), tokenizer.vocab_size, tokenizer.model_max_length

### Data Loader and Train Test Split

In [None]:
# 70% train; the held-out 30% is then split 2:1 into test (20%) and validation (10%).
# Both splits are stratified by class name and seeded, so a Restart & Run All
# reproduces exactly the same partitions (the second split was previously unseeded).
train, test = train_test_split(df, test_size=0.3, stratify=df['label_name'], random_state=42)
test, validation = train_test_split(test, test_size=1/3, stratify=test['label_name'], random_state=42)

train.shape, test.shape, validation.shape

In [None]:
# Wrap each pandas split in a Dataset (dropping the pandas index) and bundle them by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (('train', train), ('test', test), ('validation', validation))
})


In [None]:
# Show the assembled DatasetDict.
dataset

In [None]:
# Peek at one arbitrary training record (index 122).
dataset['train'][122]

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of ``batch`` with the notebook-level ``tokenizer``.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# Smoke test on the first training example.
print(tokenize(dataset['train'][0]))

In [None]:
# Apply `tokenize` to every split in batched mode.
# NOTE(review): batch_size=None presumably hands each split to `tokenize` as a single batch,
# so padding=True pads to the longest text of that split — confirm against the datasets docs.
emotion_encoded = dataset.map(tokenize, batched=True, batch_size=None)
emotion_encoded

In [None]:
# label2id, id2label
# Build the maps from the distinct (label_name, label) pairs of the training frame rather than
# looping over every Dataset row in Python. Sorting by id gives both dicts a stable, id-ordered
# layout, and int() keeps the ids plain Python ints (not numpy scalars).
label_pairs = train[['label_name', 'label']].drop_duplicates().sort_values('label')
label2id = {name: int(idx) for name, idx in zip(label_pairs['label_name'], label_pairs['label'])}
id2label = {idx: name for name, idx in label2id.items()}

In [None]:
# label2id
# Show the id -> class-name mapping.
id2label

## Model Building

In [None]:
# Log in to HuggingFace

# HF_TOKEN must be present in the environment; a missing key raises KeyError here.
hf_token = os.environ['HF_TOKEN']
# NOTE(review): add_to_git_credential=True presumably also stores the token for git — confirm this is wanted.
login(hf_token, add_to_git_credential=True)

In [None]:
# Fail fast with a KeyError when the W&B key is missing from the environment.
wandb_api_key = os.environ['WANDB_API_KEY']
wandb.login()

# Configure Weights & Biases to record against our project
os.environ.update({
    "WANDB_PROJECT": PROJECT_NAME,
    "WANDB_LOG_MODEL": "checkpoint" if LOG_INTO_WANDB else "end",
    "WANDB_WATCH": "gradients",
})

In [None]:
# Start the W&B run explicitly, but only when logging is switched on.
if LOG_INTO_WANDB:
    wandb.init(project=PROJECT_NAME, name=RUN_NAME)

In [None]:
# Load the checkpoint through AutoModel for inspection only; `model` is reassigned to the
# sequence-classification model a few cells further down.
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Display the module tree of the loaded model.
model

In [None]:
# Attach both label maps to the config, load the sequence-classification model
# from the same checkpoint and move it to the selected device.
num_labels = len(label2id)
config = AutoConfig.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

In [None]:
# Inspect the configuration of the classification model, including the label maps passed in.
model.config

In [None]:
BATCH_SIZE = 64
TRAINING_DIR = "bert_base_training_dir"

# report_to must be the string "none" to switch logging integrations off; passing None
# falls back to the library default instead of disabling reporting.
# NOTE(review): hub_model_id / hub_private_repo presumably only matter when pushing to the Hub,
# and push_to_hub is not set here — confirm intent.
training_args = TrainingArguments(output_dir=TRAINING_DIR,
                                  overwrite_output_dir=True,
                                  num_train_epochs=EPOCHS,
                                  learning_rate=LEARNING_RATE,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  report_to="wandb" if LOG_INTO_WANDB else "none",
                                  run_name=RUN_NAME,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  hub_model_id=HUB_MODEL_NAME,
                                  hub_private_repo=True
                                  )

### Build compute metrics function

In [None]:
# Accuracy metric from the `evaluate` library; the active compute_metrics below
# uses scikit-learn's accuracy_score / f1_score instead.
accuracy = evaluate.load("accuracy")

In [None]:
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)

In [None]:
def compute_metrics(pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class id).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average='weighted'),
    }

## Build Trainer

In [None]:
# The validation split is used for evaluation during training; the test split is
# kept back for the final report.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_encoded['train'],
    eval_dataset=emotion_encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured in training_args.
trainer.train()

In [None]:
# Save the fine-tuned model to a local directory.
trainer.save_model("bert-base-uncased-sentiment1")

## Model Evaluation

In [None]:
# Run the trained model over the held-out test split and show the metrics of that run.
preds_output = trainer.predict(emotion_encoded['test'])
preds_output.metrics

In [None]:
# Ground truth is the test split's label column; the predicted class is the
# index of the largest score in each prediction row.
y_true = emotion_encoded['test'][:]['label']
y_pred = preds_output.predictions.argmax(axis=1)

In [None]:
# Per-class report on the test split; no target names are passed, so rows are keyed by class id.
print(classification_report(y_true, y_pred))

In [None]:
# Show the class-name -> id mapping for reading the report above.
label2id

In [None]:
# confusion_matrix orders rows/columns by class id, so the tick labels must be listed in
# that same order. label2id.keys() follows dict insertion order, which need not be id
# order and could attach the wrong class name to a row or column.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

# Passing labels= pins the matrix to every known class, even one absent from the test split.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize = (5,5))
sns.heatmap(cm, annot=True, xticklabels=class_names, yticklabels=class_names, fmt='d', cbar=False, cmap='Reds')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


## Build Prediction Function and Store Model

In [None]:
# Sample sentence used by the prediction cells below.
text = "I am super happy today. I got it done, finally"




In [None]:
def get_prediction(text):
    """Classify ``text`` with the notebook-level ``model`` and return the label name.

    Args:
        text: The string to classify.

    Returns:
        The ``id2label`` entry for the highest-scoring class.

    Note:
        Switches the notebook-level ``model`` to evaluation mode as a side effect.
    """
    # Inference must run with dropout disabled; the model may still be in training mode.
    model.eval()
    # Truncate so that text longer than the model's maximum input length does not raise.
    input_encoded = tokenizer(text, return_tensors='pt', truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**input_encoded)

    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1).item()
    return id2label[pred]


In [None]:
# print prediction
# Classify the sample sentence with the in-memory model.
get_prediction(text)


In [None]:
# use pipeline for prediction
# Load from the directory written by trainer.save_model(); the name used here before
# lacked the trailing "1" and so did not point at the saved model.

classifier = pipeline("text-classification", model="bert-base-uncased-sentiment1")
classifier([text, "I hate you!", "You are the apple of my eye!"])

In [None]:
# Close the W&B run when logging was enabled.
if LOG_INTO_WANDB:
    wandb.finish()