In [None]:
pip install -q -U "torch==2.1.2" tensorboard
!pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"


In [None]:
!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
!pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f


In [None]:
import gc
import os
import warnings

# Environment variables are set before the GPU/tokenizer libraries below are
# imported, matching the order of the original cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from peft import AutoPeftModelForCausalLM
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split


In [None]:
# Report the PyTorch version and pick the first CUDA device when available, else the CPU
print(f"Using PyTorch version {torch.__version__}")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Path of the labelled headlines file.
# NOTE(review): the name ends in .json but the file is parsed with read_csv as
# two header-less columns (sentiment, text) — confirm the file is CSV-formatted.
filename = "../input/sentiment-analysis/all-data.json"

# Undecodable bytes are replaced rather than raising (encoding_errors="replace")
df = pd.read_csv(filename, names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace")

# Per-sentiment split: 300 training rows and 300 test rows for each label
X_train = []
X_test = []
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(df[df.sentiment == sentiment], train_size=300, test_size=300, random_state=42)
    X_train.append(train)
    X_test.append(test)

# Concatenate the per-label pieces; only the training set is shuffled
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

# Evaluation pool = rows used by neither train nor test for ANY sentiment.
# The concatenated frames still carry df's original index here, so they (not the
# loop-leftover `train`/`test`, which only hold the last label) define what is
# excluded. A set keeps the membership test linear.
used_idx = set(X_train.index) | set(X_test.index)
eval_idx = [idx for idx in df.index if idx not in used_idx]
X_eval = df[df.index.isin(eval_idx)]

# Draw 50 rows per label; replace=True allows labels with fewer than 50 leftover rows
X_eval = (X_eval.groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

# Define functions to generate prompts for training and evaluation data
def generate_prompt(data_point):
    """Build the training prompt for one row, including its gold label.

    Args:
        data_point: Mapping-like row exposing ``"text"`` (the headline) and
            ``"sentiment"`` (the label appended after ``=``).

    Returns:
        The instruction, a blank line, then ``[<text>] = <sentiment>``.
    """
    instruction = (
        "Analyze the sentiment of the news headline enclosed in square brackets, "
        "determine if it is positive, neutral, or negative, and return the answer as "
        'the corresponding sentiment label "positive" or "neutral" or "negative".'
    )
    headline = data_point["text"]
    label = data_point["sentiment"]
    return f"{instruction}\n\n[{headline}] = {label}"

def generate_test_prompt(data_point):
    """Build the inference prompt for one row, leaving the label blank.

    Args:
        data_point: Mapping-like row exposing ``"text"`` (the headline).

    Returns:
        The instruction, a blank line, then ``[<text>] =`` with nothing after
        the equals sign so the model supplies the label.
    """
    instruction = (
        "Analyze the sentiment of the news headline enclosed in square brackets, "
        "determine if it is positive, neutral, or negative, and return the answer as "
        'the corresponding sentiment label "positive" or "neutral" or "negative".'
    )
    headline = data_point["text"]
    return f"{instruction}\n\n[{headline}] ="

# Keep the gold test labels before X_test is replaced by its prompt-only frame
y_true = X_test["sentiment"]

# Render every row to a prompt string, stored in a single "text" column.
# Train and eval prompts contain the label; test prompts stop after "=".
X_train = pd.DataFrame(
    X_train.apply(generate_prompt, axis=1),
    columns=["text"],
)
X_eval = pd.DataFrame(
    X_eval.apply(generate_prompt, axis=1),
    columns=["text"],
)
X_test = pd.DataFrame(
    X_test.apply(generate_test_prompt, axis=1),
    columns=["text"],
)

# Wrap the train and eval frames as Hugging Face datasets for the trainer
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)


In [None]:
# Define evaluation function
def evaluate(y_true, y_pred):
    """Print accuracy metrics comparing predicted and true sentiment labels.

    Labels are mapped to integers (negative=0, neutral=1, positive=2). The
    string ``"none"`` and any unrecognised label are counted as neutral (1).

    Args:
        y_true: Iterable of gold sentiment strings.
        y_pred: Iterable of predicted sentiment strings, same length as ``y_true``.

    Returns:
        None. Overall accuracy, per-label accuracy, the classification report
        and the confusion matrix are printed.
    """
    mapping = {'positive': 2, 'neutral': 1, 'none': 1, 'negative': 0}

    def map_func(x):
        # Anything outside the mapping falls back to neutral
        return mapping.get(x, 1)

    # Plain comprehensions (instead of np.vectorize) also accept empty input
    y_true = np.array([map_func(x) for x in y_true], dtype=int)
    y_pred = np.array([map_func(x) for x in y_pred], dtype=int)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the rows of each true label, in ascending label order
    for label in np.unique(y_true):
        label_mask = y_true == label
        label_accuracy = accuracy_score(y_true[label_mask], y_pred[label_mask])
        print(f'Accuracy for label {label}: {label_accuracy:.3f}')

    # Precision/recall/F1 per class — previously computed but never shown
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Rows are true labels, columns are predictions, both ordered 0, 1, 2
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

# Base checkpoint and the dtype used for quantized compute and model loading
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization, computing in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized causal LM onto the selected device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=device,
)

# Disable the model's use_cache flag and set pretraining_tp to 1.
# NOTE(review): pretraining_tp is a tensor-parallelism setting on Llama-style
# configs, not a "token probability" — confirm it has any effect for Mixtral.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Tokenizer: pad with the EOS token, padding on the right
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Let trl apply its chat format; it returns the (possibly updated) model and tokenizer
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
def predict(test, model, tokenizer):
    """Classify each prompt in ``test`` with a one-token greedy generation.

    Args:
        test: DataFrame with a ``"text"`` column of prompts ending in ``=``.
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        List with one entry per row, each ``"positive"``, ``"negative"``,
        ``"neutral"``, or ``"none"`` when no label is found in the output.
    """
    # The pipeline does not depend on the prompt, so build it once rather than
    # once per row. do_sample=False requests greedy decoding explicitly
    # (previously expressed as temperature=0.0).
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1,
                    do_sample=False,
                    )

    y_pred = []
    for prompt in tqdm(test["text"].tolist()):
        result = pipe(prompt)
        # generated_text echoes the prompt; the answer is whatever follows the last "="
        answer = result[0]['generated_text'].split("=")[-1].strip().lower()
        # Same priority as before: positive, then negative, then neutral
        for label in ("positive", "negative", "neutral"):
            if label in answer:
                y_pred.append(label)
                break
        else:
            y_pred.append("none")
    return y_pred

# Score the model on the held-out test prompts before any fine-tuning
y_pred = predict(test=X_test, model=model, tokenizer=tokenizer)

# Print the metrics for these predictions against the gold labels
evaluate(y_true=y_true, y_pred=y_pred)

# Directory that receives the trained adapter weights
output_dir = "trained_weights"

# LoRA adapter settings: rank-64 adapters on all linear layers, no bias training
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Trainer hyper-parameters, grouped by concern
training_arguments = TrainingArguments(
    # Output and reporting
    output_dir=output_dir,
    report_to="tensorboard",
    logging_steps=25,
    save_steps=0,
    evaluation_strategy="epoch",
    # Schedule length (max_steps=-1 leaves the length to num_train_epochs)
    num_train_epochs=3,
    max_steps=-1,
    # Batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    group_by_length=True,
    # Memory and precision
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
)

# Supervised fine-tuning trainer over the "text" column of the prompt datasets
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

# Run the fine-tuning loop
trainer.train()

# Persist the trained model and the tokenizer side by side in output_dir
trainer.save_model()
tokenizer.save_pretrained(output_dir)

# Load tensorboard extension
%load_ext tensorboard

# Launch tensorboard for visualization
%tensorboard --logdir logs/runs

# Clean up memory: drop the references that keep the model and training state
# alive. Imported classes (SFTTrainer, LoraConfig, BitsAndBytesConfig) are left
# in place — deleting them frees nothing and breaks re-running earlier cells.
del model, tokenizer, peft_config, trainer, train_data, eval_data
del training_arguments

# Collect unreachable objects first, then release PyTorch's cached GPU blocks
gc.collect()
torch.cuda.empty_cache()

# Load fine-tuned model using AutoPeftModelForCausalLM
finetuned_model_dir = "./trained_weights/"
compute_dtype = getattr(torch, "float16")

# Reload the tokenizer that was saved next to the adapter after
# setup_chat_format, not the stock base-model tokenizer, so the tokenizer
# written to ./merged_model matches the one the model was trained with.
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)

finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model_dir,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     device_map=device,
)

# Fold the LoRA weights into the base model, then save the merged model and
# tokenizer together as sharded safetensors
merged_model = finetuned_model.merge_and_unload()
merged_model.save_pretrained("./merged_model", safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")
