<a href="https://www.kaggle.com/code/josebambora/mistral-sentimental-analysis?scriptVersionId=176212028" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install -q -U torch=='2.0.0'

In [2]:
!pip install -q -U accelerate=='0.25.0' peft=='0.7.1' bitsandbytes=='0.41.3.post2' transformers=='4.36.1' trl=='0.7.4'

In [3]:
from huggingface_hub import notebook_login
import os
import warnings
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, concatenate_datasets
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import re
import requests
import gzip
import shutil

2024-05-07 10:41:57.832391: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 10:41:57.832480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 10:41:57.995166: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Show the Hugging Face login widget; the notebook later calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Make only GPU 0 visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Turn off parallelism in the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Suppress every Python warning from this point on (this also hides useful ones).
warnings.filterwarnings("ignore")

# Prepare the data

- Prepare the IMDb dataset.
- Format each example as a prompt: the instruction is wrapped in [INST] … [/INST], followed by the review in square brackets and, for training examples, its sentiment label.
- Allocate 900 cases for training, 100 for validation, and 2500 for testing.

In [6]:
def save_data():
    """Download the gzipped movie-review CSV and unpack it to ``movie_data.csv``.

    The archive is saved in the current working directory under the last path
    component of the URL (``movie_data.csv.gz``) and then decompressed next
    to it.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never written to disk as if it were the archive.
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz"
    filename = url.split("/")[-1]

    # Fetch first and validate the status before touching the filesystem, so a
    # failed download does not leave an empty or bogus archive behind.
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    with open(filename, "wb") as f:
        f.write(r.content)

    # Decompress the file that was just written (same name as derived above).
    with gzip.open(filename, 'rb') as f_in:
        with open('movie_data.csv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
# Download and unpack the CSV so that movie_data.csv exists before prepare_data() reads it.
save_data()

In [7]:
def generate_prompt(data_point):
    """Build the training prompt (instruction + review + answer) for one example.

    ``data_point`` must expose ``"text"`` (the review) and ``"label"``. A label
    other than 1 is rendered as "negative", a label of 1 as "positive".
    Every run of whitespace in the result, including whitespace inside the
    review itself, is collapsed to a single space.
    """
    label = 'negative' if data_point["label"] != 1 else 'positive'
    raw_prompt = f"""
            [INST]Analyze the sentiment of the movie review enclosed in square brackets,
            determine if it is positive, or negative, and return the answer as
            the corresponding sentiment label "positive" or "negative"[/INST]

            [{data_point["text"]}] = {label}"""
    # Trim the template's leading newline/indent, then flatten to one line.
    trimmed = raw_prompt.strip()
    return re.sub(r'\s+', ' ', trimmed)

def generate_test_prompt(data_point):
    """Build the inference prompt for one review, leaving the answer open.

    ``data_point`` is the review text itself. The result ends with ``=``
    (the template's trailing space is stripped) and every run of whitespace
    is collapsed to a single space.
    """
    raw_prompt = f"""
            [INST]Analyze the sentiment of the movie review enclosed in square brackets,
            determine if it is positive, or negative, and return the answer as
            the corresponding sentiment label "positive" or "negative"[/INST]

            [{data_point}] = """
    # Trim first (this drops the blank after "="), then flatten to one line.
    trimmed = raw_prompt.strip()
    return re.sub(r'\s+', ' ', trimmed)

In [8]:
# Seed passed to every dataset shuffle in select() and generate_data().
random_seed = 2000

def select(data,label_result,range_num):
    """Return the first ``range_num`` rows with the given label, after a seeded shuffle.

    Rows of ``data`` whose ``'label'`` equals ``label_result`` are kept,
    shuffled with the notebook-level ``random_seed``, and the leading
    ``range_num`` of them are returned.
    """
    same_label = data.filter(lambda example: example['label'] == label_result)
    shuffled = same_label.shuffle(seed=random_seed)
    return shuffled.select(range(range_num))

def generate_data(data):
    """Shuffle ``data`` with ``random_seed`` and replace each row's ``'text'`` with its training prompt."""
    def _as_prompt(elem):
        # Replace the raw review with the full instruction/review/label prompt.
        return {'text': generate_prompt(elem)}

    shuffled = data.shuffle(seed=random_seed)
    return shuffled.map(_as_prompt)

In [23]:
def prepare_data_train(imdb):
    """Build balanced training and evaluation prompt datasets from ``imdb['train']``.

    For each sentiment, 500 rows are sampled with ``select``; the first 450 go
    to training and the remaining 50 to evaluation. Positive and negative
    parts are concatenated (positive first) and passed through
    ``generate_data``.

    Returns:
        ``(data_train, data_eval)``.
    """
    train_parts = []
    eval_parts = []
    # Positive (1) first, then negative (0), to keep the concatenation order.
    for sentiment in (1, 0):
        sampled = select(imdb['train'], sentiment, 500)
        train_parts.append(sampled.select(indices=range(450)))
        eval_parts.append(sampled.select(indices=range(450, 500)))

    data_train = generate_data(concatenate_datasets(train_parts))
    data_eval = generate_data(concatenate_datasets(eval_parts))
    return data_train, data_eval

def prepare_data_test(df, start=40000, stop=42500):
    """Build the test prompts and their true labels from the review DataFrame.

    Args:
        df: DataFrame with a ``'review'`` column (text) and a ``'sentiment'``
            column (label).
        start: first positional row of the test slice (inclusive).
        stop: last positional row of the test slice (exclusive).

    Returns:
        ``(X_test, y_true)``: ``X_test`` is a DataFrame holding only the
        ``'text'`` prompt column (original index preserved), and ``y_true``
        is the list of ``'sentiment'`` values for the same rows.
    """
    # Copy the slice so that adding a column writes to an independent frame
    # rather than to a view of `df` (avoids pandas' chained-assignment trap).
    X_test = df.iloc[start:stop].copy()
    X_test['text'] = X_test['review'].apply(generate_test_prompt)
    y_true = list(X_test['sentiment'])
    return X_test.drop(['sentiment','review'],axis=1), y_true

def prepare_data():
    """Load both data sources and return ``(data_train, data_eval, X_test, y_true)``.

    Training/evaluation prompts come from the ``imdb`` hub dataset; the test
    prompts and labels come from the local ``movie_data.csv``.
    """
    # NOTE(review): both sources look like IMDb review corpora, so the test
    # slice may overlap the sampled training rows - confirm there is no leakage.
    reviews_frame = pd.read_csv('movie_data.csv')
    imdb_splits = load_dataset('imdb')

    train_split, eval_split = prepare_data_train(imdb_splits)
    test_frame, test_labels = prepare_data_test(reviews_frame)
    return train_split, eval_split, test_frame, test_labels

In [24]:
# Build the training/evaluation prompt datasets plus the test prompts and their true labels.
data_train, data_eval, X_test, y_true = prepare_data()

In [11]:
# Debug Messages, uncomment if necessary
# print(data_train)
# print(data_eval)
# print(data_train[0]['text'])
# print(X_test.info())
# print(y_true)

# Functions for Model Evaluation

- accuracy_for_label: Computes accuracy for positive or negative reviews.
- evaluate: Computes overall accuracy and accuracy for each label using the previous function.

In [12]:
def accuracy_for_label(y_true, y_pred, label):
    """Return the accuracy restricted to the samples whose true value equals ``label``.

    ``y_true`` and ``y_pred`` must be indexable by position and aligned.
    """
    kept_true = []
    kept_pred = []
    for position, actual in enumerate(y_true):
        if actual == label:
            kept_true.append(y_true[position])
            kept_pred.append(y_pred[position])
    return accuracy_score(kept_true, kept_pred)

def evaluate(y_true, y_pred):
    """Print overall accuracy, then accuracy on negative (0) and positive (1) reviews."""
    truth, guesses = np.array(y_true), np.array(y_pred)

    # Overall accuracy is reported before the per-label figures are computed.
    accuracy = accuracy_score(y_true=truth, y_pred=guesses)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy: label 0 is negative, label 1 is positive.
    accuracy_negative, accuracy_positive = (
        accuracy_for_label(truth, guesses, sentiment) for sentiment in (0, 1)
    )

    print(f'Accuracy for negative reviews: {accuracy_negative:.3f}')
    print(f'Accuracy for positive reviews: {accuracy_positive:.3f}')

# Functions for Answer Generation

- generate_response: Retrieves the model's response.
- predict: Invokes generate_response for each test case.

In [13]:
def generate_response(prompt,model,tokenizer):
    """Generate one token for ``prompt`` and return the lower-cased answer text.

    A text-generation pipeline is built around ``model`` and ``tokenizer`` on
    every call, and exactly one new token is generated. The answer is taken
    as everything after the last ``=`` in the pipeline's ``generated_text``.

    Args:
        prompt: prompt string, expected to end with ``=``.
        model: causal language model handed to the pipeline.
        tokenizer: tokenizer matching ``model``.

    Returns:
        The lower-cased text that follows the last ``=``.
    """
    pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 1,
                        # Request greedy decoding explicitly. A temperature of
                        # 0.0 is not a valid sampling temperature and is
                        # ignored when sampling is off.
                        do_sample = False)
    result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
    return result[0]['generated_text'].split("=")[-1].lower()

def predict(X_test, model, tokenizer):
    """Classify every prompt in ``X_test['text']`` and return a list of 0/1 predictions.

    A row is predicted as 1 when the model's answer contains "positive",
    otherwise as 0.
    """
    predictions = []
    for row in tqdm(range(len(X_test))):
        answer = generate_response(X_test.iloc[row]["text"], model, tokenizer)
        predictions.append(1 if "positive" in answer else 0)
    return predictions

# Mistral Workflow

- Load Mistral-7B-Instruct-v0.2 with 4-bit quantization.
- Assess the performance of the base model.
- Fine-tune the model on our data with LoRA.
- Evaluate the retrained model.

In [14]:
def get_model():
    """Load Mistral-7B-Instruct-v0.2 in 4-bit NF4 and its tokenizer.

    Returns:
        ``(model, tokenizer)``. The tokenizer pads on the left, adds BOS and
        EOS tokens, and reuses the EOS token as the padding token.
    """
    checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"

    # 4-bit NF4 weights without double quantization; compute runs in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
        bnb_4bit_compute_dtype=torch.float16,
    )

    llm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=quantization,
        device_map="auto",
    )
    llm.config.use_cache = False
    llm.config.pretraining_tp = 1

    # NOTE(review): trust_remote_code=True allows code from the hub repo to run;
    # confirm it is actually required for this tokenizer.
    tok = AutoTokenizer.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        padding_side="left",
        add_bos_token=True,
        add_eos_token=True,
    )
    tok.pad_token = tok.eos_token
    return (llm, tok)

In [15]:
# Load the 4-bit quantized model and its tokenizer; both are used as notebook-level globals below.
model,tokenizer = get_model()

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [16]:
# Base model performance. This evaluation is slow, so it is commented out; uncomment the two lines below to run it.
# y_pred = predict(X_test, model, tokenizer)
# evaluate(y_true, y_pred)

In [17]:
def train_configuration():
    """Assemble the ``SFTTrainer`` that fine-tunes the model with LoRA.

    Reads the notebook-level ``model``, ``tokenizer``, ``data_train`` and
    ``data_eval`` variables, so the cells defining them must have run first.

    Returns:
        The configured ``SFTTrainer``; call ``.train()`` on it to start.
    """
    # LoRA adapter settings.
    lora = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    hyperparameters = TrainingArguments(
        output_dir="mistral_retrained",
        # Schedule and batching.
        num_train_epochs=4,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        group_by_length=True,
        # Optimizer and learning-rate schedule.
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.001,
        max_grad_norm=0.3,
        fp16=True,
        # Checkpointing, logging and evaluation.
        save_steps=0,
        logging_steps=25,
        report_to="tensorboard",
        evaluation_strategy="epoch",
    )

    # NOTE(review): prompts longer than max_seq_length tokens are presumably
    # truncated, which would cut off the trailing "= <label>" - confirm.
    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=hyperparameters,
        peft_config=lora,
        train_dataset=data_train,
        eval_dataset=data_eval,
        dataset_text_field="text",
        max_seq_length=512,
        packing=False,
    )

In [18]:
# Build the trainer from the notebook-level model/tokenizer/datasets, then fine-tune.
trainer = train_configuration()
trainer.train()

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.0486,2.06951
2,1.9664,2.07854
3,1.8297,2.104064
4,1.7339,2.127548


TrainOutput(global_step=900, training_loss=1.9081769222683376, metrics={'train_runtime': 5169.7089, 'train_samples_per_second': 0.696, 'train_steps_per_second': 0.174, 'total_flos': 5.055962222051328e+16, 'train_loss': 1.9081769222683376, 'epoch': 4.0})

In [25]:
# Evaluate on the test prompts after training. `model` is the same object that was
# passed to SFTTrainer; presumably it now carries the trained adapters - TODO confirm.
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 2500/2500 [1:20:31<00:00,  1.93s/it]

Accuracy: 0.964
Accuracy for negative reviews: 0.970
Accuracy for positive reviews: 0.959





In [26]:
# Upload the training output (adapter weights, training args, logs) to the Hugging Face Hub.
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

events.out.tfevents.1715078702.2f49363661b3.34.0:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JoseBambora/mistral_retrained/commit/2cbdfb783459feb96c26ffd9057a7aa75ae4eb64', commit_message='End of training', commit_description='', oid='2cbdfb783459feb96c26ffd9057a7aa75ae4eb64', pr_url=None, pr_revision=None, pr_num=None)