# Mistral LLM

In [9]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset #to convert to Hugg

In [1]:
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-q8_hulh5/unsloth_64c2979f20cd4e21b925bb7771032075
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-q8_hulh5/unsloth_64c2979f20cd4e21b925bb7771032075
  Resolved https://github.com/unslothai/unsloth.git to commit bb112e38ef3f0dafa9e87faf55a6ba7499bd0357
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
data = pd.read_csv("sampled_data.csv")

In [3]:
# Imports Unsloth’s optimized class
from unsloth import FastLanguageModel
# Set the maximum tokens that the model can handle
max_seq_length = 2048

# From FastLanguageModel return the model and the tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [4]:
# LoRa (Low Rank Adaptation) - Prepare the model for fine tuning
# we update only some of the model's parameters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, #rank of the LoRa adapters
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], #the attention layers to which to apply LoRa
    lora_alpha = 16, #scaling factor
    lora_dropout = 0, #dropout inside LoRa layers
    bias = "none", #whether to train bias terms
    use_gradient_checkpointing = "unsloth",
    random_state = 3407, #for reproducibility
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Data Preparation

In [7]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating_x,release_date,plot_synopsis,review_date,user_id,is_spoiler,review_text,rating_y,review_summary,whole_review
3941,tt1074638,When Bond's latest assignment goes gravely wro...,2h 23min,"['Action', 'Adventure', 'Thriller']",7.8,2012-11-09,"The film begins in Istanbul, where James Bond ...",22 November 2012,ur37965905,True,Mendes so meticulously crafts scenes to explor...,3,Batman NOT Bond,Mendes so meticulously crafts scenes to explor...
1897,tt0134847,"The space transport vessel ""Hunter-Gratzner"" c...",1h 49min,"['Horror', 'Sci-Fi']",7.1,2000-02-18,In the distant future in another part of the g...,31 July 2001,ur0762705,True,(Might contain minor spoilers.)Watched this on...,6,An OK Sci-Fi Horror Flick (if you're into that...,(Might contain minor spoilers.)Watched this on...
2229,tt0212346,Undercover FBI agent Gracie Hart shows no sign...,1h 49min,"['Action', 'Comedy', 'Crime']",6.2,2000-12-22,The film opens at a school where a boy is pick...,2 July 2009,ur0819382,True,"I guess ""unpretentious"" is as good a way as an...",6,Modest comedy.,"I guess ""unpretentious"" is as good a way as an..."
4299,tt1389072,"""Downsizing"" follows a kindly occupational the...",2h 15min,"['Comedy', 'Drama', 'Sci-Fi']",5.8,2017-12-22,The film opens at a research facility in Berge...,22 December 2017,ur69980801,True,"If the environment... or rather, The Environme...",1,Not What It Promises to Be as an Obvious Agend...,"If the environment... or rather, The Environme..."
2868,tt0364725,White Goodman (Ben Stiller) is the owner and f...,1h 32min,"['Comedy', 'Sport']",6.7,2004-06-18,The movie opens with an infomercial for Globo ...,22 April 2014,ur6918917,True,"Vince Vaughn, Ben Stiller, Christine Taylor, J...",6,"""Good Comedy!""","Vince Vaughn, Ben Stiller, Christine Taylor, J..."


In [12]:
# Define prompt template
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# EOS token
EOS_TOKEN = tokenizer.eos_token

# Convert pandas DataFrame to HuggingFace Dataset
train_ds = Dataset.from_pandas(train_df)

# Define a function that adds instruction, input, and output columns to our dataframe
def add_instruction(df):
    df["instruction"] = "Classify the following movie review as either 'spoiler' or 'non-spoiler'."
    df["input"] = df["whole_review"]
    df["output"] = df["is_spoiler"]
    return df


# Apply the add_instruction function to our dataframe
train_df = train_df.map(add_instruction)

# Convert dataset into prompt format
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts }

train_df = train_df.map(formatting_prompts_func, batched=True)

# View sample output
print(train_df[0]["text"])

TypeError: 'str' object does not support item assignment