In [None]:
import unsloth
from unsloth import FastLanguageModel
import torch
# ---------------------------------------------------------------------------
# Loader settings (also reused by the trainer further down).
# ---------------------------------------------------------------------------
max_seq_length = 11000  # sequence length handed to the loader; original note: RoPE scaling is handled internally
dtype = None            # None -> automatic detection (original note: float16 on T4/V100, bfloat16 on Ampere+)
load_in_4bit = True     # 4-bit quantized loading to cut memory use; may be set to False

# Base checkpoint plus its tokenizer.
# For gated checkpoints (e.g. meta-llama/Llama-2-7b-hf) a `token = "hf_..."`
# keyword can be added to this call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the base model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,              # adapter rank; any value > 0 (suggested: 8, 16, 32, 64, 128)
    lora_alpha=16,
    lora_dropout=0,    # any value works, 0 is the optimized setting
    bias="none",       # any value works, "none" is the optimized setting
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # True or "unsloth"; the original note recommends "unsloth" for very long
    # context (less VRAM, larger batch sizes).
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but switched off
    loftq_config=None,  # LoftQ is available but not used
)

In [None]:
from transformers import AutoTokenizer

import os

# Directory containing the training text files (one document per file).
DIR = "Byrons_Data/tfidf/train"#"FINALFALSE"

# Instruction that is prepended to every example.
INSTRUCTION = "For the given data from the court decision, determine whether the principle of proportionality strictu sensu was applied or not."

# Load the tokenizer used for counting.
# NOTE(review): this rebinds the global `tokenizer` that
# FastLanguageModel.from_pretrained returned in the first cell, so every later
# cell uses this AutoTokenizer instance instead. Kept unchanged to preserve
# behaviour -- confirm that this is intended.
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")

# Labels are assigned purely by position in the sorted directory listing:
# entries 0-79 get the "not applied" label, entries 80-159 the "applied" one.
# NOTE(review): this silently mislabels data if the directory holds a
# different number of files per class -- verify against the data set.
applied_string = "proportionality principle strictu sensu applied"
not_applied_string = "proportionality principle strictu sensu not applied"
vector = [not_applied_string] * 80 + [applied_string] * (80) #80 each i think


def _count_tokens(text):
    """Return how many token ids `tokenizer` produces for `text`."""
    # A plain Python list is enough for counting; no tensors are needed.
    return len(tokenizer(text)["input_ids"])


# The instruction never changes, so it is tokenized once instead of per file.
instruction_tokens = _count_tokens(INSTRUCTION)

# One (filename, total, instruction, story, summary) tuple per processed file,
# so the counts can be inspected afterwards (e.g. compared to max_seq_length).
# The total is the plain sum of the three parts; it does not include the
# "### Story" / "### Summary" template text added by combine_texts.
token_counts = []

# Walk the first len(vector) entries (160 with the labels above) of the sorted
# listing. Tying the slice to len(vector) keeps vector[i] in range.
for i, filename in enumerate(sorted(os.listdir(DIR))[:len(vector)]):
    file_path = os.path.join(DIR, filename)

    # Directories keep their slot in the numbering but are not read.
    if os.path.isdir(file_path):
        continue

    with open(file_path, "r") as file:
        story = file.read()
    summary = vector[i]

    # Count tokens
    story_tokens = _count_tokens(story)
    summary_tokens = _count_tokens(summary)
    total_tokens = instruction_tokens + story_tokens + summary_tokens

    token_counts.append(
        (filename, total_tokens, instruction_tokens, story_tokens, summary_tokens)
    )
    # print(f"{total_tokens:<12}{instruction_tokens:<12}{story_tokens:<12}{summary_tokens:<12}")


In [None]:
from datasets import Dataset

import pandas as pd

import os

# Location of the training documents and the instruction shared by all examples.
DIR = "Byrons_Data/tfidf/train"#"FINALFALSE"
INSTRUCTION = "For the given data from the court decision, determine whether the principle of proportionality strictu sensu was applied or not."

# Three independent, empty single-column frames (instruction / story / summary).
df_instructions, df_stories, df_summaries = (
    pd.DataFrame(columns=['text']) for _ in range(3)
)

def combine_texts(instruction, story, summary):
  """Assemble one prompt/example string from its three parts.

  The layout is: a leading newline, the instruction, a blank line, a
  "### Story" header followed by the story, a blank line, a "### Summary"
  header followed by the summary, and a trailing newline.

  Args:
    instruction: Task description placed at the top.
    story: Document text placed under "### Story".
    summary: Target text placed under "### Summary" (may be empty).

  Returns:
    A dict with the single key "text" holding the assembled string.
  """
  # Each entry is one line of the result; empty entries yield the blank
  # lines, including the leading and trailing newline.
  pieces = (
      "",
      f"{instruction}",
      "",
      "### Story",
      f"{story}",
      "",
      "### Summary",
      f"{summary}",
      "",
  )
  return {"text": "\n".join(pieces)}


# Read the first len(vector) entries (160 with the current labels) of the
# sorted directory listing and pair each file with its positional label from
# `vector`. Tying the slice to len(vector) keeps vector[i] in range.
#
# Rows are collected in plain lists and each DataFrame is built once at the
# end; growing a frame with pd.concat inside the loop copies all previous rows
# on every iteration.
instructions, stories, summaries = [], [], []

for i, filename in enumerate(sorted(os.listdir(DIR))[:len(vector)]):
    file_path = os.path.join(DIR, filename)

    # Directories keep their slot in the numbering but contribute no row.
    if os.path.isdir(file_path):
        continue

    with open(file_path, "r") as file:
        story = file.read()
    summary = vector[i]

    instructions.append(INSTRUCTION)
    stories.append(story)
    summaries.append(summary)

# Same three single-column frames as before, now created in one step each.
df_instructions = pd.DataFrame(instructions, columns=['text'])
df_stories = pd.DataFrame(stories, columns=['text'])
df_summaries = pd.DataFrame(summaries, columns=['text'])

# One {"text": ...} dict per example, in file order.
combined_texts = [
    combine_texts(instruction, story, summary)
    for instruction, story, summary in zip(instructions, stories, summaries)
]

# NOTE(review): no end-of-sequence token is appended to the training text
# here; whether the trainer adds one depends on the TRL version -- confirm.
finetuning_dataset = Dataset.from_dict({"text": [ct["text"] for ct in combined_texts]})
# finetuning_dataset[15]['text']

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once, using the helper imported from unsloth
# for this purpose (it was imported but unused before), so that the fp16 and
# bf16 flags are always exact opposites of a single check.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning on the "text" column of `finetuning_dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = finetuning_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,  # reduce to lower memory allocation
    dataset_num_proc = 2,  # 2 or 1
    packing = False,  # original note: packing can make training 5x faster for short sequences
    args = TrainingArguments(
        # 2 samples per step x 4 accumulation steps = 8 samples per optimizer
        # update on each device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 6,  # 6 and 8 were tried
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # gradient_checkpointing=True,  # Enable if supported by the model
    ),
)

# Run training; the returned statistics object is kept for later inspection.
trainer_stats = trainer.train()

In [None]:
import os

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Directory with the held-out test documents.
# NOTE(review): this rebinds the global DIR used by the data-preparation
# cells; re-running those cells afterwards would read the test directory.
DIR = "Byrons_Data/TEST_REG"

# Document to classify.
filename_now = "Big_False_82.txt"
file_path = os.path.join(DIR, filename_now)

# A directory cannot be read as a document, so fail loudly instead of skipping.
# IsADirectoryError is a subclass of Exception, so existing handlers still match.
if os.path.isdir(file_path):
    raise IsADirectoryError(f"{file_path} is a directory, not a file.")

# Read the story text
with open(file_path, "r") as file:
    story_text2 = file.read()

# The summary slot is left empty: the model is expected to fill it in.
summary = ""

# Build the prompt with the same template that produced the training text.
example = combine_texts(INSTRUCTION, story_text2, summary)

# With an empty summary the template ends in "### Summary\n\n", whereas in the
# training text the label starts directly after "### Summary\n". Normalize the
# end of the prompt to a single newline so that generation starts at the same
# position the label occupied during training.
example["text"] = example["text"].rstrip("\n") + "\n"

# Tokenize the single prompt as a batch of one and move it to the GPU.
inputs = tokenizer(
[
    example["text"]
], return_tensors = "pt").to("cuda")

