In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem at this location.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
%%capture
# Install Unsloth and the packages it is used with here; %%capture keeps the
# installer output out of the notebook.
import os
# "COLAB_" appearing in the concatenated environment-variable names is used as
# the signal that this kernel is a Colab runtime.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The --no-deps lines install the named packages without their dependencies
    # (xformers and trl are pinned to exact versions); presumably this avoids
    # disturbing packages Colab already ships - TODO confirm.
    # NOTE(review): unsloth-zoo is installed from the `t4mixed` branch of a
    # GitHub fork rather than a released package - confirm this is still needed.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy
    !pip install --no-deps git+https://github.com/mmathew23/unsloth-zoo.git@t4mixed
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
!pip install --upgrade datasets

In [None]:
%env UNSLOTH_RETURN_LOGITS=1 # Run this to disable CCE since it is not supported for CPT

%env TRANSFORMERS_VERBOSITY=info

In [None]:
from unsloth import FastLanguageModel
import torch

# Notebook-wide settings; max_seq_length is reused when the trainer is built.
max_seq_length = 512
dtype = None          # presumably lets the library pick the dtype - TODO confirm
load_in_4bit = True   # request 4-bit loading of the base weights

# Gather the loader options in one place, then load the model and tokenizer.
model_load_options = dict(
    model_name = "unsloth/mistral-7b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

In [None]:
# Module names that receive LoRA adapters, grouped by the original layout:
# the *_proj layers first, then the embedding / output head entries.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

# NOTE: this rebinds `model` to the PEFT-wrapped model, so re-running the cell
# would wrap the already-wrapped model; reload the base model first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

In [None]:
from textblob import TextBlob
import nltk
import os
from google.colab import drive

# Sentence-tokenizer data; presumably required by TextBlob's sentence
# splitting - TODO confirm.
nltk.download('punkt_tab')

drive.mount('/content/drive')

# Placeholder: set this to the folder that holds the .txt corpus files.
folder_path = "path"

# All sentences from all files, in file-name order.
sentences = []

# os.listdir() returns entries in arbitrary order. Sorting the names makes the
# sentence order - and therefore the sliding-window chunks built from it -
# the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Split the file into sentences and append them to the running list.
    sentences.extend(TextBlob(text).sentences)

print(f"Collected {len(sentences)} total sentences.")
print(sentences[:1])


In [None]:

# Build overlapping training chunks: every run of CHUNK_SIZE consecutive
# sentences, advancing one sentence at a time.
CHUNK_SIZE = 3

chunks = []
# n sentences contain n - CHUNK_SIZE + 1 full windows. Stopping one window
# earlier would drop the window that ends at the last sentence, so the final
# sentence would never reach the training data (and exactly CHUNK_SIZE
# sentences would yield no chunk at all).
for start in range(len(sentences) - CHUNK_SIZE + 1):
    window = sentences[start:start + CHUNK_SIZE]
    chunks.append(" ".join(str(sent) for sent in window))

# Spot-check a single chunk; this needs at least 901 chunks to exist.
chunks[900]

In [None]:

import pandas as pd

# One row per chunk, held in a single "text" column.
df = pd.DataFrame({'text': chunks})
df

In [None]:

from datasets import Dataset

# Convert the pandas frame into a Dataset object for the trainer.
dataset = Dataset.from_pandas(df)

# Spot-check a single row; this needs at least 901 rows to exist.
sample_row = dataset[900]
sample_row

Print the first 5 text chunks from the dataset

In [None]:
# Show the first five "text" entries, each preceded by a separator line.
for row_text in dataset[:5]["text"]:
    print("=========================", row_text, sep="\n")

Append the tokenizer's EOS token to every example so each training sample has an explicit end marker.

In [None]:
# End-of-sequence marker, taken from the tokenizer.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Return the batch's "text" entries with EOS_TOKEN appended to each one."""
    texts_with_eos = []
    for text in examples["text"]:
        texts_with_eos.append(text + EOS_TOKEN)
    return {"text": texts_with_eos}

# NOTE: this rebinds `dataset`, so re-running the cell appends EOS_TOKEN again.
dataset = dataset.map(formatting_prompts_func, batched = True)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Pick the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings. 4 samples per step x 4 accumulation steps gives an
# effective batch of 16 per device.
training_args = UnslothTrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_ratio = 0.1,
    num_train_epochs = 3,
    learning_rate = 2e-5,
    # Separate, smaller learning rate passed for the embedding parameters.
    embedding_learning_rate = 5e-6,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 10,
    optim = "adamw_torch_fused",
    weight_decay = 0.00,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",
)

# Wire the model, tokenizer and dataset into the trainer.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,
    args = training_args,
)

In [None]:
# Run training and keep whatever trainer.train() returns for later inspection.
trainer_stats = trainer.train()

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import textwrap

# Column width used when wrapping the generated text for display.
max_print_width = 50

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Encode a single prompt as PyTorch tensors and move them to the GPU.
prompt = "Foreman we rush"
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Sampling settings, merged with the encoded prompt into one kwargs dict.
sampling_options = {
    "max_new_tokens": 256,
    "use_cache": True,
    "do_sample": True,
    "temperature": 1.0,
    "top_k": 50,
}
generation_kwargs = dict(inputs, **sampling_options)

outputs = model.generate(**generation_kwargs)
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print each decoded sequence wrapped to max_print_width columns.
for text in generated_text:
    print(textwrap.fill(text, width=max_print_width))

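## Push the model and tokenizer to the Hugging Face Hub

Note: the cell below uploads the trained model and tokenizer; it does not produce a GGUF file. Saving the model as a GGUF would need a separate export step.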

In [None]:
import os
from getpass import getpass

# Do not store the access token in the notebook: a saved or shared copy would
# leak it. Read it from the HF_TOKEN environment variable, and fall back to a
# no-echo prompt when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Placeholder: replace with the target repository id on the Hub.
hub_repo_id = "path"

# Upload the model and its tokenizer to the same repository.
model.push_to_hub(hub_repo_id, token = hf_token)
tokenizer.push_to_hub(hub_repo_id, token = hf_token)