# Preprocess Training Data

# 1. Import Required Libraries

In [2]:
import re
import nltk
import json
from textblob import TextBlob
import numpy as np
import faiss

In [1]:
#from peft import LoraConfig, get_peft_model
import os
#os.environ["TRANSFORMERS_NO_TF"] = "1"  # Force PyTorch mode
from transformers import AutoTokenizer, AutoModel
import torch

In [18]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForLanguageModeling


In [5]:
#load json file

with open("training_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

print(f"Loaded {len(data)} text files.")

Loaded 12 text files.


# 2. Preprocess Text

prepare text for use in pretrained model

In [6]:
def clean_text(text):
    ''' 
    Remove unwanted formatting, line breaks, and unnecessary symbols.
    Normalize text (convert to lowercase, remove extra spaces).

    Args:
    text: json file with text for training 

    returns: processed text
    '''

    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    text = re.sub(r"[^\w\s,.!?]", "", text)  # Remove special characters
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s,.!?;:()\"'-]", "", text)  # Keep standard characters
    
    return text



In [7]:
# Apply preprocessing to all text data
for item in data:
    item["cleaned_content"] = clean_text(item["content"])

print("Preprocessing complete!")

Preprocessing complete!


In [8]:
#preview
slices = data[1:2]

for val in slices:
    print(val['content'])
    print(val['cleaned_content'])

His value unquestioned
His worth immeasurable
Scaloni knew this was the piece to preserve if they were to make the finals
Di Maria was scarcely used in the knockout stages
And for 63 min of the finals 
He left Everything on the field 
A man for the Big Occasion
in '05 U20 World Cup
'08 Olympic Gold
'21 Copa America Champions
'22 World Cup Champions

his value unquestioned his worth immeasurable scaloni knew this was the piece to preserve if they were to make the finals di maria was scarcely used in the knockout stages and for 63 min of the finals he left everything on the field a man for the big occasion in 05 u20 world cup 08 olympic gold 21 copa america champions 22 world cup champions


In [9]:
# Prepare clean text for fine tune training
formatted_data = []
for item in data:
    formatted_data.append({
        "prompt": f"What is the story of {item['file_name'].replace('_', ' ').replace('.txt', '')}?",
        "response": item["cleaned_content"]
    })

In [10]:
#preview
slices = formatted_data[1:2]

for val in slices:
    print(val['prompt'])
    print(val['response'])

What is the story of argentina unsung hero?
his value unquestioned his worth immeasurable scaloni knew this was the piece to preserve if they were to make the finals di maria was scarcely used in the knockout stages and for 63 min of the finals he left everything on the field a man for the big occasion in 05 u20 world cup 08 olympic gold 21 copa america champions 22 world cup champions


In [11]:
# Save as JSONL file
with open("ft_training_data.jsonl", "w") as f:
    for item in formatted_data:
        f.write(json.dumps(item) + "\n")

# 3. Fine Tuning: Peft LoRA technique

Reduces GPU memory usage
Fine-tunes faster
Only trains a small subset of parameters

In [21]:
#assign llama 3.2 to variable name
model_llama_32 = "meta-llama/Llama-3.2-1B"

tokenizer_llama_32 = AutoTokenizer.from_pretrained(model_llama_32)

# Load Model on CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

# Enable 4-bit quantization to reduce memory usage
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

quantized_model = AutoModelForCausalLM.from_pretrained(model_llama_32, 
                                                       device_map="auto",
                                                       low_cpu_mem_usage=True,
                                                       torch_dtype=torch.bfloat16,
                                                       quantization_config=quantization_config)


In [23]:
lora_config = LoraConfig(
    r=16, #Lora rank: lower = faster, higher = more fine tuning effect
    lora_alpha=16, #scaling factor
    target_modules=["q_proj", "v_proj"], ## Apply LoRA to query and value projection layers
    lora_dropout=0.05, # Dropout for better generalization
    bias="none",
    task_type = TaskType.CAUSAL_LM,
)
    #modules_to_save=["classifier"],
#)

model_lora = get_peft_model(quantized_model,lora_config)
model_lora.print_trainable_parameters()

trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [24]:
#load dataset for fine tuning
ft_dataset = load_dataset("json", data_files="ft_training_data.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
#check for correct data structure
print(ft_dataset[0]) 

{'prompt': 'What is the story of achievment unlocked?', 'response': 'he was herald as the successor to maradona he matched his world cup record assists with 8 he outpaced his goal talley with 13 16 the record by klose to which he couldnt be any closer but the child of promise waited 16 years to return glory to argentina 36 years later'}


In [29]:
# Set the pad_token to the EOS token o
tokenizer_llama_32.pad_token = tokenizer_llama_32.eos_token  

# Tokenize dataset
def tokenize_function(text):
    #combined_text = text["prompt"] + "\n" + text["response"] 
    combined_text = [p + "\n" + r for p, r in zip(text["prompt"], text["response"])]  # Concatenate prompt + response 
    return tokenizer_llama_32(combined_text, truncation=True, padding="max_length", max_length=512)

tokenized_datasets = ft_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [None]:
#define training arguments with huggin face trainer

training_args = TrainingArguments(
    output_dir="CreativeAssistant_AI",
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=8,  # Simulates larger batch sizes
    warmup_steps=100,
    max_steps=1000,  # Number of steps (adjust for longer training)
    learning_rate=2e-4,  # LoRA fine-tuning works well with higher LR
    fp16=True,  # Use mixed precision
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    evaluation_strategy="no",  # No validation for now
    report_to="none",
    label_names =["labels"]

)



# 4. Train the Model

In [34]:
# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer_llama_32, mlm=False)

trainer = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer_llama_32,
    data_collator=data_collator,
    )

  trainer = Trainer(


In [35]:
# Train the model
trainer.train()



Step,Training Loss
10,4.3136
20,4.2812
30,4.1807
40,3.9568
50,3.5891
60,3.0521
70,2.3267
80,1.5332
90,0.781
100,0.294


TrainOutput(global_step=1000, training_loss=0.3488978078961372, metrics={'train_runtime': 3253.1277, 'train_samples_per_second': 9.837, 'train_steps_per_second': 0.307, 'total_flos': 3.5936872169472e+16, 'train_loss': 0.3488978078961372, 'epoch': 1000.0})

In [39]:
trainer.save_model("creativeAssistant_AI")

In [37]:
#results = trainer.evaluate()
#print(results) 

In [47]:
from transformers import pipeline

pipe = pipeline("text-generation", model='creativeAssistant_AI',temperature=0.7, top_p=0.4, top_k=30)
prompt = "Tell the story about Marcus Rasford debut Premier league goal?"
output = pipe(prompt, truncation= True, max_length=200)
print(output[0]['generated_text'])

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Tell the story about Marcus Rasford debut Premier league goal? Marcus Rasford was born on 10th May 1994 in the city of London. He is a professional footballer who plays as a striker for the Premier League club Leicester City. He made his debut for the club in the 2014/15 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/16 season. He scored his first goal for the club in the 2015/


In [46]:
print(output)

[{'generated_text': 'Tell the story about Marcus Rasford debut Premier league goal? How many goals did he score in his first 14 games in the premier league? How many goals did he score in his second 14 games in the premier league? How many goals did he score in his third 14 games in the premier league? How many goals did he score in his fourth 14 games in the premier league? How many goals did he score in his fifth 14 games in the premier league? How many goals did he score in his sixth 14 games in the premier league? How many goals did he score in his seventh 14 games in the premier league? How many goals did he score in his eighth 14 games in the premier league? How many goals did he score in his ninth 14 games in the premier league? How many goals did he score in his tenth 14 games in the premier league? How many goals did he score in his eleventh 14 games in the premier league'}]


## Convert Text to Embeddings 

using bert based transformer: SBERT
does not require manual tokenization of text input
model handles tokenization internally

In [None]:
#from sentence_transformers import SentenceTransformer

#embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

#print("Embedding model loaded!")




In [None]:
#load tokenizer and model from Huggin Face
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

In [None]:
def get_embedding(text):
    ''' Generates sentence embeddings using transformer all-MiniLM-L6-v2 '''
    inputs = tokenizer(text,
                       return_tensors ='pt',
                       padding= True,
                       truncation= True
                       )
    
    with torch.no_grad(): 
        outputs = model(**inputs)

    
    #Mean pooling - Take the average of the last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

In [None]:
for item in data:
    #text = item.get("content","") #extract content 
    item["embedding"] = get_embedding(item["cleaned_content"])
    

In [None]:
#save embeddings to json file
#with open("text_embeddings.json", "w", encoding="utf-8") as file:
#    json.dump(data,file, indent =4, ensure_ascii= False)

#print("Embeddings saved")


In [None]:
#preview
slices = data[1:2]

for val in slices:
    print(val)

## Store Embeddings in FAISS

Faiss is a library for efficient similarity search and clustering of dense vectors.


In [None]:
#check if faiss installed
try:
    import faiss
    print("FAISS is installed!")
except ImportError:
    print("FAISS is NOT installed.")



installed faiss-gpu in conda terminal Window ; 

conda install conda-forge::faiss-gpu

In [None]:
#creating Faiss index

#import faiss
#import numpy as np

#dimension based on embedding size
embedding_dim = 384 #this is based on expected output size of SBERT model

#create Faiss index with GPU
res = faiss.StandardGpuResources() 
index = faiss.GpuIndexFlatL2(res, embedding_dim) #L2 distance for similarity

#convert embeddings to Faiss format 
embedding_matrix =np.array([item["embedding"] for item in data], dtype =np.float32) #convert to float 32 for Faiss

# Add embeddings to the FAISS index
index.add(embedding_matrix)


# Verify index size
print(f"FAISS index contains {index.ntotal} vectors")

In [None]:
# Save FAISS index
#faiss.write_index(index, "faiss_index.bin")

#print("FAISS index saved!")

In [None]:
# Convert embeddings into a numpy array
embedding_matrix = np.array([item["embedding"] for item in data], dtype=np.float32)

# Create FAISS index (for efficient similarity search)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")
