# Scratch_research

In [None]:
# prompt: upgrade to python3.14

# NOTE(review): this runs an executable named `python314` with the arguments
# `pyenv install 3.14`; presumably plain `pyenv install 3.14` was intended — confirm.
!python314 pyenv install 3.14
# Installs pipx for the current user through the default `python3`.
!python3 -m pip install --user pipx
# datasets, transformers and pandas are imported by the training cells below.
!pip install -U datasets transformers pandas
!pip install openai
# NOTE(review): each `!` line presumably runs in its own shell, so this
# activation may not carry over to the `pip install` lines that follow — confirm.
!pyenv activate 3.14
!pip install apache_beam
# `-U` upgrades accelerate if it is already installed.
!pip install accelerate -U

# Generic tuning

In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import load_dataset
import torch
from torch.utils.data import Dataset

# Configuration class
class FineTuneConfig:
    """Plain settings holder for one fine-tuning run.

    Every constructor argument is stored unchanged on the instance under the
    same name. No validation or conversion is performed.
    """

    def __init__(self, data_source, cache_dir, tokenizer_model,
                 pretrained_model_path, output_model_path, training_args,
                 question_column, answer_column, question_prefix,
                 answer_prefix):
        # Dataset location and download cache.
        self.data_source, self.cache_dir = data_source, cache_dir

        # Tokenizer / model identifiers and where the tuned model is written.
        self.tokenizer_model = tokenizer_model
        self.pretrained_model_path = pretrained_model_path
        self.output_model_path = output_model_path

        # Keyword settings for the trainer, kept as given.
        self.training_args = training_args

        # Column names holding the two halves of each example, and the text
        # put in front of each half.
        self.question_column, self.answer_column = question_column, answer_column
        self.question_prefix, self.answer_prefix = question_prefix, answer_prefix

class CustomDataset(Dataset):
    """Map-style dataset over pre-tokenized tensors for language-model training.

    ``encodings`` maps field names to indexable tensors whose first dimension
    is the example index. It must contain ``'input_ids'``; an optional
    ``'attention_mask'`` is used to mask padding out of the labels.
    """

    # Default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``: targets with
    # this value do not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry.

        ``labels`` is an independent copy of ``input_ids`` (not an alias), so
        changing one never changes the other. Positions whose attention mask
        is 0 are set to ``IGNORE_INDEX`` so padding is not trained on; without
        an attention mask the labels equal the input ids.
        """
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}

        labels = item['input_ids'].clone()
        mask = item.get('attention_mask')
        if mask is not None:
            labels[mask == 0] = self.IGNORE_INDEX
        item['labels'] = labels
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` tensor."""
        return len(self.encodings['input_ids'])

def _encode_qa_frame(tokenizer, data, question_column, answer_column,
                     question_prefix, answer_prefix, max_length):
    """Tokenize one frame of question/answer rows into a right-padded CustomDataset."""
    question_texts = [question_prefix + q for q in data[question_column].tolist()]
    answer_texts = [answer_prefix + a for a in data[answer_column].tolist()]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("tokenizer.pad_token must be set before tokenizing")

    if question_texts:
        # No padding here: each half is only truncated, so the two halves can
        # be joined back-to-back before any padding is added.
        question_ids = tokenizer(question_texts, truncation=True, max_length=max_length).input_ids
        answer_ids = tokenizer(answer_texts, truncation=True, max_length=max_length).input_ids
    else:
        question_ids, answer_ids = [], []

    sequences = [q + a for q, a in zip(question_ids, answer_ids)]
    width = max(map(len, sequences), default=0)

    # Start from an all-padding batch, then copy each real sequence into the
    # left part of its row so padding only ever appears at the end.
    input_ids = torch.full((len(sequences), width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((len(sequences), width), dtype=torch.long)
    for row, token_ids in enumerate(sequences):
        input_ids[row, :len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)
        attention_mask[row, :len(token_ids)] = 1

    return CustomDataset({"input_ids": input_ids, "attention_mask": attention_mask})


def tokenize_data(tokenizer,
                  train_data: pd.DataFrame,
                  val_data: pd.DataFrame,
                  question_column: str,
                  answer_column: str,
                  question_prefix: str,
                  answer_prefix: str,
                  max_length: int = 256) -> "tuple[CustomDataset, CustomDataset]":
    """Build training and validation datasets from question/answer columns.

    Each example is ``question_prefix + question`` immediately followed by
    ``answer_prefix + answer``. Both halves are truncated to ``max_length``
    tokens on their own, so a row holds at most ``2 * max_length`` tokens.
    Rows are padded on the right with the tokenizer's pad token and the
    attention mask is 0 on the padding.

    Previously each half was padded separately before concatenation, which
    left padding between the question and its answer.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        train_data: Frame with the training rows.
        val_data: Frame with the validation rows.
        question_column: Column holding the question text.
        answer_column: Column holding the answer text.
        question_prefix: Text put in front of every question.
        answer_prefix: Text put in front of every answer.
        max_length: Per-half truncation length in tokens.

    Returns:
        ``(train_dataset, val_dataset)``.

    Raises:
        ValueError: If the tokenizer has no pad token configured.
    """
    train_dataset = _encode_qa_frame(tokenizer, train_data, question_column, answer_column,
                                     question_prefix, answer_prefix, max_length)
    val_dataset = _encode_qa_frame(tokenizer, val_data, question_column, answer_column,
                                   question_prefix, answer_prefix, max_length)
    return train_dataset, val_dataset

def initialize_model_and_trainer(model_path: str, train_dataset: CustomDataset, val_dataset: CustomDataset, training_args: dict) -> Trainer:
    """Load the GPT-2 LM head model at ``model_path`` and wrap it in a Trainer.

    The model is moved to CUDA when a GPU is reported as available.
    ``training_args`` is expanded as keyword arguments into
    ``TrainingArguments``. The Trainer is returned without starting training.
    """
    lm = GPT2LMHeadModel.from_pretrained(model_path)

    gpu_present = torch.cuda.is_available()
    if gpu_present:
        lm.to('cuda')

    return Trainer(
        model=lm,
        args=TrainingArguments(**training_args),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

def fine_tune_model(config: FineTuneConfig):
    """Run one fine-tuning job end to end: load, tokenize, train, save.

    All inputs come from ``config``. The trained model is written to
    ``config.output_model_path``; nothing is returned.
    """
    raw = load_dataset(config.data_source, cache_dir=config.cache_dir)

    # Fixed-seed 20% subsample of each split, as pandas frames.
    train_frame, val_frame = (
        raw[split].to_pandas().sample(frac=0.20, random_state=42)
        for split in ('train', 'validation')
    )

    # The end-of-sequence token doubles as the padding token.
    tok = GPT2Tokenizer.from_pretrained(config.tokenizer_model)
    tok.pad_token = tok.eos_token

    train_dataset, val_dataset = tokenize_data(
        tok,
        train_frame,
        val_frame,
        config.question_column,
        config.answer_column,
        config.question_prefix,
        config.answer_prefix,
    )

    trainer = initialize_model_and_trainer(
        config.pretrained_model_path,
        train_dataset,
        val_dataset,
        config.training_args,
    )
    trainer.train()
    trainer.model.save_pretrained(config.output_model_path)



# Dialogue Training


In [None]:
# emp
if __name__ == "__main__":
    # Keyword settings later expanded into TrainingArguments.
    TRAINING_ARGS = {
        # Output location.
        'output_dir': "./results",
        'overwrite_output_dir': True,
        # Optimisation schedule.
        'num_train_epochs': 3,
        'learning_rate': 5e-5,
        'gradient_accumulation_steps': 2,
        # Batch sizes.
        'per_device_train_batch_size': 8,
        'per_device_eval_batch_size': 8,
        # Step-based evaluation and checkpointing every 100 steps. Training
        # loss is logged only every 500 steps, which matches the "No log"
        # entries in the recorded output below.
        'evaluation_strategy': "steps",
        'eval_steps': 100,
        'save_steps': 100,
        'logging_steps': 500,
    }

    # Dialogue data: 'prompt' is the question side, 'utterance' the answer side.
    config = FineTuneConfig(
        data_source="dialogues",
        cache_dir='./cache',
        tokenizer_model='gpt2-medium',
        pretrained_model_path='./gpt2-medium_v1',
        output_model_path='/v4',
        training_args=TRAINING_ARGS,
        question_column='prompt',
        answer_column='utterance',
        question_prefix='User: ',
        answer_prefix='AI Assistant: ',
    )

    fine_tune_model(config)

Step,Training Loss,Validation Loss
100,No log,0.28982
200,No log,0.309457
300,No log,0.309678
400,No log,0.30755
500,0.288400,0.311852
600,0.288400,0.316129
700,0.288400,0.312529
800,0.288400,0.310996
900,0.288400,0.316362
1000,0.262900,0.321339


KeyboardInterrupt: ignored

# Create Subset of Natural Language Dataset

# Natural Questions Training

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch

# Step 1: Load the Data
# NOTE(review): `dataset` is not created in this cell; it must already exist in
# the session (presumably from an earlier `load_dataset` call) — confirm.
train_data = dataset['train'].to_pandas().sample(frac=0.2)
val_data = dataset['validation'].to_pandas().sample(frac=0.2)

# Step 2: Tokenize the Data
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Set the padding token (the end-of-sequence token is reused for padding)
tokenizer.pad_token = tokenizer.eos_token

from torch.utils.data import Dataset

# Reuse the shared helper from the "Generic tuning" cell instead of redefining
# CustomDataset and repeating the train/validation tokenization inline.
train_dataset, val_dataset = tokenize_data(
    tokenizer,
    train_data,
    val_data,
    question_column='question',
    answer_column='answer',
    question_prefix='<QUESTION>',
    answer_prefix='<ANSWER>',
    max_length=256,
)

# Step 3: Initialize the Model and Training Configurations
print(torch.cuda.is_available())
model = GPT2LMHeadModel.from_pretrained('./v3')
# Only move to the GPU when one is present so the cell also runs on CPU.
if torch.cuda.is_available():
    model.to('cuda')

# Set the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust based on your needs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=5,
    save_steps=5,
    save_total_limit=10,
    logging_steps=1,
    learning_rate=5e-5,
    evaluation_strategy="steps"
)

# Step 4: Fine-tune the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Step 5: Save the Fine-tuned Model
model.save_pretrained("./model/v2")


True


Step,Training Loss,Validation Loss
5,1.5304,1.499707
10,1.3031,1.448882
15,1.4742,1.427788
20,1.8611,1.41019
25,1.0941,1.409755
30,1.4264,1.414559
35,1.4449,1.410813
40,1.3748,1.410528
45,1.4748,1.412757
50,1.3499,1.410209


# Prompt Training

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import load_dataset
import torch
from sklearn.model_selection import train_test_split

# Run switch for this cell. The whole pipeline sits under the flag: previously
# only data preparation did, so switching it off left the training code
# running against undefined datasets.
vai = True
if vai:
    # Step 1: Load the CSV and hold out 20% of the rows for validation.
    dataset = pd.read_csv('/synthetic_data_v3_no_noise.csv')
    train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

    # Step 2: Tokenize the Data
    # TODO: check on gpt2-small with same training.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    tokenizer.pad_token = tokenizer.eos_token

    from torch.utils.data import Dataset

    # Reuse the shared helper from the "Generic tuning" cell instead of
    # redefining CustomDataset and repeating the tokenization inline.
    train_dataset, val_dataset = tokenize_data(
        tokenizer,
        train_data,
        val_data,
        question_column='job_title',
        answer_column='prompt',
        question_prefix='<JOBTITLE>',
        answer_prefix='<PROMPT>',
        max_length=256,
    )

    print(torch.cuda.is_available())

    # Step 3: Initialize the Model and Training Configurations
    model = GPT2LMHeadModel.from_pretrained('./v1')
    # Only move to the GPU when one is present so the cell also runs on CPU.
    if torch.cuda.is_available():
        model.to('cuda')

    # Set the training arguments
    training_args = TrainingArguments(
        output_dir="./results_condidate",
        overwrite_output_dir=True,
        num_train_epochs=4,  # Adjust based on your needs
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        eval_steps=15,
        save_steps=15,
        save_total_limit=5,
        logging_steps=1,
        learning_rate=5e-5,
        evaluation_strategy="steps"
    )

    # Step 4: Fine-tune the Model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    # Step 5: Save the Fine-tuned Model
    model.save_pretrained("./v3.1")


True


Step,Training Loss,Validation Loss
15,0.9021,0.833937
30,0.8552,0.807611
45,0.8482,0.786707
60,0.8828,0.772602
75,0.8137,0.764358
90,0.6838,0.776928
105,0.7058,0.76556
120,0.6883,0.761695
135,0.6699,0.769946
150,0.6767,0.74946


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
def generate_response(model, tokenizer, prompt):
    """Generate text for ``prompt`` and return the part after ``<ANSWER>``.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Fully formatted prompt text.

    Returns:
        The decoded text following the first ``<ANSWER>`` marker, stripped of
        surrounding whitespace. If the marker is absent, the whole decoded
        text is returned stripped. Previously the first seven characters were
        silently cut off in that case, because ``str.find`` returned -1.
    """
    marker = '<ANSWER>'

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Keep the inputs on the model's device when the model reports one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Every prompt token is real (no padding), so the mask is all ones.
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    # Generate a response
    # NOTE(review): no `do_sample=True` is passed, so temperature/top_k/top_p
    # presumably have no effect here — confirm against the generation docs.
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=256, pad_token_id=tokenizer.eos_token_id, temperature=0.0, top_k=50, top_p=0.95, num_return_sequences=1)

    # Decode the output
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the content after the <ANSWER> marker, if there is one
    _, found, tail = decoded_output.partition(marker)
    return (tail if found else decoded_output).strip()



# Shared tokenizer for both models; end-of-sequence doubles as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

# Baseline: the stock gpt2-medium weights.
base_model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Comparison target: the fine-tuned weights stored under `model_path`.
model_path = "/model/v1"
fine_tuned_model = GPT2LMHeadModel.from_pretrained(model_path)

def print_output(prompt, base_model, fine_tuned_model, *, base_response=None, fine_tuned_response=None):
    """Build the HTML fragment comparing base and fine-tuned model answers.

    Args:
        prompt: Prompt text; ``<QUESTION>`` / ``<ANSWER>`` markers are removed
            before display.
        base_model: Unused; kept so existing calls keep working.
        fine_tuned_model: Unused; kept so existing calls keep working.
        base_response: Text produced by the base model. When omitted, the
            module-level ``base_response`` variable is used, as before.
        fine_tuned_response: Text produced by the fine-tuned model. When
            omitted, the module-level ``fine_tuned_response`` is used.

    Returns:
        An HTML string. Prompt and responses are HTML-escaped so characters
        such as ``<`` and ``&`` are shown literally instead of being parsed
        as markup.

    Raises:
        KeyError: If a response is omitted and no module-level variable of
            that name exists.
    """
    import html

    # Backward-compatible fallback to the variables the notebook cell sets.
    module_scope = globals()
    if base_response is None:
        base_response = module_scope["base_response"]
    if fine_tuned_response is None:
        fine_tuned_response = module_scope["fine_tuned_response"]

    shown_prompt = html.escape(prompt.replace('<QUESTION>', '').replace('<ANSWER>', ''))
    base_text = html.escape(base_response)
    tuned_text = html.escape(fine_tuned_response)

    return """
    <h2>Input:  """ + shown_prompt + """</h2>
    <hr>

    <h2>Gpt2 Base Model</h2>

    <pre style="white-space: pre-wrap;">
    """ + base_text + """
    </pre>
    <hr>

    <h2>Gpt2 Fine-tuned Model</h2>

    <pre style="white-space: pre-wrap;">
    """ + tuned_text + """
    </pre>
    <hr>
    """

from IPython.display import HTML
# Prompt under test; the commented alternatives are kept for quick switching.
# (A stray trailing backslash was removed from this assignment: it continued
# the statement onto the next line, so uncommenting that line would have
# fused two assignments into a syntax error.)
prompt = 'What is the capital of France?'
#prompt = 'How can I make a chocolate cake?'
#prompt = 'Too legit, to legit to '
#prompt = 'Create a chocolate cake recipe' # INSTRUCT MODE
#prompt = 'Software Engineer'
#prompt = 'Party planner'
#prompt = 'List ten things a tourist can do in Venice, France.'

# Wrap the prompt in the same markers the question/answer training cell uses
prompt_formatted = f"<QUESTION>{prompt}<ANSWER>"

# Generate response using base model
base_response = generate_response(base_model, tokenizer, prompt_formatted)

# Generate response using fine-tuned model
fine_tuned_response = generate_response(fine_tuned_model, tokenizer, prompt_formatted)


# print_output reads `base_response` / `fine_tuned_response` set just above.
HTML(print_output(prompt, base_model, fine_tuned_model))



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
def generate_response_instruct(model, tokenizer, prompt):
    """Generate text for ``prompt`` and return the part after ``<PROMPT>``.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Fully formatted prompt text.

    Returns:
        The decoded text following the first ``<PROMPT>`` marker, stripped of
        surrounding whitespace. If the marker is absent, the whole decoded
        text is returned stripped. Previously the first seven characters were
        silently cut off in that case, because ``str.find`` returned -1.
    """
    marker = '<PROMPT>'

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Keep the inputs on the model's device when the model reports one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Every prompt token is real (no padding), so the mask is all ones.
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    # Generate a response
    # NOTE(review): no `do_sample=True` is passed, so temperature/top_k/top_p
    # presumably have no effect here — confirm against the generation docs.
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=256, pad_token_id=tokenizer.eos_token_id, temperature=0.3, top_k=50, top_p=0.95, num_return_sequences=1)

    # Decode the output
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the content after the <PROMPT> marker, if there is one
    _, found, tail = decoded_output.partition(marker)
    return (tail if found else decoded_output).strip()

# Both handles are reset first, so the `is None` checks below always load
# fresh copies when this cell runs.
base_model = None
fine_tuned_model = None

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

if base_model is None:
    # Load the gpt2-medium model
    base_model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Load the fine-tuned model (once: a second, identical load that used to
# follow this block only repeated the work).
model_path = "./v3"
#model_path = "/content/results/checkpoint-1600"
if fine_tuned_model is None:
    fine_tuned_model = GPT2LMHeadModel.from_pretrained(model_path)

def print_output_instruct(job_title, fine_tuned_response, base_response=None):
    """Build the HTML fragment comparing base and fine-tuned model outputs.

    Args:
        job_title: Input text; ``<JOBTITLE>`` / ``<PROMPT>`` markers are
            removed before display.
        fine_tuned_response: Text produced by the fine-tuned model.
        base_response: Text produced by the base model. ``None`` (the
            default) is shown as an empty section; previously it raised
            ``TypeError`` during string concatenation.

    Returns:
        An HTML string. All inserted text is HTML-escaped so characters such
        as ``<`` and ``&`` are shown literally instead of being parsed as
        markup.
    """
    import html

    shown_title = html.escape(job_title.replace('<JOBTITLE>', '').replace('<PROMPT>', ''))
    base_text = html.escape("" if base_response is None else base_response)
    tuned_text = html.escape(fine_tuned_response)

    return """
    <h2>Input:  """ + shown_title + """</h2>
    <hr>

    <h2>Gpt2 Base Model</h2>

    <pre style="white-space: pre-wrap;">
     """ + base_text + """
    </pre>
    <hr>

    <h2>Gpt2 Fine-tuned Model</h2>

    <pre style="white-space: pre-wrap;">
    """ + tuned_text + """
    </pre>
    <hr>
    """

from IPython.display import HTML

def generate_prompt(job_title,generate_base_model=False):
    """Render the fine-tuned model's output for ``job_title`` as HTML.

    The job title is wrapped in ``<JOBTITLE>…<PROMPT>`` markers before
    generation. The base model is queried only when ``generate_base_model``
    is true; otherwise its section is filled with an empty string.
    """
    wrapped = f"<JOBTITLE>{job_title}<PROMPT>"

    # Optional baseline output, produced before the fine-tuned one.
    base_text = (
        generate_response_instruct(base_model, tokenizer, wrapped)
        if generate_base_model
        else ""
    )

    tuned_text = generate_response_instruct(fine_tuned_model, tokenizer, wrapped)

    return HTML(print_output_instruct(job_title, tuned_text, base_text))

