<a href="https://colab.research.google.com/github/DylanBingham/fine_tuning_qa_ml_expert/blob/feat%2Ffine_tuned_model_1000_evaluation/LlamaFineTunedModel1000_ForComparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U bitsandbytes
!pip install trl
!pip install -U transformers
!pip install accelerate
!pip install numpy
!pip install torch



In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:

from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForCausalLM
)
import torch

# Since the 8B model is so big it is recommended to quantize the model to a lower precision
# Set load_in_4bit to false if no CUDA enabled GPU
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("dtbingh24/fine_tuning_ml_expert", use_fast=True)

# Load the llama3 model for causal language modeling
model = AutoModelForCausalLM.from_pretrained(
    "dtbingh24/fine_tuning_ml_expert",
    quantization_config=quantization_config,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/55.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [3]:
import pandas as pd

# Assuming the file is a Google Sheet accessible via a link or stored locally
# Replace with your actual file path or Google Sheets URL
file_path_or_url = "/content/drive/MyDrive/Colab Notebooks/data/LLM Dataset - v2.xlsx"

try:
    # Try to read the sheet using pandas
    df = pd.read_excel(file_path_or_url, sheet_name="Final Dataset -  6170 data pts")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found at {file_path_or_url}. Please check the file path.")
except ValueError:
    print(f"Error: Sheet 'Final Dataset - 6170 data pts' not found in the file. Please check the sheet name.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


# Now you can work with the DataFrame 'df'
# Example: print the first 5 rows
#print(df.head())

Dataset loaded successfully.


In [4]:
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame
train_df, test_df = train_test_split(df, test_size=0.1, random_state=24)

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 5552
Test set size: 617


## Begin Testing


In [8]:
from transformers import pipeline
def create_test_prompt(row):
    prompt = dedent(
        f"""
        {row['Question']}
        """
    )
    messages = [
        # the system prompt is very important to adjust the control the behavior of the model, make sure to use properly accoring to your task
        {"role": "system", "content": "You're a domain expert in machine learning and artificial intelligence, answer any questions about these\
         topics as accurately and concisely as possible. You MUST ALSO, no matter what, keep your responses to 300 tokens or less."},
        {"role": "user", "content": prompt},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    return_full_text=False
)

In [9]:
import pandas as pd
from textwrap import dedent

def generate_llama_responses(test_df, pipe, batch_size=8):
    results = []
    prompts = test_df['Question'].tolist()
    for i in range(0, len(prompts), batch_size):
        batch_prompts = prompts[i:i+batch_size]
        batch_inputs = [create_test_prompt({'Question': q}) for q in batch_prompts]
        batch_results = pipe(batch_inputs)
        print(f"On prompt number {i} out of {len(prompts)}")
        for question, output in zip(batch_prompts, batch_results):
            generated_text = output[0]['generated_text'] if isinstance(output, list) else output['generated_text']
            results.append({"question": question, "response": generated_text})
        if i % 100 == 0:
          results_df = pd.DataFrame(results)
          results_df.to_csv(f"/content/drive/MyDrive/Colab Notebooks/data/llama_fine_tune_1000_responses_{i}.csv", index=False)
    return results


results = generate_llama_responses(test_df, pipe)
results_df = pd.DataFrame(results)
results_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/llama_fine_tune_1000_responses.csv", index=False)

On prompt number 0 out of 617
On prompt number 8 out of 617
On prompt number 16 out of 617
On prompt number 24 out of 617
On prompt number 32 out of 617
On prompt number 40 out of 617
On prompt number 48 out of 617
On prompt number 56 out of 617
On prompt number 64 out of 617


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


On prompt number 72 out of 617
On prompt number 80 out of 617
On prompt number 88 out of 617
On prompt number 96 out of 617
On prompt number 104 out of 617
On prompt number 112 out of 617
On prompt number 120 out of 617
On prompt number 128 out of 617
On prompt number 136 out of 617
On prompt number 144 out of 617
On prompt number 152 out of 617
On prompt number 160 out of 617
On prompt number 168 out of 617
On prompt number 176 out of 617
On prompt number 184 out of 617
On prompt number 192 out of 617
On prompt number 200 out of 617
On prompt number 208 out of 617
On prompt number 216 out of 617
On prompt number 224 out of 617
On prompt number 232 out of 617
On prompt number 240 out of 617
On prompt number 248 out of 617
On prompt number 256 out of 617
On prompt number 264 out of 617
On prompt number 272 out of 617
On prompt number 280 out of 617
On prompt number 288 out of 617
On prompt number 296 out of 617
On prompt number 304 out of 617
On prompt number 312 out of 617
On prompt nu

# Evaluation of Responses

In [5]:
results_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/llama_fine_tune_1000_responses.csv")

In [6]:
# prompt: I need to compute the cosine similarity between the responses generated by my model and the responses from the original dataset

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'results_df' contains the model's responses and 'df' has the original responses
# and both dataframes have a common column like 'question' to link them
merged_df = pd.merge(results_df, test_df, right_on='Question', left_on='question')

# Create TF-IDF vectors
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(merged_df['response'].tolist() + merged_df['Answer'].tolist())

# Calculate cosine similarity
model_vectors = vectors[:len(merged_df)]
original_vectors = vectors[len(merged_df):]
cosine_similarities = cosine_similarity(model_vectors, original_vectors)

# Add cosine similarity to the DataFrame
merged_df['cosine_similarity'] = cosine_similarities.diagonal()

# Print or save the results
print(merged_df[['Question', 'response', 'Answer', 'cosine_similarity']])
#merged_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/llama_fine_tune_cosine_similarity.csv", index=False)

                                              Question  \
0    How does a confidence interval relate to hypot...   
1    How does random forests determine the importan...   
2                 What is a ReLU activation function?    
3              What is self-attention in transformers?   
4    How does k-means clustering work in prototype ...   
..                                                 ...   
632      What is precision in classification metrics?    
633                What is the Laplace approximation?    
634                    What is Rademacher complexity?    
635  How does the bias-variance tradeoff impact mod...   
636  What are 'weak learners' in the context of boo...   

                                              response  \
0    A confidence interval provides a range of valu...   
1    Random forests determine variable importance b...   
2    ReLU stands for Rectified Linear Unit, which o...   
3    Self-attention allows the model to weigh the i...   
4    K-means 

In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Replace with the path to your Hugging Face model/tokenizer
model_name = "dtbingh24/fine_tuning_ml_expert"

# Load your fine-tuned tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


# Assuming 'results_df' contains the model's responses and 'test_df' has the original responses
# and both dataframes have a common column like 'question' to link them
# merged_df = pd.merge(results_df, test_df, right_on='Question', left_on='question')

def get_embeddings(text_list):
    # Tokenize and get embeddings
    inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Extract the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings

# Get embeddings for model responses and original responses
model_embeddings = get_embeddings(merged_df['response'].tolist())
original_embeddings = get_embeddings(merged_df['Answer'].tolist())

# Calculate cosine similarity
cosine_similarities = cosine_similarity(model_embeddings.numpy(), original_embeddings.numpy())

# Add cosine similarity to the DataFrame
merged_df['cosine_similarity'] = cosine_similarities.diagonal()

# Print or save the results
print(merged_df[['Question', 'response', 'Answer', 'cosine_similarity']])
# merged_df.to_csv("/content/drive/MyDrive/Colab Notebooks/data/llama_fine_tune_cosine_similarity.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


NameError: name 'model' is not defined