# Installing dependencies

In [1]:
# we use the latest version of transformers, peft, and accelerate
!pip install -q accelerate peft transformers

# install bitsandbytes for quantization
!pip install -q bitsandbytes

# install trl for the SFT library
!pip install -q trl

# we need sentencepiece for the llama2 slow tokenizer
!pip install sentencepiece

# we need einops, used by falcon-7b, llama-2 etc
# einops (einsteinops) is used to simplify tensorops by making them readable
!pip install -q -U einops

# we need to install datasets for our training dataset
!pip install -q datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m

# Loading finetuned models and test dataset

# Testing the fine-tuned Mistral 7B model

Inference with mistral7b model finetuned on full dataset

In [2]:
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
# Hugging Face Hub repository of the fine-tuned checkpoint; the tokenizer is
# loaded from the same repository.
model_name = "ayman56/mistral7b_finetuned_full_stackoverflow_test"
tokenizer_name  = "ayman56/mistral7b_finetuned_full_stackoverflow_test"

# 4-bit quantization settings: "nf4" quantization type, float16 compute dtype,
# double quantization disabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the fine-tuned checkpoint with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu (the whole model is mapped to device index 0)
    device_map={"": 0}
)

# This notebook only runs inference. `use_cache = False` is a training-time
# setting; for generation the key/value cache should stay enabled so earlier
# tokens are not recomputed at every decoding step.
model.config.use_cache = True

# Load the tokenizer from the same repository as the model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


2024-04-13 19:58:48.022422: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-13 19:58:48.084905: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [12]:
# Count of non-null entries in the `question` column of `df`.
# NOTE(review): `df` is not defined in any cell shown in this notebook, so this
# cell fails on a fresh kernel - presumably it is the full dataset loaded in a
# since-removed cell; confirm and add the load step (or drop this cell).
df.question.count()

3296

In [21]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm  # Import tqdm for progress tracking

# Define a function to perform inference on a single question
def infer_question(prompt, max_length=500):
    """Generate text for a single question with the currently loaded model.

    Args:
        prompt: Question text. It is wrapped as "[INST] {prompt} [/INST]"
            before being sent to the text-generation pipeline.
        max_length: Value passed as the pipeline's `max_length` argument
            (default 500, the value that used to be hard-coded).
            NOTE(review): per the transformers generation docs this limit
            also counts the prompt tokens - confirm that long questions
            still leave room for an answer.

    Returns:
        The `generated_text` field of the first pipeline result.
    """
    # Building a pipeline for every question is loop-invariant work. Keep the
    # last one on the function object and rebuild it only when the global
    # `model` / `tokenizer` objects or `max_length` have changed.
    cached = getattr(infer_question, "_cached", None)
    if (
        cached is None
        or cached[0] is not model
        or cached[1] is not tokenizer
        or cached[2] != max_length
    ):
        generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
        cached = (model, tokenizer, max_length, generator)
        infer_question._cached = cached
    result = cached[3](f"[INST] {prompt} [/INST]")
    return result[0]['generated_text']

# Read the evaluation questions and their reference answers
test_set = pd.read_csv('test_set.csv')

# Reference columns copied out as plain lists, in file order
original_questions = list(test_set['question'])
original_answers = list(test_set['answer'])

# Run the model on each question in order; tqdm reports progress
generated_prompts = [
    infer_question(question)
    for question in tqdm(original_questions, total=len(test_set), desc="Generating Prompts")
]

# One row per test question: model output next to the original question/answer
generated_prompts_dataset = pd.DataFrame(
    {
        'Generated_Prompt': generated_prompts,
        'Original_Question': original_questions,
        'Original_Answer': original_answers,
    }
)

# Write the results to CSV for later evaluation
generated_prompts_dataset.to_csv('generated_Mistral7b_finetuned.csv', index=False)


Generating Prompts:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   1%|          | 1/100 [00:07<12:06,  7.34s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   2%|▏         | 2/100 [00:21<18:58, 11.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   3%|▎         | 3/100 [00:59<38:01, 23.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   4%|▍         | 4/100 [01:30<42:07, 26.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   5%|▌         | 5/100 [01:58<42:38, 26.93s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   6%|▌         | 6/100 [02:32<45:59, 29.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   7%|▋         | 7/100 [03:03<46:21, 29.91s

In [29]:
# Show the generated text stored for the second test question (position 1)
generated_prompts_dataset['Generated_Prompt'].iloc[1]

'[INST] I\'m looking for a library in Python which will provide at and cron like functionality.\nI\'d quite like have a pure Python solution, rather than relying on tools installed on the box; this way I run on machines with no cron.\nFor those unfamiliar with cron: you can schedule tasks based upon an expression like: \n 0 2 * * 7 /usr/bin/run-backup # run the backups at 0200 on Every Sunday\n 0 9-17/2 * * 1-5 /usr/bin/purge-temps # run the purge temps command, every 2 hours between 9am and 5pm on Mondays to Fridays.\n\nThe cron time expression syntax is less important, but I would like to have something with this sort of flexibility. \nIf there isn\'t something that does this for me out-the-box, any suggestions for the building blocks to make something like this would be gratefully received.\nEdit\nI\'m not interested in launching processes, just "jobs" also written in Python - python functions. By necessity I think this would be a different thread, but not in a different process.\nT

Inference with mistral7b model finetuned on small dataset

In [27]:
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
# Hugging Face Hub repository of the fine-tuned checkpoint; the tokenizer is
# loaded from the same repository.
model_name = "ayman56/mistral7b_finetuned_60_stackoverflow_test"
tokenizer_name  = "ayman56/mistral7b_finetuned_60_stackoverflow_test"

# 4-bit quantization settings: "nf4" quantization type, float16 compute dtype,
# double quantization disabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the fine-tuned checkpoint with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu (the whole model is mapped to device index 0)
    device_map={"": 0}
)

# This notebook only runs inference. `use_cache = False` is a training-time
# setting; for generation the key/value cache should stay enabled so earlier
# tokens are not recomputed at every decoding step.
model.config.use_cache = True

# Load the tokenizer from the same repository as the model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm  # Import tqdm for progress tracking

# Define a function to perform inference on a single question
def infer_question(prompt, max_length=1000):
    """Generate text for a single question with the currently loaded model.

    Args:
        prompt: Question text. It is wrapped as "[INST] {prompt} [/INST]"
            before being sent to the text-generation pipeline.
        max_length: Value passed as the pipeline's `max_length` argument
            (default 1000, the value that used to be hard-coded).
            NOTE(review): per the transformers generation docs this limit
            also counts the prompt tokens - confirm that long questions
            still leave room for an answer.

    Returns:
        The `generated_text` field of the first pipeline result.
    """
    # Building a pipeline for every question is loop-invariant work. Keep the
    # last one on the function object and rebuild it only when the global
    # `model` / `tokenizer` objects or `max_length` have changed.
    cached = getattr(infer_question, "_cached", None)
    if (
        cached is None
        or cached[0] is not model
        or cached[1] is not tokenizer
        or cached[2] != max_length
    ):
        generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
        cached = (model, tokenizer, max_length, generator)
        infer_question._cached = cached
    result = cached[3](f"[INST] {prompt} [/INST]")
    return result[0]['generated_text']

# Read the evaluation questions and their reference answers
test_set = pd.read_csv('test_set.csv')

# Reference columns copied out as plain lists, in file order
original_questions = list(test_set['question'])
original_answers = list(test_set['answer'])

# Run the model on each question in order; tqdm reports progress
generated_prompts = [
    infer_question(question)
    for question in tqdm(original_questions, total=len(test_set), desc="Generating Prompts")
]

# One row per test question: model output next to the original question/answer
generated_prompts_dataset = pd.DataFrame(
    {
        'Generated_Prompt': generated_prompts,
        'Original_Question': original_questions,
        'Original_Answer': original_answers,
    }
)

# Write the results to CSV for later evaluation
generated_prompts_dataset.to_csv('generated_prompts_finetune60_dataset.csv', index=False)


Generating Prompts:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   1%|          | 1/100 [00:38<1:03:43, 38.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   2%|▏         | 2/100 [01:23<1:08:49, 42.14s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   3%|▎         | 3/100 [02:26<1:23:50, 51.86s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   4%|▍         | 4/100 [03:27<1:28:33, 55.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   5%|▌         | 5/100 [04:28<1:30:59, 57.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   6%|▌         | 6/100 [05:32<1:33:21, 59.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   7%|▋         | 7/100 [06:35<1

# Testing the fine-tuned Llama 2 model

In [10]:
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
# Hugging Face Hub repository of the fine-tuned checkpoint; the tokenizer is
# loaded from the same repository.
model_name = "ayman56/llama2_finetuned_full_stackoverflow_test"
tokenizer_name  = "ayman56/llama2_finetuned_full_stackoverflow_test"

# 4-bit quantization settings: "nf4" quantization type, float16 compute dtype,
# double quantization disabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the fine-tuned checkpoint with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu (the whole model is mapped to device index 0)
    device_map={"": 0}
)

# This notebook only runs inference. `use_cache = False` is a training-time
# setting; for generation the key/value cache should stay enabled so earlier
# tokens are not recomputed at every decoding step.
model.config.use_cache = True

# Load the tokenizer from the same repository as the model (llama2)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [14]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm  # Import tqdm for progress tracking

# Define a function to perform inference on a single question
def infer_question(prompt, max_length=1000):
    """Generate text for a single question with the currently loaded model.

    Args:
        prompt: Question text. It is sent to the text-generation pipeline
            surrounded by single spaces, with no instruction markers.
            NOTE(review): the other inference cells in this notebook wrap
            the question in "[INST] ... [/INST]" - confirm that the bare
            format is intended for this model.
        max_length: Value passed as the pipeline's `max_length` argument
            (default 1000, the value that used to be hard-coded).
            NOTE(review): per the transformers generation docs this limit
            also counts the prompt tokens - confirm that long questions
            still leave room for an answer.

    Returns:
        The `generated_text` field of the first pipeline result.
    """
    # Building a pipeline for every question is loop-invariant work. Keep the
    # last one on the function object and rebuild it only when the global
    # `model` / `tokenizer` objects or `max_length` have changed.
    cached = getattr(infer_question, "_cached", None)
    if (
        cached is None
        or cached[0] is not model
        or cached[1] is not tokenizer
        or cached[2] != max_length
    ):
        generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
        cached = (model, tokenizer, max_length, generator)
        infer_question._cached = cached
    result = cached[3](f" {prompt} ")
    return result[0]['generated_text']

# Read the evaluation questions and their reference answers
test_set = pd.read_csv('test_set.csv')

# Reference columns copied out as plain lists, in file order
original_questions = list(test_set['question'])
original_answers = list(test_set['answer'])

# Run the model on each question in order; tqdm reports progress
generated_prompts = [
    infer_question(question)
    for question in tqdm(original_questions, total=len(test_set), desc="Generating Prompts")
]

# One row per test question: model output next to the original question/answer
generated_prompts_dataset = pd.DataFrame(
    {
        'Generated_Prompt': generated_prompts,
        'Original_Question': original_questions,
        'Original_Answer': original_answers,
    }
)

# Write the results to CSV for later evaluation
generated_prompts_dataset.to_csv('generated_prompts_Llama2_dataset.csv', index=False)


Generating Prompts: 100%|██████████| 100/100 [1:20:28<00:00, 48.28s/it]


# Inference of the base model Mistral 7b

In [5]:
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
# Hugging Face Hub repository of the base (not fine-tuned here) instruct
# model; the tokenizer is loaded from the same repository.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer_name  = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit quantization settings: "nf4" quantization type, float16 compute dtype,
# double quantization disabled
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu (the whole model is mapped to device index 0)
    device_map={"": 0}
)

# This notebook only runs inference. `use_cache = False` is a training-time
# setting; for generation the key/value cache should stay enabled so earlier
# tokens are not recomputed at every decoding step.
model.config.use_cache = True

# Load the tokenizer from the same repository as the model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm  # Import tqdm for progress tracking

# Define a function to perform inference on a single question
def infer_question(prompt, max_length=500):
    """Generate text for a single question with the currently loaded model.

    Args:
        prompt: Question text. It is wrapped as "[INST] {prompt} [/INST]"
            before being sent to the text-generation pipeline.
        max_length: Value passed as the pipeline's `max_length` argument
            (default 500, the value that used to be hard-coded).
            NOTE(review): per the transformers generation docs this limit
            also counts the prompt tokens - confirm that long questions
            still leave room for an answer.

    Returns:
        The `generated_text` field of the first pipeline result.
    """
    # Building a pipeline for every question is loop-invariant work. Keep the
    # last one on the function object and rebuild it only when the global
    # `model` / `tokenizer` objects or `max_length` have changed.
    cached = getattr(infer_question, "_cached", None)
    if (
        cached is None
        or cached[0] is not model
        or cached[1] is not tokenizer
        or cached[2] != max_length
    ):
        generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
        cached = (model, tokenizer, max_length, generator)
        infer_question._cached = cached
    result = cached[3](f"[INST] {prompt} [/INST]")
    return result[0]['generated_text']

# Read the evaluation questions and their reference answers
test_set = pd.read_csv('test_set.csv')

# Reference columns copied out as plain lists, in file order
original_questions = list(test_set['question'])
original_answers = list(test_set['answer'])

# Run the model on each question in order; tqdm reports progress
generated_prompts = [
    infer_question(question)
    for question in tqdm(original_questions, total=len(test_set), desc="Generating Prompts")
]

# One row per test question: model output next to the original question/answer
generated_prompts_dataset = pd.DataFrame(
    {
        'Generated_Prompt': generated_prompts,
        'Original_Question': original_questions,
        'Original_Answer': original_answers,
    }
)

# Write the results to CSV for later evaluation
generated_prompts_dataset.to_csv('generated_prompts_baseMistral_7b.csv', index=False)


Generating Prompts:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   1%|          | 1/100 [00:06<10:18,  6.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   2%|▏         | 2/100 [00:15<12:44,  7.80s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   3%|▎         | 3/100 [00:30<18:19, 11.34s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   4%|▍         | 4/100 [00:46<20:45, 12.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   5%|▌         | 5/100 [00:58<20:27, 12.92s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   6%|▌         | 6/100 [01:14<21:44, 13.88s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating Prompts:   7%|▋         | 7/100 [01:26<20:17, 13.09s

All the generated scripts are saved to CSV files and will be used for evaluation in the Evaluation notebooks.