In [1]:
!pip3 install datasets



## Log in to Hugging Face

In [8]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be written into the notebook source.
# It is read from the HF_TOKEN environment variable, with a hidden interactive
# prompt as a fallback. Any token that was ever committed in this cell must be
# considered leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token (write access): ")

login(
    token=hf_token,  # The token value itself (not its display name)
    new_session=False,  # Won't request token if one is already saved on machine
    write_permission=True,  # Requires a token with write permission
    add_to_git_credential=True,  # Also store the token in the git credential helper
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/solo/.cache/huggingface/token
Login successful


## Load the dataset

In [9]:
from datasets import load_dataset

In [10]:
# Read the local JSON Lines file with the "json" loader, asking for the
# "train" split so a single Dataset is returned.
data_files = "cot_fs_opt_train.jsonl"
flanv2_cot = load_dataset("json", data_files=data_files, split="train")
# Bare last expression: lets the notebook display the Dataset summary.
flanv2_cot

Dataset({
    features: ['inputs', 'targets', 'task'],
    num_rows: 149490
})

In [11]:
import pandas as pd
# Take every row (`[:]`) and build a pandas DataFrame from it. Note that the
# name `flanv2_cot` is rebound here: after this cell it refers to the DataFrame,
# not to the object returned by `load_dataset`.
flanv2_cot = pd.DataFrame(flanv2_cot[:])
# Preview the first rows.
flanv2_cot.head()

Unnamed: 0,inputs,targets,task
0,Q: Test for natural language inference. Premis...,A man must be outside to be bending down to lo...,cot
1,"Q: If ""A man and a little girl are sitting on ...",A man either walks down outside steps or is fl...,cot
2,q: What do you do to someone easily when you l...,Homeowner had a displeasure against the price ...,cot
3,"QUESTION: People have to eat and drink, breath...",Teachers usually teach at educational institut...,cot
4,Test for natural language inference. Premise: ...,A girl can either wear a black shirt or blue s...,cot


In [12]:
# Work on a copy of the frame without the 'task' column.
flanv2_cot_small = flanv2_cot.drop(columns=['task'])

In [13]:
# Preview the first three rows of the frame without the 'task' column.
flanv2_cot_small.head(3)

Unnamed: 0,inputs,targets
0,Q: Test for natural language inference. Premis...,A man must be outside to be bending down to lo...
1,"Q: If ""A man and a little girl are sitting on ...",A man either walks down outside steps or is fl...
2,q: What do you do to someone easily when you l...,Homeowner had a displeasure against the price ...


In [14]:
# (number of rows, number of columns) of the trimmed frame.
flanv2_cot_small.shape

(149490, 2)

## Keeping only rows whose target has at least 100 tokens (whitespace-separated words)

In [15]:
# Function to count tokens
def count_tokens(text):
    """Return the number of whitespace-separated tokens in ``text``.

    "Token" here means a run of non-whitespace characters (``str.split()``),
    not a model-tokenizer token.

    Parameters
    ----------
    text : str or missing value
        The text to measure. Anything that is not a ``str`` (for example
        ``None`` or a pandas ``NaN`` for a missing cell) counts as 0 tokens,
        so the function can be used with ``Series.apply`` on columns that
        contain missing values instead of raising ``AttributeError``.

    Returns
    -------
    int
        The token count; 0 for empty, whitespace-only or non-string input.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Keep the rows whose 'targets' text has a token count of 100 or more.
flav2_small_filtered = flanv2_cot_small.loc[
    flanv2_cot_small['targets'].map(count_tokens).ge(100)
]


In [18]:
# Keep only the first 899 rows (by position) of the filtered frame, then show its shape.
flav2_small_filtered = flav2_small_filtered.iloc[:899]
flav2_small_filtered.shape

(899, 2)

## Deduplicating targets whose TF-IDF cosine similarity is above 0.95

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to deduplicate responses
def deduplicate_responses(df, threshold=0.95):
    """Drop rows whose 'targets' text nearly duplicates an earlier row.

    The 'targets' column is vectorised with TF-IDF and every pair of rows is
    compared by cosine similarity. For each pair (i, j) with i before j and a
    similarity strictly greater than ``threshold``, the later row j is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'targets' column of strings. It is not modified.
    threshold : float, default 0.95
        Similarity above which the later row of a pair is treated as a duplicate.

    Returns
    -------
    pandas.DataFrame
        A new frame without the duplicate rows; the original index labels of
        the surviving rows are kept.
    """
    # Fewer than two rows means there is no pair to compare (and TF-IDF cannot
    # be fitted on an empty column).
    if len(df) < 2:
        return df.copy()

    # Operate on the frame that was passed in, not on a notebook-level global.
    tfidf = TfidfVectorizer().fit_transform(df['targets'])
    # cosine_similarity accepts the sparse TF-IDF matrix directly, so no dense
    # copy of the document-term matrix is needed.
    cosine_sim = cosine_similarity(tfidf)

    # Strict upper triangle (k=1) keeps only pairs with i < j; the column
    # positions of the hits are the later rows to remove.
    too_similar = np.triu(cosine_sim > threshold, k=1)
    positions_to_remove = np.unique(np.nonzero(too_similar)[1])

    # Positions are translated to index labels because the frame may have a
    # non-contiguous index after earlier filtering.
    return df.drop(df.index[positions_to_remove])

# Deduplicate the filtered frame using the default threshold (0.95).
deduplicated_flanv2 = deduplicate_responses(flav2_small_filtered)


## Saving into a CSV file

In [20]:
# Save the deduplicated dataset to a CSV file
# (index=False: the DataFrame's row labels are not written as a column).
deduplicated_flanv2.to_csv("flan_v2_cot_100tokens_deduplicated_v2.csv", index=False)

## Pushing my new dataset to the hub

In [21]:
from datasets import load_dataset, Dataset, DatasetDict

# Load the dataset
# (reads back the CSV file written above with the "csv" loader)
dataset = load_dataset('csv', data_files='flan_v2_cot_100tokens_deduplicated_v2.csv')

# Push the dataset to Hugging Face
# (target repository: abag91/flan_v2_cot_100tokens_deduplicated_v2)
dataset.push_to_hub("abag91/flan_v2_cot_100tokens_deduplicated_v2")


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/abag91/flan_v2_cot_100tokens_deduplicated_v2/commit/f49e56407cd98a9cd02f58660b68a516b2161f99', commit_message='Upload dataset', commit_description='', oid='f49e56407cd98a9cd02f58660b68a516b2161f99', pr_url=None, pr_revision=None, pr_num=None)

#### Link to my new [dataset](https://huggingface.co/datasets/abag91/flan_v2_cot_100tokens_deduplicated_v2).

In [66]:
!pip3 install torch

Collecting torch
  Downloading torch-2.2.2-cp38-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting sympy (from torch)
  Using cached sympy-1.12.1-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting mpmath<1.4.0,>=1.1.0 (from sympy->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.2.2-cp38-none-macosx_10_9_x86_64.whl (150.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.6/150.6 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached sympy-1.12.1-py3-none-any.whl (5.7 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, networkx, torch
Successfully installed mpmath-1.3.0 

## Now we will fine-tune Llama 2 on the dataset

In [6]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

BASE_MODEL = "meta-llama/Llama-2-7b-hf"
MAX_SEQ_LENGTH = 512  # truncation cap per example; tune to the available memory
SEED = 42  # makes the train/eval split reproducible

# Load the dataset from Hugging Face.
# This is the "_v2" repository that this notebook pushes; it was built from a
# single CSV file, so it only has a "train" split, with the columns "inputs"
# and "targets".
dataset = load_dataset("abag91/flan_v2_cot_100tokens_deduplicated_v2")

# There is no "test" split in the repository, so hold out 10% for evaluation.
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=SEED)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    # The Llama 2 tokenizer defines no padding token; reuse EOS so that
    # batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch as causal-LM text: prompt, newline, answer, EOS.

    ``examples`` is a batch dict with the list-valued columns "inputs" and
    "targets". Sequences are truncated to ``MAX_SEQ_LENGTH`` tokens and are NOT
    padded here; padding is applied per batch by the data collator.
    """
    texts = [
        f"{prompt}\n{answer}{tokenizer.eos_token}"
        for prompt, answer in zip(examples["inputs"], examples["targets"])
    ]
    return tokenizer(texts, truncation=True, max_length=MAX_SEQ_LENGTH)


tokenized_dataset = split_dataset.map(
    tokenize_function,
    batched=True,
    # Drop the raw text columns so only tokenizer outputs reach the collator.
    remove_columns=split_dataset["train"].column_names,
)

# With mlm=False the collator pads each batch and copies input_ids into
# "labels" (padding positions are ignored by the loss), which the Trainer needs
# to compute the causal language-modelling loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,  # lets the Trainer save the tokenizer with checkpoints
)

# Train the model
trainer.train()

## Save the trained model

In [7]:
# Directory that receives both the fine-tuned weights and the tokenizer files,
# so the pair can later be reloaded with `from_pretrained(output_dir)`.
output_dir = "./fine_tuned_model"
trainer.model.save_pretrained(output_dir)
# Save through the notebook's `tokenizer` object: `trainer.tokenizer` is only
# set when a tokenizer was handed to the Trainer, and is None otherwise.
tokenizer.save_pretrained(output_dir)

## Evaluate the model performance

In [None]:
# Load the fine-tuned model
# The path must be the directory the fine-tuned model and tokenizer were saved
# to ("./fine_tuned_model" in the save step); a placeholder string would be
# looked up as a Hub repository id and fail.
model_name = "./fine_tuned_model"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load evaluation dataset: rows 300-499 (by position) of the un-filtered frame.
# NOTE(review): this is a pandas DataFrame slice, not a tokenized Dataset, and
# some of these rows may also be in the training data — confirm before
# reporting scores.
eval_dataset = flanv2_cot_small[300:500]

In [None]:
from datasets import load_metric

# Load the metric(s)
# NOTE(review): `load_metric` is reported as deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before relying on it.
bleu = load_metric("bleu")
rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute BLEU and ROUGE between decoded predictions and decoded labels.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)``. ``predictions`` may be token ids of shape
        (batch, seq), raw logits of shape (batch, seq, vocab), or a tuple whose
        first element is one of those. ``labels`` holds token ids in which -100
        marks positions to ignore.

    Returns
    -------
    dict
        ``{"bleu": <result of bleu.compute>, "rouge": <result of rouge.compute>}``.

    Relies on the notebook-level ``tokenizer``, ``bleu`` and ``rouge`` objects.
    """
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the logits.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits cannot be decoded; reduce them to the most likely token id.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is not a valid token id, so replace it before decoding; the
    # replacement token is a special token and is skipped by batch_decode.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    labels = np.where(labels != -100, labels, fill_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU
    # The "bleu" metric takes tokenized text: one token list per prediction and
    # a list of reference token lists per example.
    bleu_score = bleu.compute(
        predictions=[pred.split() for pred in decoded_preds],
        references=[[label.split()] for label in decoded_labels],
    )

    # Compute ROUGE (takes plain strings)
    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": bleu_score, "rouge": rouge_score}
