# Utils

In [None]:
# Report the NVIDIA GPU status for this runtime (shell escape).
!nvidia-smi

# Install requirements

Run this cell if you are using a Kaggle notebook (pay attention to the pinned `torch` version!).

In [None]:
%%capture
# Install the exact package versions this notebook uses.
# %%capture (first line of the cell) captures the cell's output, so installer
# logs are not shown; remove it temporarily when debugging a failed install.
%pip install -U pip
# Core deep-learning stack.
%pip install torch==2.3.1
%pip install transformers==4.42.3
%pip install datasets==2.20.0
%pip install accelerate==0.31.0
# Utilities, data handling and plotting.
%pip install colored==2.2.4
%pip install openpyxl==3.1.5
%pip install matplotlib==3.9.1
%pip install scikit-learn==1.5.1
%pip install seaborn==0.13.2
%pip install tensorboard==2.17.0
# Quantization and fine-tuning libraries.
%pip install bitsandbytes==0.43.1
%pip install peft==0.11.1
%pip install trl==0.9.4

# Load model and tokenizer

Log in to Hugging Face (this is required to download the fine-tuned model).

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with Hugging Face before the model
# download in the following cell.
notebook_login()

In [None]:
import torch
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM
) 

# Hugging Face repository id of the fine-tuned checkpoint.
model_path = "DG266/Llama-3-8B-Instruct-Refair-FAIRWAY"

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The tokenizer is loaded from the same repository as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Quantized model; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config,
    # attn_implementation="flash_attention_2",  # left disabled, as in the original
)

# Generate User Stories

We will consider all 34 domains.

In [None]:
# Application domains for which user stories are generated (alphabetical).
domains = [
    "Biology",
    "Cardiology",
    "Computer Networks",
    "Computer Vision",
    "Demography",
    "Dermatology",
    "Economics",
    "Education",
    "Endocrinology",
    "Finance & Marketing",
    "Health",
    "Information Systems",
    "Law",
    "Library",
    "Linguistics",
    "Literature",
    "Medicine",
    "Movies",
    "Music",
    "Nephrology",
    "News",
    "Pediatrics",
    "Pharmacology",
    "Plant Science",
    "Political Science",
    "Psychology",
    "Radiology",
    "Social Media",
    "Social Networks",
    "Social Work",
    "Sociology",
    "Sport",
    "Transportation",
    "Urban Studies",
]
# Display the number of domains as the cell output.
len(domains)

...and these 20 machine learning tasks.

In [None]:
# Machine learning techniques/tasks paired with every domain.
tasks = [
    "adversarial learning",
    "cnn",
    "conversational agent",
    "decision tree",
    "document classification",
    "entity extraction",
    "feature selection",
    "imbalanced dataset",
    "keyword extraction",
    "k-nearest neighbor",
    "multi-label classification",
    "neural network",
    "random forest",
    "semantic similarity",
    "sentiment analysis",
    "speech to text",
    "text categorization",
    "unsupervised clustering",
    "voice recognition",
    "word embedding",
]
# Display the number of tasks as the cell output.
len(tasks)

Setup text generation.

In [None]:
from transformers import pipeline

# Generation options forwarded to the pipeline: a cap of 128 newly generated
# tokens, and return_full_text disabled.
generation_settings = {
    "max_new_tokens": 128,
    "return_full_text": False,
}

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

In [None]:
def create_prompt(task, domain):
    """Build the chat-formatted prompt that asks for a user story.

    Args:
        task: Machine learning technique to mention in the request.
        domain: Application domain the user story should target.

    Returns:
        The result of ``tokenizer.apply_chat_template`` applied to a
        system + user conversation, with ``tokenize=False`` and
        ``add_generation_prompt=True``.
    """
    # The wording is kept exactly as-is: changing it changes what the model sees.
    request = (
        f"Considering the following machine learning technique: {task} in the field of machine learning. "
        f"Can you provide me with a specific user story for the following application domain? {domain}"
    )
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": request},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

Let's generate user stories.

In [None]:
# Topic id -> (cluster label, member domains).
# A lookup table replaces the original if/elif chain, which had no `else`:
# a domain missing from every branch would silently reuse the cluster/topic
# of the previously processed domain (or raise NameError on the first one).
topic_clusters = {
    1: ("Biology & Botanic", ["Biology", "Plant Science"]),
    2: ("Economy & Marketing", ["Economics", "Finance & Marketing"]),
    3: ("Information Systems & News", ["Information Systems", "News"]),
    4: ("Law & Politics", ["Law", "Political Science"]),
    5: ("Literature & Linguistics", ["Library", "Linguistics", "Literature"]),
    6: (
        "Medicine & Health",
        [
            "Cardiology", "Dermatology", "Endocrinology", "Health", "Medicine",
            "Nephrology", "Pediatrics", "Pharmacology", "Psychology", "Radiology",
        ],
    ),
    7: (
        "Social and Urban Studies",
        [
            "Demography", "Education", "Social Media", "Social Networks",
            "Social Work", "Sociology", "Transportation", "Urban Studies",
        ],
    ),
    8: ("Sport & Entertainment", ["Movies", "Music", "Sport"]),
    9: ("Technical Domains", ["Computer Networks", "Computer Vision"]),
}

# Flatten to domain -> (cluster label, topic id) for direct lookup.
domain_to_cluster = {
    member: (cluster_label, topic_id)
    for topic_id, (cluster_label, members) in topic_clusters.items()
    for member in members
}

# Fail fast, before any (slow) generation, if a domain has no cluster.
unmapped_domains = [domain for domain in domains if domain not in domain_to_cluster]
if unmapped_domains:
    raise ValueError(f"No domain cluster defined for: {unmapped_domains}")

# One generation per (domain, task) pair; each result becomes one output row.
rows = []
for domain in domains:
    cluster, topic = domain_to_cluster[domain]

    for task in tasks:
        prompt = create_prompt(task, domain)
        outputs = pipe(prompt)
        rows.append(
            {
                "Domain Cluster": cluster,
                "Topic": topic,
                "Domain": domain,
                "Machine Learning Task": task,
                # The first returned candidate's text is stored as the story.
                "User Story": outputs[0]["generated_text"],
            }
        )

In [None]:
import pandas as pd

# Build the table from the row dicts collected by the generation loop
# (one row per domain/task pair).
generated_user_stories_df = pd.DataFrame(rows)

In [None]:
# Preview the first rows of the generated user stories.
generated_user_stories_df.head()

Save everything in a .csv file.

In [None]:
# index=False: do not write the DataFrame's row index as an extra CSV column.
# (`index` is a boolean option; the original `None` only worked by being falsy.)
generated_user_stories_df.to_csv("llama3_finetuned_user_stories.csv", index=False)