In [1]:
%%capture

# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
%%capture
!pip install mlflow pyngrok

In [3]:
import mlflow

# Set up MLflow experiment (optional, if not already set)
mlflow.set_tracking_uri("file:///kaggle/working/mlruns")  # Saves runs in /kaggle/working/mlruns
mlflow.set_experiment("Classification_Finetuining_Experiment")
mlflow.start_run(run_name="Llama-3.2") # write here your model name


2024/11/17 18:18:08 INFO mlflow.tracking.fluent: Experiment with name 'Classification_Finetuining_Experiment' does not exist. Creating a new experiment.


<ActiveRun: >

In [4]:
mlflow.end_run()

In [5]:
from pyngrok import ngrok
ngrok.set_auth_token("2orh5SJMd1vIGXN23P3WVlO0Dqx_6N7mnS8ZbXGsgFbwuFtv5")

# Start MLflow UI
get_ipython().system_raw("mlflow ui --port 5000 &")
# Expose the MLflow UI on port 5000# Expose the MLflow UI on port 5000
public_url = ngrok.connect("5000", "http")
print(f"MLflow UI accessible at: {public_url}")


MLflow UI accessible at: NgrokTunnel: "https://77da-34-168-186-70.ngrok-free.app" -> "http://localhost:5000"


In [6]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length # Add LoRA adapters so we only need to update 1 to 10% of all parameters!
= max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


[2024-11-17 18:18:13 +0000] [188] [INFO] Starting gunicorn 23.0.0
[2024-11-17 18:18:13 +0000] [188] [INFO] Listening at: http://127.0.0.1:5000 (188)
[2024-11-17 18:18:13 +0000] [188] [INFO] Using worker: sync
[2024-11-17 18:18:13 +0000] [190] [INFO] Booting worker with pid: 190
[2024-11-17 18:18:13 +0000] [191] [INFO] Booting worker with pid: 191
[2024-11-17 18:18:13 +0000] [192] [INFO] Booting worker with pid: 192
[2024-11-17 18:18:13 +0000] [193] [INFO] Booting worker with pid: 193


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

# Add LoRA adapters so we only need to update 1 to 10% of all parameters!


In [7]:
# Create a dictionary of parameters to log
model_params = {
    "Llama_model_name": "unsloth/Llama-3.2-3B-bnb-4bit",
    "Llama_max_seq_length": max_seq_length,
    "Llama_dtype": dtype,
    "Llama_load_in_4bit": load_in_4bit,
}

# Log all parameters at once
mlflow.log_params(model_params)

In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj","lm_head"],
    lora_alpha = 16,
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = True, # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.7 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM


In [9]:
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True
tokenizer.pad_token = tokenizer.eos_token

In [10]:
model_params = {
    "LoRA_r": 64,
    "LoRA_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    "LoRA_lora_alpha": 16,
    "LoRA_lora_dropout": 0.1,
    "LoRA_bias": "none",
    "LoRA_use_gradient_checkpointing": True,
    "LoRA_random_state": 3407,
    "LoRA_use_rslora": False,
    "LoRA_loftq_config": None,
}

# Log model parameters
mlflow.log_params(model_params)

# Define tokenizer settings for logging
tokenizer_settings = {
    "tokenizer_padding_side": "right",
    "tokenizer_add_eos_token": True,
    "tokenizerpad_token": tokenizer.eos_token,
}

# Log tokenizer settings
mlflow.log_params(tokenizer_settings)

# Data Preparation

In [11]:
import pandas as pd
classification_train = pd.read_csv('/kaggle/input/train-dataset/Classification_train.csv')

# Take a 10% sample for the test dataset
classification_test = classification_train.sample(frac=0.1, random_state=42)

# Drop the sampled rows from the original dataset to get the remaining 90%
classification_train = classification_train.drop(classification_test.index)

# Save the test and updated train datasets
classification_test.to_csv('classification_test.csv', index=False)
classification_train.to_csv('classification_train.csv', index=False)

print("10% sample saved as classification_test.csv and removed from classification_train.csv")

10% sample saved as classification_test.csv and removed from classification_train.csv


In [12]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='/kaggle/working/classification_train.csv', split='train')
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters'],
    num_rows: 727
})

In [13]:
train_valid_split = dataset.train_test_split(test_size=0.1)

In [14]:
train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']

In [15]:
EOS_TOKEN = tokenizer.eos_token

train_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.
The best algorithm name should be one of this search space algorithms: AdaboostRegressor, ElasticNetRegressor,  ExtraTreesRegressor,  LassoRegressor,  LightgbmRegressor, SVR, GaussianProcessRegressor, RandomForestRegressor or  XGBoostRegressor.

### DESCRIPTION:
{}

### RESPONSE:
{}"""



def formatting_prompts_func(examples):
    inputs       = examples["series_description"]
    outputs      = examples["algorithm"]
    texts = []
    for input, output in zip( inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = train_prompt.format( input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }


def formatting_test_prompts_func(examples):
    inputs = examples["series_description"]
    texts = []

    for input in inputs:
        text = test_prompt.format(input) 
        texts.append(text)

    return { "text": texts }


In [16]:
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

In [17]:
train_dataset

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 654
})

In [18]:
valid_dataset = valid_dataset.map(formatting_prompts_func, batched = True)
valid_dataset

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 73
})

In [19]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_arguments= TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 32//4,
        gradient_checkpointing=True,
        warmup_steps = 5,
        max_steps = -1, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "./mistral_outputs",
        evaluation_strategy="steps", #epoch
        save_strategy="epoch",
    )



In [20]:
from trl import  DataCollatorForCompletionOnlyLM

instruction_template="DESCRIPTION:"
response_template = "RESPONSE:"

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_arguments,
    # data_collator =  DataCollatorForCompletionOnlyLM(instruction_template=instruction_template,
    #                                                  response_template=response_template,
    #                                                  tokenizer=tokenizer,mlm=False),

)

Map (num_proc=2):   0%|          | 0/654 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/73 [00:00<?, ? examples/s]

In [21]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.145 GB of memory reserved.


In [22]:
import time
start= time.time()
trainer_stats = trainer.train()
print((time.time()-start)/60)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 654 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 20
 "-____-"     Number of trainable parameters = 491,257,856
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113604322220756, max=1.0…

Step,Training Loss,Validation Loss
1,2.4671,2.461712
2,2.4489,2.368804
3,2.3721,2.167445
4,2.1486,1.898919
5,1.895,1.61252
6,1.5959,1.327621
7,1.3312,1.102626
8,1.0672,0.917982
9,0.8919,0.788918
10,0.7816,0.674701


18.153227162361144


In [23]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)# save tuned model

#To save the final model as LoRA adapters
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1086.3193 seconds used for training.
18.11 minutes used for training.
Peak reserved memory = 12.436 GB.
Peak reserved memory for training = 7.291 GB.
Peak reserved memory % of max memory = 84.363 %.
Peak reserved memory for training % of max memory = 49.461 %.


In [29]:
test_prompt = """Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.

The best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.



### DESCRIPTION:

{}



### RESPONSE:

"""


valid_dataset = test_datasetalid_dataset.map(formatting_test_prompts_func, batched = True)

valid_dataset

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

Dataset({
    features: ['dataset_name', 'series_description', 'algorithm', 'hyperparameters', 'text'],
    num_rows: 73
})

In [31]:
valid_dataset['text'][0]

'Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate classification time-series dataset consists of 7121 samples and 16 features with 16 numerical and 0 categorical features. Each instance has a window length of 24. The dataset has a sampling rate of 60.0 minutes. The dataset has a missing values percentage of 0.0%. The missing values percentages for numerical features range from 0 to 0 with mean 0.00 and standard deviation 0.00.\n The target column has 3 classes with entropy value 1.37 showing a Unbalanced dataset. Among the 7121 samples the target ground-truth class has changed 1162 times representing a 

In [33]:
valid_dataset['algorithm'][0]

'AdaboostClassifier'

In [34]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(

[valid_dataset['text'][0]], return_tensors = "pt").to("cuda")


outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate classification time-series dataset consists of 7121 samples and 16 features with 16 numerical and 0 categorical features. Each instance has a window length of 24. The dataset has a sampling rate of 60.0 minutes. The dataset has a missing values percentage of 0.0%. The missing values percentages for numerical features range from 0 to 0 with mean 0.00 and standard deviation 0.00.\n The target column has 3 classes with entropy value 1.37 showing a Unbalanced dataset. Among the 7121 samples the target ground-truth class has changed 1162 tim

In [35]:
test_responses=[]

# get all test data inference result

for test_prompt in valid_dataset['text']:

  inputs= tokenizer(

  [test_prompt], return_tensors = "pt").to("cuda")



  outputs = model.generate(**inputs, max_new_tokens = 10, use_cache = True)

  test_responses.append(tokenizer.batch_decode(outputs))


In [37]:
df = valid_dataset.to_pandas()
df['model_responses']= test_responses
df.head()

Unnamed: 0,dataset_name,series_description,algorithm,hyperparameters,text,model_responses
0,1031-8-2-1-2-classification.csv,A multivariate classification time-series data...,AdaboostClassifier,{'estimator': DecisionTreeClassifier(max_depth...,Below is a description for a time series data....,[<|begin_of_text|>Below is a description for a...
1,1031-40-2-1-5-classification.csv,A multivariate classification time-series data...,XGBoostClassifier,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",Below is a description for a time series data....,[<|begin_of_text|>Below is a description for a...
2,1030-503-classification.csv,A multivariate classification time-series data...,ElasticNetClassifier,"{'C': 1000.0, 'l1_ratio': 0.0001, 'penalty': '...",Below is a description for a time series data....,[<|begin_of_text|>Below is a description for a...
3,1028-32-classification.csv,A multivariate classification time-series data...,XGBoostClassifier,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",Below is a description for a time series data....,[<|begin_of_text|>Below is a description for a...
4,1031-21-2-1-3-classification.csv,A multivariate classification time-series data...,AdaboostClassifier,{'estimator': DecisionTreeClassifier(max_depth...,Below is a description for a time series data....,[<|begin_of_text|>Below is a description for a...


In [38]:
from difflib import get_close_matches

# List of valid algorithm names
valid_algorithms = [
    'GaussianProcessClassifier',
    'ElasticNetClassifier',
    'LassoClassifier',
    'AdaboostClassifier',
    'XGBoostClassifier',
    'RandomForestClassifier',
    'SVC',
    'LightgbmClassifier'
]

# Ensure you are comparing the predicted algorithm names with the actual algorithm names
predictions = []

for response in test_responses:
    try:
        # Extracting the predicted algorithm from the response text
        if '### RESPONSE:' in response[0]:
            # Split at "### RESPONSE:" to isolate the relevant part
            response_text = response[0].split('### RESPONSE:')[1]
            # Remove unwanted tokens like '</s>' and extra newlines
            response_text = response_text.replace('</s>', '').strip()
            # Extract the first valid word (algorithm name)
            predicted_algo = response_text.split()[0]

            # Validate and correct the predicted algorithm name
            predicted_algo = get_close_matches(predicted_algo, valid_algorithms, n=1, cutoff=0.5)
            predicted_algo = predicted_algo[0] if predicted_algo else ""  # Take closest match or empty
        else:
            predicted_algo = ""  # Handle cases where "### RESPONSE:" is missing

        predictions.append(predicted_algo)
    except Exception as e:
        print(f"Error parsing response: {response}, Error: {e}")
        predictions.append("")  # Append an empty string for invalid responses

# Convert actual_data to a list if it's a DataFrame column
actual_data = df['algorithm'].tolist()

# Debugging Outputs
outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True)
decoded_output = tokenizer.batch_decode(outputs)
print("\nDecoded Output:", decoded_output)  # Debug decoded output

# Compute the accuracy by comparing the predictions to the actual algorithm names
accuracy = sum(1 for true, pred in zip(actual_data, predictions) if true == pred) / len(actual_data)
print("Accuracy:", accuracy)


from sklearn.metrics import precision_score, recall_score, f1_score

recall = recall_score(actual_data, predictions, average='weighted')
f1 = f1_score(actual_data, predictions, average='weighted')

print("Recall:", recall)
print("F1 Score:", f1)

# Debug the first few predictions and actual values
print("\nActual Data (First 5):", actual_data[:5])
print("Predictions (First 5):", predictions[:5])



Decoded Output: ['<|begin_of_text|>Below is a description for a time series data. Write a response that gives the name of the best fitting machine learning algorithm in one word without explanation.\n\nThe best algorithm name should be one of this search space algorithms: AdaboostClassifier, ElasticNetClassifier,  LassoClassifier,  LightgbmClassifier, SVC, GaussianProcessClassifier, RandomForestClassifier or  XGBoostClassifier.\n\n\n\n### DESCRIPTION:\n\nA multivariate classification time-series dataset consists of 7477 samples and 15 features with 7 numerical and 8 categorical features. Each instance has a window length of 24. The dataset has a sampling rate of 60.0 minutes. The dataset has a missing values percentage of 0.0%. The missing values percentages for numerical features range from 0 to 0 with mean 0.00 and standard deviation 0.00. Similarly, the missing values percentages for categorical features range from 0 to 0 with mean 0.0 and standard deviation 0.0.\n The target colum

# save tuned model



In [24]:
# %%capture
# !pip install transformers huggingface_hub

[2024-11-17 18:37:13 +0000] [188] [INFO] Handling signal: int
[2024-11-17 18:37:13 +0000] [191] [INFO] Worker exiting (pid: 191)
[2024-11-17 18:37:13 +0000] [192] [INFO] Worker exiting (pid: 192)
[2024-11-17 18:37:13 +0000] [190] [INFO] Worker exiting (pid: 190)
[2024-11-17 18:37:13 +0000] [193] [INFO] Worker exiting (pid: 193)


In [25]:
# from huggingface_hub import login

# login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# # Online saving on HF

# new_model_adabtor= "unsloth-Llama-tuned"

# model.push_to_hub(new_model_adabtor)

# tokenizer.push_to_hub(new_model_adabtor)