In [None]:
!pip install datasets



In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# Load the labelled payments used for fine-tuning (tab-separated, no header row).
# NOTE(review): only four names are supplied, yet the frame displayed below has a
# non-default index starting at 1 -- presumably the file carries an extra leading
# id column that pandas turns into the index. Confirm against the raw file.
columns = ["date", "sum", "text", "label"]
train = pd.read_csv(
    "/content/payments_training.tsv",
    sep="\t",
    names=columns,
)

In [None]:
train

Unnamed: 0,date,sum,text,label
1,07.11.2024,15300.00,За участие в конференции в г. Майкоп по догово...,SERVICE
2,07.11.2024,4020000,За оказание услуг по договору №79-02726В от 01...,SERVICE
3,07.11.2024,1440-00,Оплата за Порошок стиральный Ariel Color autom...,NON_FOOD_GOODS
4,07.11.2024,240000000-00,Возврат денежных средств по договору займа №04...,LOAN
5,07.11.2024,1360000.00,"Оплата Дог №452 от 13/03/2021, согл. Сч 0745-2...",NOT_CLASSIFIED
...,...,...,...,...
496,07.11.2024,2610.00,Оплата налогов,TAX
497,07.11.2024,31200-00,Комиссия за выполнение функций агента валютног...,BANK_SERVICE
498,07.11.2024,18200-00,За тур.поездку по договору №75-04243Г от 24/04...,SERVICE
499,07.11.2024,287000000,"Оплата по договору №095 от 24.02.2025г, счету ...",FOOD_GOODS


In [None]:
dataset = pd.DataFrame(columns=['Instruction', 'Input', 'Response'])

In [None]:
# Fill the instruction-tuning frame: payment text as Input, gold label as Response,
# and one shared task description broadcast to every row as Instruction.
# The keyword order matters: the Series columns must be assigned first so the empty
# frame adopts their index before the scalar instruction is broadcast.
# NOTE(review): the prompt says "ten categories" but lists nine, and asks to
# "Answer in Russian" although the labels are Latin identifiers. The text is kept
# verbatim because it defines what the model is trained on.
dataset = dataset.assign(
    Input=train["text"],
    Response=train["label"],
    Instruction="""
You are manager of company, whose main responsibility is client's payment statistic.
You see text - purpose of payment, which you should categorize to one of ten categories:

        NON_FOOD_GOODS - Non-food products.
        FOOD_GOODS - Food products.
        SERVICE -  Services.
        BANK_SERVICE - Banking services: loan issuance and repayment, bank fees and charges.
        TAX - Taxes, fines, other (non-banking) fees and charges, including social payments including wages.
        LOAN - Loans.
        LEASING - financial leasing
        REALE_STATE - Real estate: purchase, rental of premises, equity investment in residential construction, water transport (does not include apartments, hotels).
        NOT_CLASSIFIED - Does not fit into the above categories.

Your goal - classification of this text to one of ten categories.
Answer consist of one word - category name.
Answer in Russian.""",
)

In [None]:
dataset

Unnamed: 0,Instruction,Input,Response
1,"\nYou are manager of company, whose main respo...",За участие в конференции в г. Майкоп по догово...,SERVICE
2,"\nYou are manager of company, whose main respo...",За оказание услуг по договору №79-02726В от 01...,SERVICE
3,"\nYou are manager of company, whose main respo...",Оплата за Порошок стиральный Ariel Color autom...,NON_FOOD_GOODS
4,"\nYou are manager of company, whose main respo...",Возврат денежных средств по договору займа №04...,LOAN
5,"\nYou are manager of company, whose main respo...","Оплата Дог №452 от 13/03/2021, согл. Сч 0745-2...",NOT_CLASSIFIED
...,...,...,...
496,"\nYou are manager of company, whose main respo...",Оплата налогов,TAX
497,"\nYou are manager of company, whose main respo...",Комиссия за выполнение функций агента валютног...,BANK_SERVICE
498,"\nYou are manager of company, whose main respo...",За тур.поездку по договору №75-04243Г от 24/04...,SERVICE
499,"\nYou are manager of company, whose main respo...","Оплата по договору №095 от 24.02.2025г, счету ...",FOOD_GOODS


In [None]:
# Convert the pandas frame into a Hugging Face Dataset.
# preserve_index=False stops the pandas row index from being exported as a spurious
# "__index_level_0__" feature; only Instruction / Input / Response are needed.
dataset = Dataset.from_pandas(dataset, preserve_index=False)

In [None]:
dataset

Dataset({
    features: ['Instruction', 'Input', 'Response', '__index_level_0__'],
    num_rows: 500
})

In [None]:
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install wandb

Found existing installation: unsloth 2024.11.7
Uninstalling unsloth-2024.11.7:
  Successfully uninstalled unsloth-2024.11.7
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-wo_k_ssd/unsloth_ee43aa3e13834f9797df16ed5bc193b0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-wo_k_ssd/unsloth_ee43aa3e13834f9797df16ed5bc193b0
  Resolved https://github.com/unslothai/unsloth.git to commit f26d4e739ed507de7a9088da53d10fd02f58d160
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.7-py3-none-a

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options, reused by later cells.
max_seq_length = 2048  # sequence length passed to the model loader and the trainer
dtype = None           # None lets the loader auto-detect the dtype
load_in_4bit = True    # 4-bit quantization to reduce memory usage (QLoRA setup)

# Fetch the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # fixed seed for the adapter setup
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into complete training strings.

    Args:
        examples: Batched mapping with parallel "Instruction", "Input" and
            "Response" sequences.

    Returns:
        A dict with a single "text" key holding one string per example:
        ``alpaca_prompt`` filled with (instruction, input, response) and
        followed by ``EOS_TOKEN``.
    """
    triples = zip(examples["Instruction"], examples["Input"], examples["Response"])
    return {
        "text": [
            alpaca_prompt.format(task_text, payment_text, label) + EOS_TOKEN
            for task_text, payment_text, label in triples
        ],
    }


dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['Instruction', 'Input', 'Response', '__index_level_0__', 'text'],
    num_rows: 500
})

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 2 samples per step x 4 accumulation steps = batch of 8 per optimizer update.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=2,  # two passes over the data (max_steps could cap a short trial run instead)
        learning_rate=2e-4,
        # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="wandb",
    ),
)

Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Report the GPU model, its total memory and the peak memory reserved so far,
# all converted from bytes to GiB (1024**3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 500 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 124
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhvizze[0m ([33mmisis_edu[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011118080411109885, max=1.0…

Step,Training Loss
1,2.8961
2,2.9014
3,2.8155
4,2.741
5,2.4955
6,2.2077
7,1.8814
8,1.5849
9,1.2331
10,1.0577


In [None]:
import os

# SECURITY: never store access tokens in the notebook; anything committed here is
# readable by everyone with access to the file. The token is taken from the HF_TOKEN
# environment variable instead (set it in the runtime's secret/environment settings).
# A token that was ever committed in plain text should be revoked and re-issued.
# NOTE(review): when HF_TOKEN is unset this is None; the upload call is then expected
# to fall back to a cached `huggingface-cli login` credential -- confirm for the
# installed library versions.
hf_token = os.environ.get("HF_TOKEN")
hf_username = "Hvixze"

In [None]:
# Online saving: upload the trained adapter model first, then the tokenizer,
# both into the same Hub repository.
for _artifact in (model, tokenizer):
    _artifact.push_to_hub(f"{hf_username}/biv_hack_llama3_2ep", token = hf_token)

README.md:   0%|          | 0.00/587 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Hvixze/biv_hack_llama3_2ep


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
from pprint import pprint

# Smoke test: generate a completion for one hand-written payment description.
FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            """You are manager of company, whose main responsibility is client's payment statistic.
You see text - purpose of payment, which you should categorize to one of ten categories:

        NON_FOOD_GOODS - Non-food products.
        FOOD_GOODS - Food products.
        SERVICE -  Services.
        BANK_SERVICE - Banking services: loan issuance and repayment, bank fees and charges.
        TAX - Taxes, fines, other (non-banking) fees and charges, including social payments including wages.
        LOAN - Loans.
        LEASING - financial leasing
        REALE_STATE - Real estate: purchase, rental of premises, equity investment in residential construction, water transport (does not include apartments, hotels).
        NOT_CLASSIFIED - Does not fit into the above categories.

Your goal - classification of this text to one of ten categories.
Answer consist of one word - category name.
Answer in Russian.""",  # instruction
            """
  Оплата за Порошок стиральный Ariel Color automat 3кг по счету 89649723803465939448 от 14 августа 2024г Сумма 1440-00
  """,  # input
            "",  # response slot stays empty so the model generates it
        )
    ],
    return_tensors="pt",
).to("cuda")

# Decode the whole sequence (prompt plus continuation) and pretty-print it.
outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
pprint(tokenizer.batch_decode(outputs))

['<|begin_of_text|>Below is an instruction that describes a task, paired with '
 'an input that provides further context. Write a response that appropriately '
 'completes the request.\n'
 '\n'
 '### Instruction:\n'
 "You are manager of company, whose main responsibility is client's payment "
 'statistic.\n'
 'You see text - purpose of payment, which you should categorize to one of ten '
 'categories:\n'
 '\n'
 '        NON_FOOD_GOODS - Non-food products.\n'
 '        FOOD_GOODS - Food products.\n'
 '        SERVICE -  Services.\n'
 '        BANK_SERVICE - Banking services: loan issuance and repayment, bank '
 'fees and charges.\n'
 '        TAX - Taxes, fines, other (non-banking) fees and charges, including '
 'social payments including wages.\n'
 '        LOAN - Loans.\n'
 '        LEASING - financial leasing\n'
 '        REALE_STATE - Real estate: purchase, rental of premises, equity '
 'investment in residential construction, water transport (does not include '
 'apartments, hotels

In [None]:
# Prompt pieces used at inference time.
# They must match, character for character, the strings the model was fine-tuned on:
# the training instruction starts with a newline and ends right after "Russian.",
# and the training template has a blank line between the input and "### Response:".
# Any drift (missing newline, trailing space) gives the model a prompt it never saw.
instruction = """
You are manager of company, whose main responsibility is client's payment statistic.
You see text - purpose of payment, which you should categorize to one of ten categories:

        NON_FOOD_GOODS - Non-food products.
        FOOD_GOODS - Food products.
        SERVICE -  Services.
        BANK_SERVICE - Banking services: loan issuance and repayment, bank fees and charges.
        TAX - Taxes, fines, other (non-banking) fees and charges, including social payments including wages.
        LOAN - Loans.
        LEASING - financial leasing
        REALE_STATE - Real estate: purchase, rental of premises, equity investment in residential construction, water transport (does not include apartments, hotels).
        NOT_CLASSIFIED - Does not fit into the above categories.

Your goal - classification of this text to one of ten categories.
Answer consist of one word - category name.
Answer in Russian."""

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
import re


def predict(row: pd.Series) -> str:
    """Predict the payment category for one row with the fine-tuned model.

    ``row["text"]`` is inserted into ``alpaca_prompt`` together with the
    module-level ``instruction``; the model continues the prompt and the text that
    follows the ``### Response:`` marker is extracted from the decoded output.

    Args:
        row: Row-like object; only ``row["text"]`` is read.

    Returns:
        * The stripped text between the response marker and the tokenizer's EOS
          token when the model terminated its answer.
        * The first non-empty line after the marker when generation ran out of
          tokens before emitting EOS (the answer is expected to be one word).
        * ``"Underfined"`` (sentinel, spelling kept so existing outputs stay
          comparable) when no response text can be extracted.
    """
    # Kept inside the function so predict() can be called on its own.
    FastLanguageModel.for_inference(model)

    prompt_string = alpaca_prompt.format(instruction, row["text"], "")
    inputs = tokenizer([prompt_string], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    decoded = tokenizer.batch_decode(outputs)[0]

    # Use the tokenizer's own EOS string instead of a hard-coded literal, and treat
    # it as optional: a truncated generation has no EOS but may still hold the label.
    eos_token = getattr(tokenizer, "eos_token", None) or "<|end_of_text|>"
    match = re.search(
        r"### Response:\n(.*?)(?P<eos>" + re.escape(eos_token) + r"|\Z)",
        decoded,
        re.DOTALL,
    )
    if match is None:
        return "Underfined"

    response = match.group(1).strip()
    if match.group("eos"):
        # Properly terminated answer: return everything before EOS.
        return response

    # No EOS: keep only the first non-empty line and drop any trailing rambling.
    for line in response.splitlines():
        if line.strip():
            return line.strip()
    return "Underfined"



In [None]:
train.iloc[0]['text']

'За участие в конференции в г. Майкоп по договору 88.367840-ДВ от 11 апреля 2026'

In [None]:
predict(train.iloc[0])

'SERVICE'

In [None]:
# Run the model on every training row (one generate call per row) and keep the
# answers next to the gold labels.
train["prediction"] = train.apply(predict, axis=1)

In [None]:
train

Unnamed: 0,date,sum,text,label,prediction
1,07.11.2024,15300.00,За участие в конференции в г. Майкоп по догово...,SERVICE,SERVICE
2,07.11.2024,4020000,За оказание услуг по договору №79-02726В от 01...,SERVICE,SERVICE
3,07.11.2024,1440-00,Оплата за Порошок стиральный Ariel Color autom...,NON_FOOD_GOODS,NON_FOOD_GOODS
4,07.11.2024,240000000-00,Возврат денежных средств по договору займа №04...,LOAN,LOAN
5,07.11.2024,1360000.00,"Оплата Дог №452 от 13/03/2021, согл. Сч 0745-2...",NOT_CLASSIFIED,NOT_CLASSIFIED
...,...,...,...,...,...
496,07.11.2024,2610.00,Оплата налогов,TAX,TAX
497,07.11.2024,31200-00,Комиссия за выполнение функций агента валютног...,BANK_SERVICE,BANK_SERVICE
498,07.11.2024,18200-00,За тур.поездку по договору №75-04243Г от 24/04...,SERVICE,SERVICE
499,07.11.2024,287000000,"Оплата по договору №095 от 24.02.2025г, счету ...",FOOD_GOODS,FOOD_GOODS


In [None]:
# Share of rows whose prediction equals the label. This is measured on the same rows
# the model was fine-tuned on, so it is training accuracy, not a held-out estimate.
(train["prediction"] == train["label"]).mean()

0.994

In [None]:
# Unlabelled payments to classify (tab-separated, no header row).
main = pd.read_csv(
    "/content/payments_main.tsv",
    sep="\t",
    names=["id", "date", "sum", "text"],
)

In [None]:
main

Unnamed: 0,id,date,sum,text
0,1,07.11.2024,40500.00,За тур.поездку по договору №001 от 27.01.2023г
1,2,07.11.2024,3260000,За оказание услуг по договору №53Б-02746 от 23...
2,3,07.11.2024,4710-00,Оплата штрафа
3,4,07.11.2024,30900-00,Лечение по договору №Д-00359/24 от 08.03.2025
4,5,07.11.2024,13200.00,Оплата основного долга за период с 16.12.2024г...
...,...,...,...,...
24995,24996,07.11.2024,330000000,Оплата по договору №Е01905 от 25.02.2023. сырь...
24996,24997,07.11.2024,227000,гос.услуга
24997,24998,07.11.2024,1750000.00,Предоставление кредита по договору №Д-00803/03...
24998,24999,07.11.2024,3560.00,Оплата гос. пошлины


In [None]:
# Classify the first 3500 rows of the unlabelled set, one generate call per row.
main_head_0_3500 = main.iloc[:3500].apply(predict, axis=1)

In [None]:
main_head_0_3500.to_csv("main_head_0_3500_prediction.csv", index = False)

In [None]:
main_head_0_3500

Unnamed: 0,0
0,SERVICE
1,SERVICE
2,TAX
3,SERVICE
4,BANK_SERVICE
...,...
3495,NON_FOOD_GOODS
3496,NOT_CLASSIFIED
3497,FOOD_GOODS
3498,SERVICE
