In [4]:
import pandas as pd
from deep_translator import GoogleTranslator
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

In [2]:
# Load the source dataset; the preview below shows its context / question / answer columns.
df = pd.read_csv("data/data.csv")
df.head(3)

Unnamed: 0,context,question,answer
0,CREATE TABLE head (age INTEGER),How many heads of the departments are older th...,SELECT COUNT(*) FROM head WHERE age > 56
1,"CREATE TABLE head (name VARCHAR, born_state VA...","List the name, born state and age of the heads...","SELECT name, born_state, age FROM head ORDER B..."
2,"CREATE TABLE department (creation VARCHAR, nam...","List the creation year, name and budget of eac...","SELECT creation, name, budget_in_billions FROM..."


In [3]:
# Dataset dimensions as (rows, columns).
df.shape

(78577, 3)

In [47]:
# English -> Russian translator shared by the translation cells below.
# NOTE(review): do not commit a real API key inline — read it from the environment
# (e.g. os.environ) instead; also confirm GoogleTranslator needs `api_key` at all.
# NOTE(review): this cell's execution count is higher than the cells that use
# `translator`, so the notebook was run out of order — verify Restart & Run All works.
translator = GoogleTranslator(api_key="...",source="en", target="ru")
# Register `progress_apply` on pandas objects; `desc` is the progress-bar label.
tqdm.pandas(desc="Перевод вопросов")

In [5]:
def translate_safe(text):
    """Translate ``text`` with the module-level ``translator``, never raising.

    Best-effort wrapper: if the translation call fails for any reason, the
    error is printed and the original ``text`` is returned unchanged so a long
    batch run is not aborted by a single bad row.
    """
    try:
        translated = translator.translate(text)
    except Exception as err:
        # Report the failure and fall back to the untranslated input.
        print(f"[!] Ошибка перевода: {err}")
        return text
    return translated

In [6]:
# Smoke-test the translator on the first question before running it over the whole column.
temp = df.iloc[0]["question"]
translator.translate(temp)

'Сколько голов отделов старше 56 лет?'

In [None]:
# Translate every question with a progress bar; rows whose translation fails keep
# the original text, because translate_safe returns its input on error.
df["question_ru"] = df["question"].progress_apply(translate_safe)

In [24]:
import requests

def translate_with_ollama(
    text: str,
    model='llama3.2',
    url="http://localhost:11434/api/generate",
    timeout=120,
):
    """Translate English ``text`` to Russian through an Ollama HTTP endpoint.

    Sends one non-streaming generation request whose prompt instructs the
    model (in Russian) to return only the translation.

    Args:
        text: English text to translate.
        model: Name of the Ollama model to query.
        url: Generate endpoint to POST to; defaults to the local server.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled server.

    Returns:
        The ``response`` field of the JSON reply, stripped of surrounding
        whitespace.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no reply arrives within ``timeout`` seconds.
    """
    prompt = (
        f"Переведи следующий текст с английского на русский язык.\n"
        f"Верни только перевод без пояснений, без кавычек, без альтернатив.\n"
        f"Только сам перевод, ничего лишнего:\n\n"
        f"{text}"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        # Ask for a single JSON object instead of a stream of chunks.
        "stream": False,
        "options": {
            # Zero temperature requests the least random sampling.
            "temperature": 0.0,
        },
    }

    res = requests.post(url, json=payload, timeout=timeout)
    res.raise_for_status()

    return res.json()['response'].strip()

In [25]:
# Smoke-test the Ollama translator on the same sample question.
translate_with_ollama(temp)

'Количество глав министерств старше 56 лет.'

In [None]:
# Translate every question with the Ollama-backed translator defined above.
# This cell was a verbatim copy of the earlier Google Translate cell and still
# applied translate_safe, so the Ollama translator was never actually used.
# NOTE(review): translate_with_ollama raises on HTTP errors, which aborts the
# whole run — confirm that is preferable to a per-row fallback.
df["question_ru"] = df["question"].progress_apply(translate_with_ollama)

In [23]:
# Load a previously saved translation run and check its dimensions.
data_translated = pd.read_csv("data/translated_dataset.csv")
data_translated.shape

(78577, 3)

In [20]:
# Inspect the first stored translation; the saved output below is an HTTP 404
# error string rather than a translation, i.e. that run did not succeed.
data_translated["question_ru"].iloc[0]

'[ОШИБКА] 404 Client Error: Not Found for url: http://localhost:11434/api/generate'

In [27]:
# Dump all questions to a plain-text file, one whitespace-stripped question per line.
with open("questions_for_translation.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(question.strip() + "\n" for question in df["question"])

In [30]:
import os
import math

# === Step 1: read the exported questions, dropping blank lines ===
with open("questions_for_translation.txt", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# === Step 2: split into N_PARTS roughly equal chunks ===
N_PARTS = 6
# max(1, ...) keeps the range() step non-zero when the input file is empty.
chunk_size = max(1, math.ceil(len(lines) / N_PARTS))
chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]

# === Step 3: write each chunk to its own numbered txt file ===
os.makedirs("split_txt", exist_ok=True)

for i, chunk in enumerate(chunks, start=1):
    filename = f"split_txt/questions_part{i}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in chunk)
    print(f"✅ Сохранено: {filename}")

✅ Сохранено: split_txt/questions_part1.txt
✅ Сохранено: split_txt/questions_part2.txt
✅ Сохранено: split_txt/questions_part3.txt
✅ Сохранено: split_txt/questions_part4.txt
✅ Сохранено: split_txt/questions_part5.txt
✅ Сохранено: split_txt/questions_part6.txt


In [33]:
# Translated counterparts of the split files, listed in part order so the
# concatenated lines follow the same order as the exported questions.
translated_parts = [
    "split_txt/questions_part1_ru.txt",
    "split_txt/questions_part2_ru.txt",
    "split_txt/questions_part3_ru.txt",
    "split_txt/questions_part4_ru.txt",
    "split_txt/questions_part5_ru.txt",
    "split_txt/questions_part6_ru.txt"
]

all_lines = []
for part in translated_parts:
    with open(part, "r", encoding="utf-8") as f:
        # Reading a file line by line keeps each line's terminator; strip it so
        # the text later stored in the DataFrame / CSV has no trailing "\n".
        # One entry is still produced per line, so the total count is unchanged.
        all_lines.extend(line.rstrip("\r\n") for line in f)

In [35]:
# Sanity check: the number of translated lines should equal the number of rows in df.
len(all_lines)

78577

In [36]:
# Attach translations positionally; this relies on all_lines being in the same
# order as df's rows — TODO confirm the external translation kept line order.
df["translated_question"] = all_lines

In [37]:
# Confirm the new column was added (one more column than before).
df.shape

(78577, 4)

In [38]:
# Spot-check the first translated rows next to their English originals.
df.head(3)

Unnamed: 0,context,question,answer,translated_question
0,CREATE TABLE head (age INTEGER),How many heads of the departments are older th...,SELECT COUNT(*) FROM head WHERE age > 56,Сколько руководителей департаментов старше 56 ...
1,"CREATE TABLE head (name VARCHAR, born_state VA...","List the name, born state and age of the heads...","SELECT name, born_state, age FROM head ORDER B...","Укажите фамилию, имя, отчество и возраст руков..."
2,"CREATE TABLE department (creation VARCHAR, nam...","List the creation year, name and budget of eac...","SELECT creation, name, budget_in_billions FROM...","Укажите год создания, название и бюджет каждог..."


In [39]:
# Persist the dataset with the translated questions; the index is not written.
df.to_csv("data/data_translated.csv", index=False)

Test data creation

In [None]:
import sqlite3
import random
from datetime import datetime, timedelta


In [46]:
# Fix the RNG so the generated test data is reproducible.
random.seed(77)


# Categories assigned to rows of type 'expense'.
expense_categories = ["ресторан", "продукты", "развлечения", "транспорт", "медицина", "одежда"]
# Categories assigned to rows of type 'income'. This list previously held the
# transaction *types* ("income", "expense"), so income rows received a type
# name — sometimes even "expense" — as their category.
# NOTE(review): the concrete category names below are a reviewer's choice;
# adjust them to whatever the downstream queries expect.
income_categories = ["зарплата", "фриланс", "инвестиции", "подарок", "кэшбэк"]


# SQLite database that will hold the generated transactions.
conn = sqlite3.connect("data/transactions.db")
cursor = conn.cursor()

In [None]:
# Create the transactions table if it does not exist yet.
# `type` is limited to 'income' / 'expense' by a CHECK constraint, and `date`
# defaults to the insertion timestamp when no value is supplied.
cursor.execute("""
CREATE TABLE IF NOT EXISTS transactions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id TEXT,
    type TEXT CHECK(type IN ('income', 'expense')),
    category TEXT,
    amount REAL,
    date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
conn.commit()

def populate_db(n=100000):
    """Insert ``n`` randomly generated transactions for user ``"u1"``.

    Each row gets a random type ('income' or 'expense'), a category drawn from
    the matching module-level list (``income_categories`` or
    ``expense_categories``), an integer amount between 100 and 200000
    inclusive, and a date 0-1000 days before now, formatted as ``YYYY-MM-DD``.

    The rows are written through the module-level ``conn`` / ``cursor`` as a
    single transaction: committed when every insert succeeds, rolled back if
    any insert raises, so a failure never leaves a half-filled batch pending.

    Args:
        n: Number of rows to generate.
    """
    # Read the clock once so every row is dated relative to the same moment.
    now = datetime.now()

    def _random_row():
        """Build one (user_id, type, category, amount, date) tuple."""
        ttype = random.choice(["income", "expense"])
        category = random.choice(income_categories if ttype == "income" else expense_categories)
        amount = random.randint(100, 200000)
        days_ago = random.randint(0, 1000)
        date = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d")
        return ("u1", ttype, category, amount, date)

    # `with conn` commits on success and rolls back on an exception.
    with conn:
        cursor.executemany("""
            INSERT INTO transactions (user_id, type, category, amount, date)
            VALUES (?, ?, ?, ?, ?)
        """, (_random_row() for _ in range(n)))

# Generate the default batch of rows. Re-running this cell appends another
# batch: the table is only created when missing and nothing is cleared first.
populate_db()

In [3]:
# Load the training dataset that is split into train / eval / test below.
data = pd.read_csv("data/financial_assistant_training_dataset.csv")

In [5]:
# 80 % of the rows go to training; the remaining 20 % are held out.
train_df, holdout_df = train_test_split(data, test_size=0.2, random_state=77)

# Split the held-out rows in half: one half for testing, one for evaluation.
test_df, eval_df = train_test_split(holdout_df, test_size=0.5, random_state=77)

In [6]:
# Report the size of each split.
print(f"Train df shape: {train_df.shape}")
print(f"Eval df shape: {eval_df.shape}")
print(f"Test df shape: {test_df.shape}")

Train df shape: (8000, 3)
Eval df shape: (1000, 3)
Test df shape: (1000, 3)


In [8]:
# Write each split to its own CSV file, without the index column.
for split_df, split_path in (
    (train_df, "data/fin_ass_train.csv"),
    (eval_df, "data/fin_ass_eval.csv"),
    (test_df, "data/fin_ass_test.csv"),
):
    split_df.to_csv(split_path, index=False)