In [None]:
!pip install -q transformers torch accelerate bitsandbytes
!pip install -q sentencepiece protobuf

!pip install -q rich

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from rich import print as rprint

# Report whether PyTorch can see a CUDA device and, if so, its name and total memory.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes -> GB (decimal, divides by 1e9)
    rprint(f"✅ GPU доступен: [bold green]{gpu_name}[/bold green]")
    rprint(f"🎯 Общая видеопамять: [bold blue]{gpu_memory:.1f} ГБ[/bold blue]")
else:
    # No CUDA device: only a hint is printed, nothing is raised here.
    rprint("❌ GPU не доступен. Рекомендую подключить GPU в настройках Colab: Runtime → Change runtime type → GPU")

In [None]:
import zipfile
import os
from glob import glob
from rich import print as rprint
import json

def extract_and_load_jsons(zip_path="/content/jsons.zip", extract_to="/content"):
    """Extract a zip archive and load the JSON files that came out of it.

    Only ``*.json`` files that were actually extracted from ``zip_path`` are
    loaded. Unrelated JSON files that already live somewhere under
    ``extract_to`` are ignored, so they cannot leak into the returned records.
    JSON files extracted directly into ``extract_to`` are preferred; when the
    archive placed none there, the JSON files from its sub-folders are used.
    Files are processed in sorted path order so the result is deterministic.

    Args:
        zip_path: Path of the zip archive to extract.
        extract_to: Directory the archive is extracted into.

    Returns:
        list: Flat list of records. A file whose top-level value is a list
        contributes its elements; any other top-level value is appended as a
        single record. An empty list is returned when extraction fails.
        Files that cannot be read or parsed are reported and skipped.
    """
    extracted_jsons = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.infolist():
                # extract() returns the (sanitized) path it actually wrote to.
                target = zip_ref.extract(member, extract_to)
                if member.is_dir() or not target.endswith(".json"):
                    continue
                # Mirror glob semantics: skip hidden files and hidden folders.
                parts = os.path.relpath(target, extract_to).split(os.sep)
                if any(part.startswith(".") for part in parts):
                    continue
                extracted_jsons.append(target)
        rprint(f"✅ Архив распакован в папку: {extract_to}")
    except Exception as e:
        # Best effort: report the problem and let the caller see "no data".
        rprint(f"❌ Ошибка распаковки: {e}")
        return []

    extracted_jsons.sort()
    root_dir = os.path.abspath(extract_to)
    json_files = [
        path for path in extracted_jsons
        if os.path.dirname(os.path.abspath(path)) == root_dir
    ]

    if not json_files:
        rprint("❌ JSON файлы не найдены в корне content/")
        # Fall back to JSON files the archive placed in sub-folders.
        json_files = extracted_jsons
        rprint(f"🔍 Найдены файлы в других местах: {json_files}")

    all_data = []

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                all_data.extend(data)
            else:
                all_data.append(data)

            rprint(f"✅ Загружен: {json_file}")

        except Exception as e:
            # One unreadable file must not abort loading of the others.
            rprint(f"❌ Ошибка загрузки {json_file}: {e}")

    rprint(f"📊 Всего загружено записей: {len(all_data)}")
    return all_data

# Load all records from the uploaded archive; `artem_data` is the list returned by the loader (empty if extraction failed).
artem_data = extract_and_load_jsons("/content/jsons.zip", "/content")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from rich import print as rprint

# Checkpoint used for the global `model` / `tokenizer` pair.
model_name = "Qwen/Qwen1.5-7B-Chat"

rprint(f"🚀 Загружаем математическую модель: [bold cyan]{model_name}[/bold cyan]")

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): trust_remote_code=True permits code shipped in the model repo to run — confirm it is needed for this checkpoint.
# NOTE(review): the recorded output of this cell warns that `torch_dtype` is deprecated in favour of `dtype`.
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float16
    )

rprint("✅ [bold green]Модель успешно загружена![/bold green]")

# Print how much memory PyTorch has allocated / reserved on GPU 0 after loading (bytes / 1e9).
if torch.cuda.is_available():
    memory_allocated = torch.cuda.memory_allocated(0) / 1e9
    memory_reserved = torch.cuda.memory_reserved(0) / 1e9

    rprint(f"📊 Память выделено: [bold yellow]{memory_allocated:.1f} ГБ[/bold yellow]")
    rprint(f"📈 Память зарезервировано: [bold yellow]{memory_reserved:.1f} ГБ[/bold yellow]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [None]:
def ask_math_deepseek(question, max_length=1500):
    """Answer one user question with the notebook-level `model` and `tokenizer`.

    Args:
        question: Text of the user message; it is the only turn in the chat.
        max_length: Upper bound on newly generated tokens (forwarded to
            `generate` as `max_new_tokens`).

    Returns:
        str: The decoded continuation, without the prompt tokens and with
        special tokens stripped.
    """
    conversation = [{"role": "user", "content": question}]

    # The chat template formats and tokenizes the conversation in one call.
    prompt_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    prompt_ids = prompt_ids.to(model.device)

    sampling_options = dict(
        max_new_tokens=max_length,
        temperature=0.3,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        top_p=0.9,
    )
    with torch.no_grad():
        generated = model.generate(prompt_ids, **sampling_options)

    # Keep only what was produced after the prompt.
    prompt_len = prompt_ids.shape[1]
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

# Status banner only: it is printed when this cell runs and does not itself call the model.
rprint("🧮 [bold green]Математическая модель готова к вопросам![/bold green]")

In [None]:
def simple_artem_format(json_data, max_examples=30):
    """Turn raw crystal-structure records into instruction-tuning examples.

    Each record becomes a dict with the keys ``instruction``, ``input`` and
    ``output``. The output sentence classifies the density as light (< 3),
    moderate (< 6) or dense (otherwise). When the density is missing or not
    numeric, the neutral phrase "atomic packing" is used instead of guessing.

    Args:
        json_data: Sequence of dict-like records; the keys read are
            ``material``, ``density``, ``lattice_type``, ``temperature`` and
            ``atomic_mass``. Missing keys are replaced by placeholder text.
        max_examples: Maximum number of leading records to convert
            (default 30). Pass ``None`` to convert all of them.

    Returns:
        list[dict]: One training example per converted record.
    """
    instruction = "Analyze this crystal structure data"
    training_data = []

    for item in json_data[:max_examples]:
        # Model input: the raw fields, one per line.
        input_text = "\n".join([
            "Crystal structure data:",
            f"material: {item.get('material', 'Unknown')}",
            f"density: {item.get('density', 'Unknown')} g/cm³",
            f"lattice_type: {item.get('lattice_type', 'Unknown')}",
            f"temperature: {item.get('temperature', 'Unknown')} K",
            f"atomic_mass: {item.get('atomic_mass', 'Unknown')}",
        ])

        # Target text: every example always gets a complete answer.
        output = f"This is {item.get('material', 'a material')} with {item.get('lattice_type', 'a')} crystal structure. "
        output += f"Density of {item.get('density', 'N/A')} g/cm³ suggests "

        # A missing density must not be treated as 0 (which would read as "light").
        try:
            density_val = float(item['density'])
        except (KeyError, TypeError, ValueError, OverflowError):
            density_val = None

        if density_val is None:
            output += "atomic packing. "
        elif density_val < 3:
            output += "light atomic packing. "
        elif density_val < 6:
            output += "moderate atomic packing. "
        else:
            output += "dense atomic packing. "

        output += f"The structure is studied at {item.get('temperature', 'N/A')} K."

        training_data.append({
            "instruction": instruction,
            "input": input_text,
            "output": output
        })

    rprint(f"✅ Created {len(training_data)} training examples (ВСЕ с правильными ответами)")
    return training_data

In [None]:
rprint("[bold green]🎯 Загружаем JSON данные от Артема...[/bold green]")

# Everything below runs only when the loader returned at least one record;
# otherwise `training_data` / `generated_data` are not defined by this cell.
if artem_data:
    rprint(f"✅ Успешно загружено {len(artem_data)} JSON записей от Артема")

    # Show the key/value pairs of the first record.
    rprint("\n🔍 [bold blue]Структура первого JSON:[/bold blue]")
    first_item = artem_data[0] if isinstance(artem_data, list) else artem_data
    # NOTE(review): `.items()` assumes the first record is a dict — confirm against the JSON files.
    for key, value in first_item.items():
        rprint(f"   {key}: {value}")

    # Convert the records to instruction/input/output examples; both names refer to the same list.
    training_data = simple_artem_format(artem_data)
    generated_data = training_data


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from rich import print as rprint

# Menu of alternative checkpoints. It is displayed only: the checkpoint that is
# actually loaded is `chosen_model` below.
open_models = {
    "1": "microsoft/DialoGPT-large",
    "2": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "3": "HuggingFaceH4/zephyr-7b-beta",
    "4": "mosaicml/mpt-7b-chat",
    "5": "lmsys/vicuna-7b-v1.5"
}

rprint("[bold green]🎯 Выбери модель (100% открытые):[/bold green]")
# The loop variable must not be called `model`: at notebook scope that would
# overwrite the global `model` (the 7B network used by `ask_math_deepseek`)
# with a plain string.
for key, model_id in open_models.items():
    rprint(f"   {key}. {model_id}")

chosen_model = "Qwen/Qwen1.5-4B-Chat"
rprint(f"🚀 Загружаем: [bold cyan]{chosen_model}[/bold cyan]")

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

small_tokenizer = AutoTokenizer.from_pretrained(chosen_model)
# Padding reuses the end-of-sequence token.
small_tokenizer.pad_token = small_tokenizer.eos_token

small_model = AutoModelForCausalLM.from_pretrained(
        chosen_model,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True
    )

rprint("✅ [bold green]Маленькая модель успешно загружена![/bold green]")

# Report memory currently allocated by PyTorch on GPU 0 (bytes / 1e9).
if torch.cuda.is_available():
    memory_used = torch.cuda.memory_allocated(0) / 1e9
    rprint(f"📊 Память занято: [bold yellow]{memory_used:.1f} ГБ[/bold yellow]")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

In [None]:
from datasets import Dataset
import json

rprint("🔄 Используем сгенерированные данные из JSON Артема...")

# `generated_data` only exists if the formatting cell ran with non-empty data.
# Look it up in the notebook namespace so a missing name is reported here
# instead of surfacing later as a NameError on `training_data`.
if globals().get('generated_data'):
    training_data = generated_data
    rprint(f"✅ Используем {len(training_data)} примеров из данных Артема")

    # Preview the first example; input and output are cut to 100 characters.
    rprint("\n🔍 [bold blue]Пример данных для обучения:[/bold blue]")
    example = training_data[0]
    rprint(f"📝 Instruction: {example.get('instruction', 'N/A')}")
    rprint(f"📥 Input: {example.get('input', 'N/A')[:100]}...")
    rprint(f"📤 Output: {example.get('output', 'N/A')[:100]}...")
else:
    rprint("❌ Данные для обучения не найдены: сначала выполните ячейку, которая создаёт generated_data")

In [None]:
def test_small_model_with_real_data(question, example_data=None):
    if example_data is None and artem_data:
        example_data = artem_data[0]

    if example_data:
        data_preview = "Crystal structure data:\n"
        for key, value in list(example_data.items())[:5]:
            data_preview += f"- {key}: {value}\n"

        full_question = f"{question}\n\n{data_preview}"
    else:
        full_question = question

    # Используем правильный формат для Qwen
    messages = [
        {"role": "user", "content": full_question}
    ]

    prompt = small_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = small_tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024
    ).to(small_model.device)

    with torch.no_grad():
        outputs = small_model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.3,
            do_sample=True,
            pad_token_id=small_tokenizer.eos_token_id,
            repetition_penalty=1.1,
            top_p=0.9
        )

    response = outputs[0][inputs.input_ids.shape[1]:]
    answer = small_tokenizer.decode(response, skip_special_tokens=True)
    return answer

rprint("🧪 [bold blue]Исправленный тест ДО обучения:[/bold blue]")

# Baseline run: the answer is stored in `before_training`.
if artem_data:
    test_question = "Analyze this crystal structure data and describe what you see:"
    rprint(f"❓ Вопрос: {test_question}")

    # Show the first three fields of the record that is sent along with the question.
    example = artem_data[0]
    rprint("📊 Реальные данные из первого JSON:")
    for key, value in list(example.items())[:3]:
        rprint(f"   {key}: {value}")

    before_training = test_small_model_with_real_data(test_question, example)
    # NOTE(review): plain print() is used for the answer, presumably so rich does not interpret markup in model text — confirm.
    print(f"🤖 Ответ: {before_training}")
else:
    # No records loaded: ask a generic question without attached data.
    test_question = "What is crystal structure analysis?"
    before_training = test_small_model_with_real_data(test_question)
    print(f"🤖 Ответ: {before_training}")

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

rprint("⚙️ Пересоздаем модель с LoRA...")

import gc
import torch

# Release the previously loaded small model *before* loading it again.
# Calling empty_cache() while `small_model` still references the old weights
# frees nothing, and the old and new copies would then sit on the GPU together
# during the reload.
small_model = None
gc.collect()
torch.cuda.empty_cache()

small_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-4B-Chat",
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Prepare the quantized model for training, then attach the LoRA adapters.
small_model = prepare_model_for_kbit_training(small_model)

# Adapters are attached to the attention projection layers listed in `target_modules`.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

small_model = get_peft_model(small_model, lora_config)
rprint("✅ LoRA применена!")

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
def format_for_vicuna(example):
    """Render one training example as a single plain-text string.

    The layout is three labelled lines (Instruction / Input / Response) with
    no special tokens.

    Args:
        example: Mapping with the keys ``instruction``, ``input`` and ``output``.

    Returns:
        dict: ``{"text": <rendered string>}``.
    """
    sections = (
        f"Instruction: {example['instruction']}",
        f"Input: {example['input']}",
        f"Response: {example['output']}",
    )
    return {"text": "\n".join(sections)}

# Build a Dataset from the in-memory examples, then add the rendered "text" column to every row.
dataset = Dataset.from_list(training_data)
dataset = dataset.map(format_for_vicuna)

def tokenize_function(examples):
    """Tokenize a batch of rendered examples and attach causal-LM labels.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output (padded, truncated to at most 512 tokens) with an
        added ``"labels"`` entry that holds an independent copy of every
        ``input_ids`` sequence.
    """
    tokenized = small_tokenizer(
        examples["text"],
        padding=True,  # pad within the batch
        truncation=True,
        max_length=512,  # kept short on purpose
        return_tensors=None,
    )

    # Copy each sequence separately. `list.copy()` on the outer list is shallow,
    # so labels and input_ids would share the same inner lists and an in-place
    # edit of one (e.g. masking labels) would silently change the other.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

# Tokenize the whole dataset in batches and drop the original text columns so
# that only the tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

rprint(f"✅ Создан tokenized_dataset: {len(tokenized_dataset)} примеров")

rprint("🎛️ Настраиваем тренировку с токенизированными данными...")

# Causal-LM collation (mlm=False); batches are padded to a multiple of 8 tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=small_tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

training_args = TrainingArguments(
    output_dir="./physics-finetuned-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # reduced
    num_train_epochs=1,
    learning_rate=2e-5,  # learning rate lowered considerably
    fp16=True,
    logging_steps=5,
    save_steps=100,
    remove_unused_columns=False,
    warmup_ratio=0.1,  # warm-up given as a ratio instead of a step count
    max_grad_norm=0.3,  # reduced
    # The string "none" is what disables all reporting integrations (wandb etc.);
    # passing the Python value None does not.
    report_to="none",
    dataloader_pin_memory=False,
    optim="adamw_torch",  # plain AdamW optimizer
)

trainer = Trainer(
    model=small_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

rprint("✅ [bold green]Тренер настроен![/bold green]")

In [None]:
# Remove wandb from the environment, then (re)install the packages the notebook uses.
# NOTE(review): presumably wandb is removed so that training does not try to log to it — confirm.
# NOTE(review): this cell comes after cells that already import `datasets` and `peft`.
!pip uninstall -y wandb
!pip install transformers torch accelerate bitsandbytes sentencepiece protobuf rich datasets peft

Found existing installation: wandb 0.22.2
Uninstalling wandb-0.22.2:
  Successfully uninstalled wandb-0.22.2


In [None]:
# Sanity check: exercise the model BEFORE training
rprint("🧪 ПРОВЕРКА ДО ОБУЧЕНИЯ:")

def quick_test(question):
    """Generate a short sampled reply from `small_model` for one user question.

    Args:
        question: Text of the user message.

    Returns:
        str: The decoded continuation (at most 100 new tokens), without the
        prompt tokens and with special tokens stripped.
    """
    chat_prompt = small_tokenizer.apply_chat_template(
        [{"role": "user", "content": question}],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = small_tokenizer(chat_prompt, return_tensors="pt").to(small_model.device)

    with torch.no_grad():
        generated = small_model.generate(
            **encoded,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=small_tokenizer.eos_token_id,
        )

    # Keep only what was produced after the prompt.
    prompt_len = encoded.input_ids.shape[1]
    new_tokens = generated[0][prompt_len:]
    return small_tokenizer.decode(new_tokens, skip_special_tokens=True)

# Ask a trivial question and judge the model by the length of its reply.
test_before = quick_test("Hello, how are you?")
rprint(f"🤖 Ответ ДО обучения: {test_before}")

if len(test_before.strip()) < 5:  # the reply is too short
    # Only a message is printed here; execution is not actually stopped.
    rprint("❌ Модель сломана ДО обучения! Прерываем.")
    # This is where one could exit or reload the model
else:
    rprint("✅ Модель работает нормально, продолжаем обучение...")

In [None]:
import os
# Ask wandb to stay disabled via environment variables.
# NOTE(review): these are set after `training_args` / `trainer` were already created in an earlier cell — confirm they still take effect.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
rprint("🚀 [bold green]ЗАПУСКАЕМ ОБУЧЕНИЕ![/bold green]")
rprint("⏳ Ожидай прогресс-бар...")

# Run the fine-tuning loop configured on `trainer`.
trainer.train()

rprint("🎉 [bold green]ОБУЧЕНИЕ ЗАВЕРШЕНО![/bold green]")

# Save the trained model via the trainer, and the tokenizer into the same folder name used as `output_dir`.
trainer.save_model()
small_tokenizer.save_pretrained("./physics-finetuned-model")
rprint("💾 Модель сохранена в './physics-finetuned-model'")

In [16]:
# Section banner for the post-training tests, framed by 80-character rules.
rprint("\n" + "="*80)
rprint("🧪 [bold green]ТЕСТИРУЕМ МОДЕЛЬ ПОСЛЕ ОБУЧЕНИЯ[/bold green]")
rprint("="*80)

def test_model(question):
    # ПЕРЕВОДИМ МОДЕЛЬ В РЕЖИМ ОЦЕНКИ
    small_model.eval()

    messages = [
        {"role": "user", "content": question}
    ]

    prompt = small_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = small_tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512  # УМЕНЬШАЕМ
    ).to(small_model.device)

    with torch.no_grad():
        outputs = small_model.generate(
            **inputs,
            max_new_tokens=200,  # УМЕНЬШАЕМ
            temperature=0.7,
            do_sample=True,
            pad_token_id=small_tokenizer.eos_token_id,
            repetition_penalty=1.1,
            eos_token_id=small_tokenizer.eos_token_id
        )

    response = outputs[0][inputs.input_ids.shape[1]:]
    return small_tokenizer.decode(response, skip_special_tokens=True)

# Test 1: a general question about crystal structure files.
rprint("\n📝 [bold blue]Тест 1: Сравнение с ДО обучения[/bold blue]")
test_question = "You are analyzing crystallographic data. Based on the typical data patterns you've learned, describe what kind of information is usually contained in crystal structure files and what physical properties can be derived from them."
rprint(f"❓ Question: {test_question}")
after_training = test_model(test_question)
print(f"🤖 Answer: {after_training}")

# Test 2: the first loaded record, rendered as a bullet list of its first five fields.
rprint("\n📝 [bold blue]Тест 2: Анализ реальных данных[/bold blue]")
if artem_data and len(artem_data) > 0:
    test_item = artem_data[0]
    test_input = "Data about material:\n"
    for key, value in list(test_item.items())[:5]:
        test_input += f"- {key}: {value}\n"

    test_question2 = f"Analyze data about material:\n{test_input}"
    rprint(f"❓ Question: Analyze data about material")
    rprint(f"📊 Data: {str(list(test_item.keys())[:3])}...")
    after_training2 = test_model(test_question2)
    print(f"🤖 Answer: {after_training2}")

# Test 3: instruction + input of the first training example, joined by a newline.
rprint("\n📝 [bold blue]Тест 3: Вопрос из тренировочных данных[/bold blue]")
if training_data and len(training_data) > 0:
    test_question3 = f"{training_data[0]['instruction']}\n{training_data[0]['input']}"
    rprint(f"❓ Question: {training_data[0]['instruction']}")
    rprint(f"📊 Data: {training_data[0]['input'][:100]}...")
    after_training3 = test_model(test_question3)
    print(f"🤖 Answer: {after_training3}")

# Test 4: a free-form question that is not taken from the loaded data.
rprint("\n📝 [bold blue]Тест 4: Новый вопрос о данных[/bold blue]")
test_question4 = "Considering a material with face-centered cubic structure and density around 2.7 g/cm³, what would be its typical physical characteristics and industrial applications??"
rprint(f"❓ Question: {test_question4}")
after_training4 = test_model(test_question4)
print(f"🤖 Answer: {after_training4}")

print("\n" + "="*80)
rprint("🎯 [bold green]ТЕСТИРОВАНИЕ ЗАВЕРШЕНО![/bold green]")

🤖 Answer: Crystal structure files contain information about the arrangement of atoms within a crystal lattice. The file typically includes the following components:

1. Crystal system: This indicates whether the crystal is cubic, tetragonal, hexagonal, or other.
2. Space group: This defines the symmetry of the crystal lattice and specifies how the atoms are arranged relative to each other.
3. Atom positions: This provides the coordinates of all the atoms in the crystal, including their x, y, and z coordinates.
4. Bond lengths: This gives the distance between adjacent atoms along each bond direction (x, y, or z).
5. Bond angles: This gives the angle between two bonds that share an atom.
6. Volume: This gives the total volume of the crystal.

The physical properties that can be derived from crystal structure files include:

1. Crystal size: This can be determined by measuring the length, width, and height of the crystal.
2. Density: This can be calculated using the formula density =


🤖 Answer: Based on the given data, we can analyze the following:

1. Atomic mass: The atomic mass of aluminum is 26.9815 u (unified atomic mass units). This means that one atom of aluminum contains 26.9815 u of matter.

2. Density: The density of aluminum is 2.7214 g/cm³. This value represents the amount of matter per unit volume of aluminum. In other words, it tells us how much weight an object made of aluminum will have for a given volume.

3. Temperature: The temperature provided is 293.29 K. This value is commonly used in thermodynamics and chemistry to describe the physical state of a substance. At this temperature, aluminum is considered to be in its liquid phase.

4. Lattice type: Aluminum has a face-centered cubic (FCC) crystal structure. This means that each atom in the lattice is surrounded by four other atoms at the corners


🤖 Answer: Aluminum is a transition metal with the chemical symbol Al and atomic number 13. It has a density of 2.7214 g/cm³ at room temperature, which means that it is relatively dense compared to other materials.
The crystal structure of aluminum is an fcc lattice, meaning that each atom in the material is surrounded by four other atoms in a regular pattern. This type of lattice is common for many metals, including iron, steel, and copper.
At room temperature, the lattice energy of aluminum is low enough that it can exist as a solid without undergoing significant changes. However, at higher temperatures, the lattice energy increases and aluminum undergoes phase transitions, such as melting or boiling.
The atomic mass of aluminum is 26.9815 amu, which means that it contains 13 protons and 14 neutrons. The atomic number of aluminum indicates that it is a transition metal because it has one valence electron that is easily lost


🤖 Answer: A material with face-centered cubic (FCC) structure has a unit cell volume of 4 x 4 x 4 Angstroms^3. This means that the lattice spacing is 0.4 Angstroms.
The density of an FCC crystal is typically close to 2.7 g/cm³, which is a bit higher than the average density of water. This high density makes it useful for applications such as making strong, durable materials like steel and titanium.
One common application of FCC crystals is in the production of computer chips. The tiny grains of metal used to make these chips have an FCC structure, which allows them to be processed using techniques like photolithography.
Another application of FCC crystals is in the manufacturing of optical fibers. These fibers have an FCC structure, which helps them transmit light more efficiently than other types of fibers.
Overall, the high density and unique properties of FCC crystals make them useful in a wide range of applications, from electronics to optics.

