<a href="https://colab.research.google.com/github/AstroBesat-SoftW/AstroBesat-SoftW/blob/main/2_makine_egit_agirlik_olustur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# DOSYA ADI: egitim.py
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle # Kayıt işlemleri için
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, recall_score, cohen_kappa_score

# --- STEP 1: GENERATING THE TEKNOFEST UNIVERSITY DATASET ---
# Progress banner shown before the synthetic samples are generated.
print(">> Veri üretiliyor (20.000 Adet)...")

def veri_uret(n, rng=None):
    """Generate a synthetic, labelled variant dataset of ``n`` samples.

    Every sample consists of an 11-character DNA window, an 11-character
    amino-acid window and six numeric features in the order
    ``[risk, maf, cons, hydro, polar, weight]``. The binary label is derived
    from a noisy weighted score of ``risk``, ``cons``, ``maf`` and
    ``abs(hydro)``; ``polar`` and ``weight`` do not enter the score.

    Args:
        n: Number of samples to generate. ``0`` yields empty outputs whose
            arrays still have the regular shapes.
        rng: Optional random source exposing the NumPy sampling API
            (``choice``, ``beta``, ``exponential``, ``uniform``, ``normal``),
            e.g. ``np.random.default_rng(42)``. When omitted, the global
            ``np.random`` state is used, as before.

    Returns:
        A tuple ``(dna_seqs, prot_seqs, bio_feats, labels)``: two lists of
        strings, a float array of shape ``(n, 6)`` and an int array of
        shape ``(n,)``.
    """
    rng = np.random if rng is None else rng

    # Alphabets are loop-invariant, so build them once.
    nucleotides = list('ACGT')
    amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

    dna_seqs, prot_seqs, bio_feats, labels = [], [], [], []

    for _ in range(n):
        # Genomic (DNA) and proteomic (amino-acid) neighbourhood, 11 units each.
        dna = ''.join(rng.choice(nucleotides, size=11))
        prot = ''.join(rng.choice(amino_acids, size=11))

        # Numeric features: risk, MAF, conservation, hydrophobicity,
        # polarity, weight.
        risk = rng.beta(2, 2)
        maf = rng.exponential(0.05)
        cons = rng.uniform(0, 10)
        hydro = rng.uniform(-5, 5)
        polar = rng.uniform(-3, 3)
        weight = rng.uniform(-50, 50)

        # Labelling rule (simulation): weighted sum (0.4 / 0.2 / 0.2 / 0.2) in
        # which a lower MAF and a larger |hydro| raise the score, plus
        # Gaussian noise; a sample is positive when the score exceeds 0.65.
        score = (
            (risk * 0.4)
            + (cons / 10 * 0.2)
            + ((0.5 - maf) * 2 * 0.2)
            + (abs(hydro) / 5 * 0.2)
        )
        score += rng.normal(0, 0.05)
        label = 1 if score > 0.65 else 0

        dna_seqs.append(dna)
        prot_seqs.append(prot)
        bio_feats.append([risk, maf, cons, hydro, polar, weight])
        labels.append(label)

    # reshape keeps the (n, 6) layout even when no rows were generated;
    # the explicit dtypes keep the empty arrays consistent with filled ones.
    features = np.array(bio_feats, dtype=float).reshape(-1, 6)
    return dna_seqs, prot_seqs, features, np.array(labels, dtype=int)

# Build the synthetic corpus: DNA windows, protein windows, numeric features
# and labels.
dna_data, prot_data, num_data, y = veri_uret(20000)

# --- STEP 2: ENCODING AND PREPARING THE DATA ---
# Character-level tokenizer for the DNA windows; sequences are post-padded
# to length 11.
dna_tok = Tokenizer(char_level=True)
dna_tok.fit_on_texts(dna_data)
dna_ids = dna_tok.texts_to_sequences(dna_data)
X_dna = pad_sequences(dna_ids, maxlen=11, padding='post')

# Character-level tokenizer for the protein windows, encoded the same way.
prot_tok = Tokenizer(char_level=True)
prot_tok.fit_on_texts(prot_data)
prot_ids = prot_tok.texts_to_sequences(prot_data)
X_prot = pad_sequences(prot_ids, maxlen=11, padding='post')

# Train/test split: 20% of the samples are held out, with a fixed seed so the
# partition is the same on every run.
(
    X_dna_tr, X_dna_ts,
    X_prot_tr, X_prot_ts,
    X_num_tr, X_num_ts,
    y_tr, y_ts,
) = train_test_split(
    X_dna,
    X_prot,
    num_data,
    y,
    test_size=0.2,
    random_state=42,
)

# --- STEP 3: MODEL ARCHITECTURE (THREE INPUTS) ---
# Input 1: DNA window -> embedding -> 1D convolution -> global max pooling.
in_dna = Input(shape=(11,))
# One index more than the fitted vocabulary (presumably for padding id 0).
dna_vocab_size = len(dna_tok.word_index) + 1
emb_dna = Embedding(dna_vocab_size, 8)(in_dna)
dna_pool = GlobalMaxPooling1D()
dna_conv = Conv1D(16, 3, activation='relu')
x1 = dna_pool(dna_conv(emb_dna))

# Input 2: protein window, processed by an identically shaped branch.
in_prot = Input(shape=(11,))
prot_vocab_size = len(prot_tok.word_index) + 1
emb_prot = Embedding(prot_vocab_size, 8)(in_prot)
prot_pool = GlobalMaxPooling1D()
prot_conv = Conv1D(16, 3, activation='relu')
x2 = prot_pool(prot_conv(emb_prot))

# Input 3: the six numeric features -> dense layer -> batch normalization.
in_num = Input(shape=(6,))
num_norm = BatchNormalization()
num_dense = Dense(16, activation='relu')
x3 = num_norm(num_dense(in_num))

# Fusion: concatenate the three branches, then a dense head with dropout and
# a single sigmoid output unit.
merged = Concatenate()([x1, x2, x3])
head_dropout = Dropout(0.4)
head_dense = Dense(64, activation='relu')
z = head_dropout(head_dense(merged))
out = Dense(1, activation='sigmoid')(z)

model = Model(inputs=[in_dna, in_prot, in_num], outputs=out)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- STEP 4: TRAINING ---
print(">> Model Eğitiliyor...")
# The three inputs are passed in the same order as the model's input list;
# 10% of the training portion is used for validation.
train_inputs = [X_dna_tr, X_prot_tr, X_num_tr]
model.fit(
    train_inputs,
    y_tr,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    verbose=1,
)

# --- STEP 5: REPORTING ---
print("\n>> Performans Hesaplanıyor...")
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
test_inputs = [X_dna_ts, X_prot_ts, X_num_ts]
test_probs = model.predict(test_inputs)
preds = (test_probs > 0.5).astype(int)

# Each metric is computed on the held-out split and printed right away.
f1_value = f1_score(y_ts, preds)
print(f"F1 Skoru : {f1_value:.4f}")
kappa_value = cohen_kappa_score(y_ts, preds)
print(f"Kappa    : {kappa_value:.4f}")
recall_value = recall_score(y_ts, preds)
print(f"Recall   : {recall_value:.4f}")

# --- STEP 6: SAVING (THE MOST IMPORTANT PART) ---
print("\n>> Dosyalar kaydediliyor (Bir sonraki kod için)...")

# Persist the trained model to an .h5 file.
model.save('teknofest_beyni.h5')

# Persist both tokenizers (the character-to-id mappings) so the same
# encoding can be reloaded later.
for pickle_path, tokenizer in (
    ('dna_sozlugu.pickle', dna_tok),
    ('prot_sozlugu.pickle', prot_tok),
):
    with open(pickle_path, 'wb') as fh:
        pickle.dump(tokenizer, fh)

print("✅ BİTTİ! 'teknofest_beyni.h5', 'dna_sozlugu.pickle' ve 'prot_sozlugu.pickle' oluştu.")

>> Veri üretiliyor (20.000 Adet)...
>> Model Eğitiliyor...
Epoch 1/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 5s 8ms/step - accuracy: 0.6718 - loss: 0.6093 - val_accuracy: 0.7275 - val_loss: 0.5386
Epoch 2/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.7613 - loss: 0.4882 - val_accuracy: 0.7181 - val_loss: 0.5601
Epoch 3/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.8654 - loss: 0.3056 - val_accuracy: 0.8863 - val_loss: 0.2647
Epoch 4/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.8798 - loss: 0.2723 - val_accuracy: 0.8550 - val_loss: 0.3079
Epoch 5/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.8808 - loss: 0.2683 - val_accuracy: 0.8944 - val_loss: 0.2428
Epoch 6/10
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.8865 - loss: 0.2561 - val_accur… [output truncated]



F1 Skoru : 0.8220
Kappa    : 0.7476
Recall   : 0.8193

>> Dosyalar kaydediliyor (Bir sonraki kod için)...
✅ BİTTİ! 'teknofest_beyni.h5', 'dna_sozlugu.pickle' ve 'prot_sozlugu.pickle' oluştu.
