In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc 
import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.filterwarnings('ignore')

In [27]:
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')
submission = pd.read_csv(r'solution_template.csv')

In [28]:
train = train.drop('patient_id', axis=1)
test = test.drop('patient_id', axis=1)

df = pd.concat([train, test], axis=0)

df.columns = df.columns.str.replace(' ', '_', regex=False)
df.columns = df.columns.str.replace('-', '_', regex=False)

In [29]:
df.drop(columns=['metastatic_first_novel_treatment_type', 'metastatic_first_novel_treatment'], inplace=True)

In [30]:
dolu_degerler_sozlugu = df.dropna(subset=['patient_state', 'Division', 'Region']).drop_duplicates('patient_zip3').set_index('patient_zip3')[['patient_state', 'Division', 'Region']].to_dict('index')

# Boş değerleri doldurun
for index, row in df.iterrows():
    if pd.isna(row['patient_state']) or pd.isna(row['Division']) or pd.isna(row['Region']):
        if row['patient_zip3'] in dolu_degerler_sozlugu:
            df.at[index, 'patient_state'] = dolu_degerler_sozlugu[row['patient_zip3']]['patient_state']
            df.at[index, 'Division'] = dolu_degerler_sozlugu[row['patient_zip3']]['Division']
            df.at[index, 'Region'] = dolu_degerler_sozlugu[row['patient_zip3']]['Region']

In [31]:
# Average ile başlayan sütunları bul
average_columns = [col for col in df.columns if col.startswith('Average')]

# Eksik değerleri patient_state ortalamaları ile doldur
for col in average_columns:
    df[col] = df.groupby('patient_state')[col].transform(lambda x: x.fillna(x.mean()))

In [32]:
# Male codes to female 
df['breast_cancer_diagnosis_code'] = df['breast_cancer_diagnosis_code'].replace({
    'C50122':'C50112', 'C50221':'C50211', 'C50421':'C50411', 'C509':'C5091', 'C50922':'C50912', 'C5021':'C50219'
})

# Fix bad zip 
df['patient_state'] = np.where(df['patient_zip3'] == 630, 'MO', np.where(df['patient_zip3'] == 864, 'AZ', df['patient_state']))

In [33]:
df['payer_type'] = df['payer_type'].fillna('Unknown')

In [34]:
df['bmi_missing'] = df['bmi'].isna().astype(int)

In [35]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = df.select_dtypes(include=numerics).columns.drop('metastatic_diagnosis_period')

# 'metastatic_diagnosis_period' hariç numerik sütunlardaki eksik değerleri doldur
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

### PCA

In [36]:
age_columns = ['age_under_10', 'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80']
income_columns = ['income_household_under_5','income_household_5_to_10','income_household_10_to_15','income_household_15_to_20','income_household_20_to_25',
               'income_household_25_to_35','income_household_35_to_50','income_household_50_to_75','income_household_75_to_100','income_household_100_to_150',
               'income_household_150_over']
education_columns = ['education_less_highschool', 'education_highschool', 'education_some_college', 'education_bachelors', 'education_graduate', 'education_college_or_above']
race_columns = ['race_white', 'race_black', 'race_asian', 'race_native', 'race_pacific', 'race_other', 'race_multiple', 'hispanic']
difficulties_columns = ['disabled', 'poverty', 'limited_english', 'commute_time', 'health_uninsured', 'veteran']
workforce_columns = ['labor_force_participation', 'unemployment_rate', 'self_employed', 'farmer']
marital_status = [ 'married', 'divorced', 'never_married', 'widowed']

to_pca_cols = age_columns + income_columns + education_columns + race_columns + difficulties_columns + workforce_columns + marital_status

pca_data = df[to_pca_cols]

if True:
    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=47)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_pca_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_1'] = principal_components[:, 0]
    df['PCA_2'] = principal_components[:, 1]

    #Average cols PCA
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=77)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_AVG_1'] = principal_components[:, 0]
    df['PCA_AVG_2'] = principal_components[:, 1]

if False:
    pca_data = df[to_pca_cols]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # t-SNE modelini oluşturma ve eğitme
    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_pca_data)

    # t-SNE bileşenlerini DataFrame'e ekleme
    df['TSNE_1'] = tsne_components[:, 0]
    df['TSNE_2'] = tsne_components[:, 1]

    #Average cols TSNE
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['TSNE_AVG_1'] = tsne_components[:, 0]
    df['TSNE_AVG_2'] = tsne_components[:, 1]

In [37]:
# En iyi random_state'i bulmak için değişken
best_random_state = None
best_explained_variance = 0

# Farklı random_state değerleri üzerinde deneme
for random_state in range(0, 100):
    pca = PCA(n_components=2, random_state=random_state)
    principal_components = pca.fit_transform(scaled_average_data)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    
    if explained_variance > best_explained_variance:
        best_explained_variance = explained_variance
        best_random_state = random_state

print(f"En iyi random_state: {best_random_state} ile açıklanan varyans oranı: {best_explained_variance}")

En iyi random_state: 77 ile açıklanan varyans oranı: 0.9410977030599198


### breast_cancer_diagnosis_desc

In [38]:
def group_descriptions(desc):
    if 'unspecified' in desc or 'unsp site of' in desc:
        return 'unspecified'
    if 'upper-outer quadrant of' in desc:
        return 'upper-outer quadrant of'
    if 'central' in desc:
        return 'central'
    if 'ovrlp sites of' in desc or 'overlapping' in desc:
        return 'overlapping'
    if 'nipple and areola' in desc:
        return 'nipple and areola'
    if 'upper-inner' in desc:
        return 'upper-inner'
    if 'lower-outer' in desc:
        return 'lower-outer'
    if 'lower-inner' in desc:
        return 'lower-inner'
    if 'axillary tail' in desc:
        return 'axillary tail'
    return desc  
# Uygulama
df['breast_cancer_diagnosis_desc'] = df['breast_cancer_diagnosis_desc'].apply(group_descriptions)

In [39]:
df['age_group'] = pd.cut(df['patient_age'], right=False, bins=[0, 30, 40, 50, 60, 70, 80, 90, np.inf], labels=[0,1,2,3,4,5,6,7]).astype(int)
df['icd_9'] = df['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)

In [40]:
df.breast_cancer_diagnosis_code.value_counts()

breast_cancer_diagnosis_code
1749      2828
C50911    2599
C50912    2577
C50919    2162
C50411    1353
C50412    1286
C50812     737
C50811     662
1744       588
1748       481
C50212     417
C50211     362
C50512     310
C50511     299
C50112     297
C50111     297
C50311     218
C50312     197
1742       147
C50012     140
1741       133
C50011     128
1745       108
1743        69
C50419      66
C50819      65
C50611      53
C5091       51
C50612      46
C50119      22
C50219      19
19881       19
1746        15
C50019      14
C50319      12
C50519      10
C5081       10
C5041        9
C5031        6
C5011        5
C5001        1
C50619       1
Name: count, dtype: int64

In [41]:
df.breast_cancer_diagnosis_code.unique()

array(['C50912', 'C50412', '1749', 'C50911', '1748', 'C50411', 'C50212',
       'C50811', 'C50919', 'C50211', '1744', 'C50012', 'C50112', 'C50812',
       'C50311', 'C50011', 'C50511', 'C50512', '1741', 'C50819', '1745',
       '1742', 'C50312', '19881', 'C50219', 'C50111', 'C50611', '1743',
       'C50612', 'C5091', 'C5011', 'C50519', 'C50319', 'C50419', 'C5041',
       'C5081', '1746', 'C5031', 'C50119', 'C5001', 'C50019', 'C50619'],
      dtype=object)

In [42]:
# ICD-10 kodlarını ICD-9 kodlarına eşleştiren bir harita
icd10_to_icd9 = {
    'C509': '1749', 'C508': '1748', 'C504': '1744', 'C501': '1741', 'C505': '1745', 'C502': '1742', 
    'C503': '1743', 'C506': '1746', 'C7981': '19881', 'C500': '1740'
}

# Eşleştirme fonksiyonu
def map_icd10_to_icd9(code):
    # ICD-10 kodlarının ilk üç karakterini kullanarak eşleştirme yapıyoruz
    icd10_prefix = code[:4]
    if icd10_prefix in icd10_to_icd9:
        return icd10_to_icd9[icd10_prefix]
    else:
        return code

# Yeni sütunu oluşturma
df['icd9_feat'] = df['breast_cancer_diagnosis_code'].apply(map_icd10_to_icd9)

In [43]:
df.icd9_feat.unique()

array(['1749', '1744', '1748', '1742', '1740', '1741', '1743', '1745',
       '19881', '1746'], dtype=object)

In [44]:
# 'metastatic_diagnosis_period' sütununda değerleri olanları 'train' DataFrame'ine ayır
train = df[df['metastatic_diagnosis_period'].notnull()]
train.reset_index(inplace=True, drop=True)

# 'metastatic_diagnosis_period' sütununda değerleri olmayanları 'test' DataFrame'ine ayır
test = df[df['metastatic_diagnosis_period'].isnull()]
test.drop('metastatic_diagnosis_period', axis=1, inplace=True)
test.reset_index(inplace=True, drop=True)

### Selection

In [45]:
train = train[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
               'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type', 
               'PCA_AVG_2', 'Region', 'metastatic_diagnosis_period']]

test = test[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
             'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type',
             'PCA_AVG_2', 'Region']]

# 'PCA_1', 'PCA_2', 'PCA_AVG_1', 'PCA_AVG_2', 'Region', 'TSNE_1', 'TSNE_2', 'TSNE_AVG_1', 'TSNE_AVG_2', 'bmi_missing', 'age_group', 'icd_9'

### Holdout (Optional)

In [46]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=['metastatic_diagnosis_period'])
y = train['metastatic_diagnosis_period']

X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, 
    test_size=0.1, # Değerlendirme seti oranı
    shuffle=True,
    stratify=y,     # Hedef değişkenine göre stratify
    random_state=6 # Sabit rastgelelik
)

# Eğitim ve değerlendirme setlerini birleştir
train_split = X_train.copy()
train_split['metastatic_diagnosis_period'] = y_train

eval_split = X_eval.copy()
eval_split['metastatic_diagnosis_period'] = y_eval

# # holdout 2

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.11, # Değerlendirme seti oranı
#     # shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=61 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split2 = X_eval.copy()
# eval_split2['metastatic_diagnosis_period'] = y_eval

# # holdout 3

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.12, # Değerlendirme seti oranı
#     # shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=29 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split3 = X_eval.copy()
# eval_split3['metastatic_diagnosis_period'] = y_eval

# # holdout4

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.023, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=42 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split4 = X_eval.copy()
# eval_split4['metastatic_diagnosis_period'] = y_eval

# # holdout5

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.024, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=77 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split5 = X_eval.copy()
# eval_split5['metastatic_diagnosis_period'] = y_eval

# # holdout6

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.0245, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=89 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split6 = X_eval.copy()
# eval_split6['metastatic_diagnosis_period'] = y_eval

# # holdout7

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.025, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=77 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split7 = X_eval.copy()
# eval_split7['metastatic_diagnosis_period'] = y_eval

# # holdout8

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.024, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=63 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split8 = X_eval.copy()
# eval_split8['metastatic_diagnosis_period'] = y_eval

# # holdout9

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.0255, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=38 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split9 = X_eval.copy()
# eval_split9['metastatic_diagnosis_period'] = y_eval

# # holdout10

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.026, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=9 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split10 = X_eval.copy()
# eval_split10['metastatic_diagnosis_period'] = y_eval

### Optuna

In [47]:
import optuna
import numpy as np
from autogluon.tabular import TabularPredictor
import os

# Önceden eğitilmiş AutoGluon predictor
automl = TabularPredictor.load(r'AutogluonModels\ag-20240601_144927')

# Optimize edilecek modellerin listesi
models = ['XGBoost_3_BAG_L1', 'RandomForest_2_BAG_L2', 'ExtraTrees_BAG_L2', 'CatBoost_3_BAG_L2', 'LightGBM_3_BAG_L1']

# Veriyi hazırlama
X_splits = [eval_split.drop(columns=['metastatic_diagnosis_period'])]

y_splits = [eval_split['metastatic_diagnosis_period']]

# Optuna optimizasyonu
def objective(trial):
    # Ağırlıklar 0.05'in katları olmalı
    weight_options = [i / 20.0 for i in range(1, 11)]  # 0.05, 0.10, 0.15, ..., 0.5
    weights = [trial.suggest_categorical(f'weight_{i}', weight_options) for i in range(len(models))]
    weights = np.array(weights)
    weights /= weights.sum()  # Ağırlıkları normalize ederek toplamının 1 olmasını sağlıyoruz

    rmses = []
    for X, y in zip(X_splits, y_splits):
        # Seçilen modellerin holdout seti üzerindeki tahminlerini ağırlıklandırarak birleştiriyoruz
        predictions = sum(weight * automl.predict(X, model=model) for model, weight in zip(models, weights))
        predictions /= np.sum(weights)  # Normalize weights after prediction

        # Gerçek değerlerle karşılaştırarak skoru hesaplıyoruz
        rmse = np.sqrt(np.mean((predictions - y) ** 2))
        rmses.append(rmse)

    return np.mean(rmses)

# Optuna çalıştırma
study = optuna.create_study(direction='minimize')
# İlk deneme için belirli ağırlıklarla parametreleri kuyruğa alıyoruz
initial_weights = [0.35, 0.15, 0.10, 0.05, 0.35]
study.enqueue_trial({f'weight_{i}': w for i, w in enumerate(initial_weights)})

study.optimize(objective, n_trials=500)

# En iyi deneme
best_trial = study.best_trial

# En iyi ağırlıklar
best_weights = [best_trial.params[f'weight_{i}'] for i in range(len(models))]
best_weights = np.array(best_weights)
best_weights /= best_weights.sum()

print(f"En iyi RMSE: {best_trial.value}")
print(f"En iyi ağırlıklar: {best_weights}")

[I 2024-06-01 21:22:13,747] A new study created in memory with name: no-name-e40454f2-afdd-4d89-b41b-dd8aa9d596ca


[I 2024-06-01 21:22:18,763] Trial 0 finished with value: 82.31514338082452 and parameters: {'weight_0': 0.35, 'weight_1': 0.15, 'weight_2': 0.1, 'weight_3': 0.05, 'weight_4': 0.35}. Best is trial 0 with value: 82.31514338082452.
[I 2024-06-01 21:22:23,879] Trial 1 finished with value: 82.33541297374742 and parameters: {'weight_0': 0.3, 'weight_1': 0.1, 'weight_2': 0.5, 'weight_3': 0.1, 'weight_4': 0.3}. Best is trial 0 with value: 82.31514338082452.
[I 2024-06-01 21:22:28,898] Trial 2 finished with value: 82.33948912461707 and parameters: {'weight_0': 0.3, 'weight_1': 0.05, 'weight_2': 0.3, 'weight_3': 0.3, 'weight_4': 0.5}. Best is trial 0 with value: 82.31514338082452.
[I 2024-06-01 21:22:33,612] Trial 3 finished with value: 82.44876253561982 and parameters: {'weight_0': 0.15, 'weight_1': 0.5, 'weight_2': 0.05, 'weight_3': 0.15, 'weight_4': 0.05}. Best is trial 0 with value: 82.31514338082452.
[I 2024-06-01 21:22:38,648] Trial 4 finished with value: 82.32273234739239 and parameters: 

En iyi RMSE: 82.28848782455478
En iyi ağırlıklar: [0.55555556 0.05555556 0.05555556 0.05555556 0.27777778]


In [48]:
# import optuna
# import numpy as np
# from autogluon.tabular import TabularPredictor
# import os

# # Önceden eğitilmiş AutoGluon predictor
# automl = TabularPredictor.load(r'AutogluonModels\ag-20240601_144927')

# # Optimize edilecek modellerin listesi
# models = ['XGBoost_3_BAG_L1', 'RandomForest_2_BAG_L2', 'ExtraTrees_BAG_L2', 'CatBoost_3_BAG_L2', 'LightGBM_3_BAG_L2']

# # Veriyi hazırlama
# X_splits = [eval_split.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split2.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split3.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split4.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split5.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split6.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split7.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split8.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split9.drop(columns=['metastatic_diagnosis_period']),
#             # eval_split10.drop(columns=['metastatic_diagnosis_period'])
#             ]

# y_splits = [eval_split['metastatic_diagnosis_period'],
#             # eval_split2['metastatic_diagnosis_period'],
#             # eval_split3['metastatic_diagnosis_period'],
#             # eval_split4['metastatic_diagnosis_period'],
#             # eval_split5['metastatic_diagnosis_period'],
#             # eval_split6['metastatic_diagnosis_period'],
#             # eval_split7['metastatic_diagnosis_period'],
#             # eval_split8['metastatic_diagnosis_period'],
#             # eval_split9['metastatic_diagnosis_period'],
#             # eval_split10['metastatic_diagnosis_period']
#             ]

# # Optuna optimizasyonu
# def objective(trial):
#     # Ağırlıklar 0.05 ile 0.35 arasında olmalı
#     weights = [trial.suggest_float(f'weight_{i}', 0.05, 0.35) for i in range(len(models))]
#     weights = np.array(weights)
#     weights /= weights.sum()  # Ağırlıkları normalize ederek toplamının 1 olmasını sağlıyoruz

#     rmses = []
#     for X, y in zip(X_splits, y_splits):
#         # Seçilen modellerin holdout seti üzerindeki tahminlerini ağırlıklandırarak birleştiriyoruz
#         predictions = sum(weight * automl.predict(X, model=model) for model, weight in zip(models, weights))

#         # Gerçek değerlerle karşılaştırarak skoru hesaplıyoruz
#         rmse = np.sqrt(np.mean((predictions - y) ** 2))
#         rmses.append(rmse)

#     return np.mean(rmses)

# # Optuna çalıştırma
# study = optuna.create_study(direction='minimize')

# # İlk deneme için belirli ağırlıklarla parametreleri kuyruğa alıyoruz
# # initial_weights = [0.18769205, 0.12140385, 0.01334003, 0.11679509, 0.10542093, 0.08884338, 0.19295776, 0.00677248, 0.16677444]
# # study.enqueue_trial({f'weight_{i}': w for i, w in enumerate(initial_weights)})

# study.optimize(objective, n_trials=250)

# # En iyi deneme
# best_trial = study.best_trial

# # En iyi ağırlıklar
# best_weights = [best_trial.params[f'weight_{i}'] for i in range(len(models))]
# best_weights = np.array(best_weights)
# best_weights /= best_weights.sum()

# print(f"En iyi RMSE: {best_trial.value}")
# print(f"En iyi ağırlıklar: {best_weights}")

In [49]:
# import optuna
# import numpy as np
# from autogluon.tabular import TabularPredictor

# # Önceden eğitilmiş AutoGluon predictor'u yükleme
# model_path = os.path.join('AutogluonModels', 'ag-20240530_231812')
# automl = TabularPredictor.load(model_path)

# # Optimize edilecek modellerin listesi
# models = ['RandomForest_2_BAG_L2', 'CatBoost_3_BAG_L2', 'CatBoost_2_BAG_L2', 'ExtraTrees_BAG_L2',  'ExtraTrees_2_BAG_L2', 'XGBoost_2_BAG_L2', 'LightGBM_3_BAG_L1']

# # Veriyi hazırlama
# X = eval_split.drop(columns=['metastatic_diagnosis_period'])
# y = eval_split['metastatic_diagnosis_period']

# # Optuna optimizasyonu
# def objective(trial):
#     # Ağırlıklar 0.05 ile 0.3 arasında olmalı
#     weights = [trial.suggest_float(f'weight_{i}', 0.05, 0.35) for i in range(len(models))]
#     weights = np.array(weights)
#     weights /= weights.sum()  # Ağırlıkları normalize ederek toplamının 1 olmasını sağlıyoruz

#     # Seçilen modellerin holdout seti üzerindeki tahminlerini ağırlıklandırarak birleştiriyoruz
#     predictions = sum(weight * automl.predict(X, model=model) for model, weight in zip(models, weights))

#     # Gerçek değerlerle karşılaştırarak skoru hesaplıyoruz
#     rmse = np.sqrt(np.mean((predictions - y) ** 2))

#     return rmse

# # Optuna çalıştırma
# study = optuna.create_study(direction='minimize')

# # İlk deneme için belirli ağırlıklarla parametreleri kuyruğa alıyoruz
# # initial_weights = [0.29288348, 0.33154581, 0.10962312, 0.07420737, 0.09441869, 0.09732153]
# # study.enqueue_trial({f'weight_{i}': w for i, w in enumerate(initial_weights)})

# study.optimize(objective, n_trials=100)

# # En iyi deneme
# best_trial = study.best_trial

# # En iyi ağırlıklar
# best_weights = [best_trial.params[f'weight_{i}'] for i in range(len(models))]
# best_weights = np.array(best_weights)
# best_weights /= best_weights.sum()

# print(f"En iyi RMSE: {best_trial.value}")
# print(f"En iyi ağırlıklar: {best_weights}")