In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc 
import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.filterwarnings('ignore')

In [23]:
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')
submission = pd.read_csv(r'solution_template.csv')

In [24]:
train = train.drop('patient_id', axis=1)
test = test.drop('patient_id', axis=1)

df = pd.concat([train, test], axis=0)

df.columns = df.columns.str.replace(' ', '_', regex=False)
df.columns = df.columns.str.replace('-', '_', regex=False)

In [25]:
df.drop(columns=['metastatic_first_novel_treatment_type', 'metastatic_first_novel_treatment'], inplace=True)

In [26]:
dolu_degerler_sozlugu = df.dropna(subset=['patient_state', 'Division', 'Region']).drop_duplicates('patient_zip3').set_index('patient_zip3')[['patient_state', 'Division', 'Region']].to_dict('index')

# Boş değerleri doldurun
for index, row in df.iterrows():
    if pd.isna(row['patient_state']) or pd.isna(row['Division']) or pd.isna(row['Region']):
        if row['patient_zip3'] in dolu_degerler_sozlugu:
            df.at[index, 'patient_state'] = dolu_degerler_sozlugu[row['patient_zip3']]['patient_state']
            df.at[index, 'Division'] = dolu_degerler_sozlugu[row['patient_zip3']]['Division']
            df.at[index, 'Region'] = dolu_degerler_sozlugu[row['patient_zip3']]['Region']

In [27]:
# Average ile başlayan sütunları bul
average_columns = [col for col in df.columns if col.startswith('Average')]

# Eksik değerleri patient_state ortalamaları ile doldur
for col in average_columns:
    df[col] = df.groupby('patient_state')[col].transform(lambda x: x.fillna(x.mean()))

In [28]:
# Male codes to female 
df['breast_cancer_diagnosis_code'] = df['breast_cancer_diagnosis_code'].replace({
    'C50122':'C50112', 'C50221':'C50211', 'C50421':'C50411', 'C509':'C5091', 'C50922':'C50912', 'C5021':'C50219'
})

# Fix bad zip 
df['patient_state'] = np.where(df['patient_zip3'] == 630, 'MO', np.where(df['patient_zip3'] == 864, 'AZ', df['patient_state']))

In [29]:
df['payer_type'] = df['payer_type'].fillna('Unknown')

In [30]:
df['bmi_missing'] = df['bmi'].isna().astype(int)

In [31]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = df.select_dtypes(include=numerics).columns.drop('metastatic_diagnosis_period')

# 'metastatic_diagnosis_period' hariç numerik sütunlardaki eksik değerleri doldur
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

### PCA

In [32]:
age_columns = ['age_under_10', 'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80']
income_columns = ['income_household_under_5','income_household_5_to_10','income_household_10_to_15','income_household_15_to_20','income_household_20_to_25',
               'income_household_25_to_35','income_household_35_to_50','income_household_50_to_75','income_household_75_to_100','income_household_100_to_150',
               'income_household_150_over']
education_columns = ['education_less_highschool', 'education_highschool', 'education_some_college', 'education_bachelors', 'education_graduate', 'education_college_or_above']
race_columns = ['race_white', 'race_black', 'race_asian', 'race_native', 'race_pacific', 'race_other', 'race_multiple', 'hispanic']
difficulties_columns = ['disabled', 'poverty', 'limited_english', 'commute_time', 'health_uninsured', 'veteran']
workforce_columns = ['labor_force_participation', 'unemployment_rate', 'self_employed', 'farmer']
marital_status = [ 'married', 'divorced', 'never_married', 'widowed']

to_pca_cols = age_columns + income_columns + education_columns + race_columns + difficulties_columns + workforce_columns + marital_status

pca_data = df[to_pca_cols]

if True:
    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=47)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_pca_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_1'] = principal_components[:, 0]
    df['PCA_2'] = principal_components[:, 1]

    #Average cols PCA
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=77)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_AVG_1'] = principal_components[:, 0]
    df['PCA_AVG_2'] = principal_components[:, 1]

if False:
    pca_data = df[to_pca_cols]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # t-SNE modelini oluşturma ve eğitme
    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_pca_data)

    # t-SNE bileşenlerini DataFrame'e ekleme
    df['TSNE_1'] = tsne_components[:, 0]
    df['TSNE_2'] = tsne_components[:, 1]

    #Average cols TSNE
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['TSNE_AVG_1'] = tsne_components[:, 0]
    df['TSNE_AVG_2'] = tsne_components[:, 1]

In [33]:
# En iyi random_state'i bulmak için değişken
best_random_state = None
best_explained_variance = 0

# Farklı random_state değerleri üzerinde deneme
for random_state in range(0, 100):
    pca = PCA(n_components=2, random_state=random_state)
    principal_components = pca.fit_transform(scaled_average_data)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    
    if explained_variance > best_explained_variance:
        best_explained_variance = explained_variance
        best_random_state = random_state

print(f"En iyi random_state: {best_random_state} ile açıklanan varyans oranı: {best_explained_variance}")

En iyi random_state: 77 ile açıklanan varyans oranı: 0.9410977030599198


### breast_cancer_diagnosis_desc

In [34]:
def group_descriptions(desc):
    if 'unspecified' in desc or 'unsp site of' in desc:
        return 'unspecified'
    if 'upper-outer quadrant of' in desc:
        return 'upper-outer quadrant of'
    if 'central' in desc:
        return 'central'
    if 'ovrlp sites of' in desc or 'overlapping' in desc:
        return 'overlapping'
    if 'nipple and areola' in desc:
        return 'nipple and areola'
    if 'upper-inner' in desc:
        return 'upper-inner'
    if 'lower-outer' in desc:
        return 'lower-outer'
    if 'lower-inner' in desc:
        return 'lower-inner'
    if 'axillary tail' in desc:
        return 'axillary tail'
    return desc  
# Uygulama
df['breast_cancer_diagnosis_desc'] = df['breast_cancer_diagnosis_desc'].apply(group_descriptions)

In [35]:
df['age_group'] = pd.cut(df['patient_age'], right=False, bins=[0, 30, 40, 50, 60, 70, 80, 90, np.inf], labels=[0,1,2,3,4,5,6,7]).astype(int)
df['icd_9'] = df['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)

In [36]:
# ICD-10 kodlarını ICD-9 kodlarına eşleştiren bir harita
icd10_to_icd9 = {
    'C509': '1749', 'C508': '1748', 'C504': '1744', 'C501': '1741', 'C505': '1745', 'C502': '1742', 
    'C503': '1743', 'C506': '1746', 'C7981': '19881', 'C500': '1740'
}

# Eşleştirme fonksiyonu
def map_icd10_to_icd9(code):
    # ICD-10 kodlarının ilk üç karakterini kullanarak eşleştirme yapıyoruz
    icd10_prefix = code[:4]
    if icd10_prefix in icd10_to_icd9:
        return icd10_to_icd9[icd10_prefix]
    else:
        return code

# Yeni sütunu oluşturma
df['icd9_feat'] = df['breast_cancer_diagnosis_code'].apply(map_icd10_to_icd9)

In [37]:
df.icd9_feat.unique()

array(['1749', '1744', '1748', '1742', '1740', '1741', '1743', '1745',
       '19881', '1746'], dtype=object)

In [38]:
# 'metastatic_diagnosis_period' sütununda değerleri olanları 'train' DataFrame'ine ayır
train = df[df['metastatic_diagnosis_period'].notnull()]
train.reset_index(inplace=True, drop=True)

# 'metastatic_diagnosis_period' sütununda değerleri olmayanları 'test' DataFrame'ine ayır
test = df[df['metastatic_diagnosis_period'].isnull()]
test.drop('metastatic_diagnosis_period', axis=1, inplace=True)
test.reset_index(inplace=True, drop=True)

### Selection

In [39]:
train = train[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
               'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type', 
               'PCA_AVG_2', 'Region', 'metastatic_diagnosis_period']]

test = test[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
             'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type',
             'PCA_AVG_2', 'Region']]

# 'PCA_1', 'PCA_2', 'PCA_AVG_1', 'PCA_AVG_2', 'Region', 'TSNE_1', 'TSNE_2', 'TSNE_AVG_1', 'TSNE_AVG_2', 'bmi_missing', 'age_group', 'icd_9'

### Holdouts for validation (Optional)

In [40]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=['metastatic_diagnosis_period'])
y = train['metastatic_diagnosis_period']

X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, 
    test_size=0.1, # Değerlendirme seti oranı
    shuffle=True,
    stratify=y,     # Hedef değişkenine göre stratify
    random_state=6 # Sabit rastgelelik
)

# Eğitim ve değerlendirme setlerini birleştir
train_split = X_train.copy()
train_split['metastatic_diagnosis_period'] = y_train

eval_split = X_eval.copy()
eval_split['metastatic_diagnosis_period'] = y_eval

# # # holdout 2

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.11, # Değerlendirme seti oranı
#     shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=7 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split2 = X_eval.copy()
# eval_split2['metastatic_diagnosis_period'] = y_eval

# # # holdout 3

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.12, # Değerlendirme seti oranı
#     shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=61 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split3 = X_eval.copy()
# eval_split3['metastatic_diagnosis_period'] = y_eval

# # holdout4

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.13, # Değerlendirme seti oranı
#     shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=42 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split4 = X_eval.copy()
# eval_split4['metastatic_diagnosis_period'] = y_eval

# # holdout5

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.14, # Değerlendirme seti oranı
#     shuffle=True,
#     stratify=y,     # Hedef değişkenine göre stratify
#     random_state=77 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split5 = X_eval.copy()
# eval_split5['metastatic_diagnosis_period'] = y_eval

# # holdout6

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.0245, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=89 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split6 = X_eval.copy()
# eval_split6['metastatic_diagnosis_period'] = y_eval

# # holdout7

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.025, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=77 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split7 = X_eval.copy()
# eval_split7['metastatic_diagnosis_period'] = y_eval

# # holdout8

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.024, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=63 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split8 = X_eval.copy()
# eval_split8['metastatic_diagnosis_period'] = y_eval

# # holdout9

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.0255, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=38 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split9 = X_eval.copy()
# eval_split9['metastatic_diagnosis_period'] = y_eval

# # holdout10

# X = train_split.drop(columns=['metastatic_diagnosis_period'])
# y = train_split['metastatic_diagnosis_period']

# X_train, X_eval, y_train, y_eval = train_test_split(
#     X, y, 
#     test_size=0.026, # Değerlendirme seti oranı
#     shuffle=True,
#     # stratify=y,     # Hedef değişkenine göre stratify
#     random_state=9 # Sabit rastgelelik
# )

# # Eğitim ve değerlendirme setlerini birleştir
# train_split = X_train.copy()
# train_split['metastatic_diagnosis_period'] = y_train

# eval_split10 = X_eval.copy()
# eval_split10['metastatic_diagnosis_period'] = y_eval

In [41]:
train.shape, train_split.shape, eval_split.shape

((13173, 9), (11855, 9), (1318, 9))

In [54]:
%%time
from autogluon.tabular import TabularDataset, TabularPredictor

time_limit = 3600*20

automl = TabularPredictor(label='metastatic_diagnosis_period', problem_type='regression',
                          eval_metric='root_mean_squared_error'
                          )

automl.fit(train_split, presets='medium_quality', time_limit=time_limit, num_bag_folds=10, num_bag_sets=0, num_stack_levels=1, dynamic_stacking=False, 
            included_model_types=['XGB', 'CAT', 'GBM', 'XT', 'RF', 'LR'], ag_args_fit={'num_gpus': 1, 'num_cpus': 8},
            hyperparameters = {
              'GBM': [
                  {'max_depth': 4, 'n_estimators': 100000, 'feature_fraction': 0.75},
                  {'max_depth': 3, 'n_estimators': 100000, 'feature_fraction': 0.9},
                  {'max_depth': 2, 'n_estimators': 100000, 'feature_fraction': 1.0},
              ],
              'XGB': [
                  {'max_depth': 4, 'n_estimators': 100000, 'feature_fraction': 0.75},
                  {'max_depth': 3, 'n_estimators': 100000, 'feature_fraction': 0.9},
                  {'max_depth': 2, 'n_estimators': 100000, 'feature_fraction': 1.0},
              ],
              'CAT': [
                  {'depth': 4, 'iterations': 100000},
                  {'depth': 3, 'iterations': 100000},
                  {'depth': 2, 'iterations': 100000},
              ],
              'XT': [
                  {'max_depth': 6},
                  {'max_depth': 4},
              ],
              'RF': [
                  {'max_depth': 6},
                  {'max_depth': 4},
              ],
            #   'KNN': [
            #       {'n_neighbors': 30},
            #       {'n_neighbors': 60},
            #       {'n_neighbors': 100},
            #   ],
              # 'LR': [
              #     {},  # Boş dictionary default değerleri kullanır
              # ]
          },
            keep_only_best = False
          )

No path specified. Models will be saved in: "AutogluonModels\ag-20240601_170021"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 72000s
AutoGluon will save models to "AutogluonModels\ag-20240601_170021"
AutoGluon Version:  1.1.1b20240426
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       5.14 GB / 15.42 GB (33.3%)
Disk Space Avail:   553.56 GB / 931.51 GB (59.4%)
Train Data Rows:    11855
Train Data Columns: 8
Label Column:       metastatic_diagnosis_period
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5267.97 MB
	Train Data (Original)  Memory Usage: 3.87 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the 

[1000]	valid_set's rmse: 85.486
[2000]	valid_set's rmse: 85.3309


	Training S1F3 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F4 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 79.949


	Training S1F6 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 78.7115
[2000]	valid_set's rmse: 78.3399


	Training S1F7 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F8 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 80.3996
[2000]	valid_set's rmse: 80.3487


	Training S1F9 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F10 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 78.8657
[2000]	valid_set's rmse: 78.2255
[3000]	valid_set's rmse: 78.1807


	-80.9759	 = Validation score   (-root_mean_squared_error)
	44.82s	 = Training   runtime
	0.24s	 = Validation runtime
Fitting model: LightGBM_2_BAG_L2 ... Training model for up to 71511.32s of the 71511.31s of remaining time.
	Fitting 10 child models (S1F1 - S1F10) | Fitting with SequentialLocalFoldFittingStrategy
	Training S1F1 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F2 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 85.2153
[2000]	valid_set's rmse: 84.8153
[3000]	valid_set's rmse: 84.6701
[4000]	valid_set's rmse: 84.7011


	Training S1F3 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 83.0808
[2000]	valid_set's rmse: 82.5488
[3000]	valid_set's rmse: 82.222
[4000]	valid_set's rmse: 81.8587
[5000]	valid_set's rmse: 81.6465
[6000]	valid_set's rmse: 81.4765
[7000]	valid_set's rmse: 81.6301


	Training S1F4 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F6 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 78.884
[2000]	valid_set's rmse: 78.3979


	Training S1F7 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F8 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 79.9886


	Training S1F9 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F10 with GPU, note that this may negatively impact model quality compared to CPU training.
	-80.9119	 = Validation score   (-root_mean_squared_error)
	49.44s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: LightGBM_3_BAG_L2 ... Training model for up to 71460.91s of the 71460.89s of remaining time.
	Fitting 10 child models (S1F1 - S1F10) | Fitting with SequentialLocalFoldFittingStrategy
	Training S1F1 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F2 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 85.6737
[2000]	valid_set's rmse: 85.2487
[3000]	valid_set's rmse: 84.938
[4000]	valid_set's rmse: 84.5564
[5000]	valid_set's rmse: 84.4984
[6000]	valid_set's rmse: 84.553


	Training S1F3 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F4 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F6 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F7 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 78.7981


	Training S1F8 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F9 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F10 with GPU, note that this may negatively impact model quality compared to CPU training.
	-81.3049	 = Validation score   (-root_mean_squared_error)
	18.29s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: RandomForest_BAG_L2 ... Training model for up to 71442.02s of the 71442.0s of remaining time.
	-81.3542	 = Validation score   (-root_mean_squared_error)
	2.9s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: RandomForest_2_BAG_L2 ... Training model for up to 71438.77s of the 71438.75s of remaining time.
	-81.4074	 = Validation score   (-root_mean_squared_error)
	2.03s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 71436.39s of the 71436.38s of remaining time.
	Fitting 10 chi

CPU times: total: 36min 39s
Wall time: 12min 58s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1d0ca5f8520>

In [55]:
automl.leaderboard(eval_split)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost_3_BAG_L1,-82.026,-81.82,root_mean_squared_error,0.417,0.16,29.239,0.417,0.16,29.239,1,True,13
1,RandomForest_2_BAG_L2,-82.04,-81.407,root_mean_squared_error,2.784,2.082,441.696,0.091,0.246,2.029,2,True,19
2,XGBoost_2_BAG_L1,-82.119,-81.762,root_mean_squared_error,0.379,0.13,22.237,0.379,0.13,22.237,1,True,12
3,RandomForest_BAG_L2,-82.163,-81.354,root_mean_squared_error,2.8,2.085,442.572,0.107,0.249,2.905,2,True,18
4,ExtraTrees_BAG_L2,-82.192,-81.411,root_mean_squared_error,2.793,2.08,440.331,0.1,0.244,0.664,2,True,23
5,CatBoost_3_BAG_L2,-82.21,-81.382,root_mean_squared_error,2.814,1.891,487.201,0.121,0.055,47.533,2,True,22
6,ExtraTrees_2_BAG_L2,-82.21,-81.485,root_mean_squared_error,2.788,2.067,440.351,0.095,0.231,0.684,2,True,24
7,CatBoost_2_BAG_L2,-82.217,-81.369,root_mean_squared_error,2.814,1.893,486.844,0.121,0.057,47.176,2,True,21
8,LightGBM_3_BAG_L2,-82.244,-81.305,root_mean_squared_error,2.977,2.098,457.958,0.284,0.262,18.29,2,True,17
9,WeightedEnsemble_L2,-82.248,-81.519,root_mean_squared_error,1.242,0.65,191.797,0.011,0.0,0.033,2,True,14


In [44]:
# eval_split_df = automl.leaderboard(eval_split)
# eval_split2_df = automl.leaderboard(eval_split2)
# eval_split3_df = automl.leaderboard(eval_split3)
# eval_split4_df = automl.leaderboard(eval_split4)
# eval_split5_df = automl.leaderboard(eval_split5)
# # eval_split6_df = automl.leaderboard(eval_split6)
# # eval_split7_df = automl.leaderboard(eval_split7)
# # eval_split8_df = automl.leaderboard(eval_split8)
# # eval_split9_df = automl.leaderboard(eval_split9)
# # eval_split10_df = automl.leaderboard(eval_split10)

In [45]:
# # Combine the leaderboards into a list
# leaderboards = [eval_split_df, eval_split2_df, eval_split3_df, eval_split4_df, eval_split5_df]
# # , eval_split4_df, eval_split5_df, eval_split6_df, eval_split7_df, eval_split8_df, eval_split9_df, eval_split10_df

# # Calculate the ranks for each model in each leaderboard
# for i, df in enumerate(leaderboards):
#     # df = df.sort_values('score_val', ascending=False).reset_index(drop=True)
#     df['rank'] = df.index + 1
#     leaderboards[i] = df

# # Combine all the ranks into a single DataFrame
# ranks_df = pd.concat([df[['model', 'rank']] for df in leaderboards])

# # Calculate the average rank (index) for each model
# average_ranks = ranks_df.groupby('model')['rank'].mean().reset_index()
# average_ranks = average_ranks.rename(columns={'rank': 'average_index'})

# # Sort the models by their average rank
# average_ranks = average_ranks.sort_values('average_index').reset_index(drop=True)

# average_ranks

In [46]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-80.729,root_mean_squared_error,4.23,1046.031,0.001,0.05,3,True,28
1,XGBoost_BAG_L2,-81.077,root_mean_squared_error,3.034,842.454,0.244,44.169,2,True,25
2,LightGBM_BAG_L2,-81.1,root_mean_squared_error,3.056,855.012,0.266,56.727,2,True,15
3,LightGBM_2_BAG_L2,-81.11,root_mean_squared_error,3.065,841.401,0.275,43.115,2,True,16
4,XGBoost_2_BAG_L2,-81.18,root_mean_squared_error,3.055,845.674,0.265,47.388,2,True,26
5,LightGBM_3_BAG_L2,-81.205,root_mean_squared_error,3.179,854.582,0.389,56.297,2,True,17
6,XGBoost_3_BAG_L2,-81.276,root_mean_squared_error,3.113,863.302,0.323,65.017,2,True,27
7,CatBoost_BAG_L2,-81.318,root_mean_squared_error,2.895,918.58,0.105,120.295,2,True,20
8,RandomForest_BAG_L2,-81.334,root_mean_squared_error,3.155,801.449,0.365,3.164,2,True,18
9,CatBoost_2_BAG_L2,-81.345,root_mean_squared_error,2.893,896.283,0.103,97.997,2,True,21


In [47]:
# CatBoost_2_BAG_L2
# WeightedEnsemble_L2
# XGBoost_3_BAG_L2
# ExtraTrees_2_BAG_L2
# RandomForest_2_BAG_L2
# LightGBM_3_BAG_L1

In [48]:
# Tahminleri bir liste içinde saklıyoruz
models = ['CatBoost_2_BAG_L2', 'ExtraTrees_2_BAG_L2']
weights = [0.6, 0.4]

# Tahminleri hesaplıyoruz ve ağırlıklandırıyoruz
predictions = sum(weight * automl.predict(test, model=model) for model, weight in zip(models, weights))

In [49]:
submission["metastatic_diagnosis_period"] = predictions

submission

Unnamed: 0,patient_id,metastatic_diagnosis_period
0,730681,216.798
1,334212,58.840
2,571362,196.725
3,907331,203.535
4,208382,41.914
...,...,...
5641,649011,40.681
5642,869024,76.532
5643,304800,223.824
5644,267406,64.749


In [50]:
submission.to_csv('2model.csv', index=False)

In [51]:
# feature_importances = automl.feature_importance(data=train)

In [52]:
# feature_importances