In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc 
import tqdm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')
submission = pd.read_csv(r'solution_template.csv')

In [3]:
train = train.drop('patient_id', axis=1)
test = test.drop('patient_id', axis=1)

df = pd.concat([train, test], axis=0)

df.columns = df.columns.str.replace(' ', '_', regex=False)
df.columns = df.columns.str.replace('-', '_', regex=False)

In [4]:
df.drop(columns=['metastatic_first_novel_treatment_type', 'metastatic_first_novel_treatment'], inplace=True)

In [5]:
dolu_degerler_sozlugu = df.dropna(subset=['patient_state', 'Division', 'Region']).drop_duplicates('patient_zip3').set_index('patient_zip3')[['patient_state', 'Division', 'Region']].to_dict('index')

# Boş değerleri doldurun
for index, row in df.iterrows():
    if pd.isna(row['patient_state']) or pd.isna(row['Division']) or pd.isna(row['Region']):
        if row['patient_zip3'] in dolu_degerler_sozlugu:
            df.at[index, 'patient_state'] = dolu_degerler_sozlugu[row['patient_zip3']]['patient_state']
            df.at[index, 'Division'] = dolu_degerler_sozlugu[row['patient_zip3']]['Division']
            df.at[index, 'Region'] = dolu_degerler_sozlugu[row['patient_zip3']]['Region']

In [6]:
# Average ile başlayan sütunları bul
average_columns = [col for col in df.columns if col.startswith('Average')]

# Eksik değerleri patient_state ortalamaları ile doldur
for col in average_columns:
    df[col] = df.groupby('patient_state')[col].transform(lambda x: x.fillna(x.mean()))

In [7]:
# Male codes to female 
df['breast_cancer_diagnosis_code'] = df['breast_cancer_diagnosis_code'].replace({
    'C50122':'C50112', 'C50221':'C50211', 'C50421':'C50411', 'C509':'C5091', 'C50922':'C50912', 'C5021':'C50219'
})

# Fix bad zip 
df['patient_state'] = np.where(df['patient_zip3'] == 630, 'MO', np.where(df['patient_zip3'] == 864, 'AZ', df['patient_state']))

In [8]:
df['payer_type'] = df['payer_type'].fillna('Unknown')

In [9]:
df['bmi_missing'] = df['bmi'].isna().astype(int)

In [10]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = df.select_dtypes(include=numerics).columns.drop('metastatic_diagnosis_period')

# 'metastatic_diagnosis_period' hariç numerik sütunlardaki eksik değerleri doldur
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

### PCA

In [11]:
age_columns = ['age_under_10', 'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80']
income_columns = ['income_household_under_5','income_household_5_to_10','income_household_10_to_15','income_household_15_to_20','income_household_20_to_25',
               'income_household_25_to_35','income_household_35_to_50','income_household_50_to_75','income_household_75_to_100','income_household_100_to_150',
               'income_household_150_over']
education_columns = ['education_less_highschool', 'education_highschool', 'education_some_college', 'education_bachelors', 'education_graduate', 'education_college_or_above']
race_columns = ['race_white', 'race_black', 'race_asian', 'race_native', 'race_pacific', 'race_other', 'race_multiple', 'hispanic']
difficulties_columns = ['disabled', 'poverty', 'limited_english', 'commute_time', 'health_uninsured', 'veteran']
workforce_columns = ['labor_force_participation', 'unemployment_rate', 'self_employed', 'farmer']
marital_status = [ 'married', 'divorced', 'never_married', 'widowed']

to_pca_cols = age_columns + income_columns + education_columns + race_columns + difficulties_columns + workforce_columns + marital_status

pca_data = df[to_pca_cols]

if True:
    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=47)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_pca_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_1'] = principal_components[:, 0]
    df['PCA_2'] = principal_components[:, 1]

    #Average cols PCA
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    # PCA modelini oluşturma ve eğitme
    pca = PCA(n_components=2, random_state=77)  # 2 ana bileşen
    principal_components = pca.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['PCA_AVG_1'] = principal_components[:, 0]
    df['PCA_AVG_2'] = principal_components[:, 1]

if False:
    pca_data = df[to_pca_cols]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_pca_data = scaler.fit_transform(pca_data)

    # t-SNE modelini oluşturma ve eğitme
    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_pca_data)

    # t-SNE bileşenlerini DataFrame'e ekleme
    df['TSNE_1'] = tsne_components[:, 0]
    df['TSNE_2'] = tsne_components[:, 1]

    #Average cols TSNE
    average_columns = [col for col in df.columns if col.startswith('Average')]

    average_data = df[average_columns]

    # Verileri ölçeklendirme
    scaler = StandardScaler()
    scaled_average_data = scaler.fit_transform(average_data)

    tsne = TSNE(n_components=2, learning_rate='auto', init='random', random_state=6)
    tsne_components = tsne.fit_transform(scaled_average_data)

    # Ana bileşenleri DataFrame'e ekleme
    df['TSNE_AVG_1'] = tsne_components[:, 0]
    df['TSNE_AVG_2'] = tsne_components[:, 1]

In [12]:
# En iyi random_state'i bulmak için değişken
best_random_state = None
best_explained_variance = 0

# Farklı random_state değerleri üzerinde deneme
for random_state in range(0, 100):
    pca = PCA(n_components=2, random_state=random_state)
    principal_components = pca.fit_transform(scaled_average_data)
    explained_variance = np.sum(pca.explained_variance_ratio_)
    
    if explained_variance > best_explained_variance:
        best_explained_variance = explained_variance
        best_random_state = random_state

print(f"En iyi random_state: {best_random_state} ile açıklanan varyans oranı: {best_explained_variance}")

En iyi random_state: 77 ile açıklanan varyans oranı: 0.9410977030599198


### breast_cancer_diagnosis_desc

In [13]:
def group_descriptions(desc):
    if 'unspecified' in desc or 'unsp site of' in desc:
        return 'unspecified'
    if 'upper-outer quadrant of' in desc:
        return 'upper-outer quadrant of'
    if 'central' in desc:
        return 'central'
    if 'ovrlp sites of' in desc or 'overlapping' in desc:
        return 'overlapping'
    if 'nipple and areola' in desc:
        return 'nipple and areola'
    if 'upper-inner' in desc:
        return 'upper-inner'
    if 'lower-outer' in desc:
        return 'lower-outer'
    if 'lower-inner' in desc:
        return 'lower-inner'
    if 'axillary tail' in desc:
        return 'axillary tail'
    return desc  
# Uygulama
df['breast_cancer_diagnosis_desc'] = df['breast_cancer_diagnosis_desc'].apply(group_descriptions)

In [14]:
df['age_group'] = pd.cut(df['patient_age'], right=False, bins=[0, 30, 40, 50, 60, 70, 80, 90, np.inf], labels=[0,1,2,3,4,5,6,7]).astype(int)
df['icd_9'] = df['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)

In [15]:
# ICD-10 kodlarını ICD-9 kodlarına eşleştiren bir harita
icd10_to_icd9 = {
    'C509': '1749', 'C508': '1748', 'C504': '1744', 'C501': '1741', 'C505': '1745', 'C502': '1742', 
    'C503': '1743', 'C506': '1746', 'C7981': '19881', 'C500': '1740'
}

# Eşleştirme fonksiyonu
def map_icd10_to_icd9(code):
    # ICD-10 kodlarının ilk üç karakterini kullanarak eşleştirme yapıyoruz
    icd10_prefix = code[:4]
    if icd10_prefix in icd10_to_icd9:
        return icd10_to_icd9[icd10_prefix]
    else:
        return code

# Yeni sütunu oluşturma
df['icd9_feat'] = df['breast_cancer_diagnosis_code'].apply(map_icd10_to_icd9)

In [16]:
df.icd9_feat.unique()

array(['1749', '1744', '1748', '1742', '1740', '1741', '1743', '1745',
       '19881', '1746'], dtype=object)

In [17]:
df['patient_zip3'] = df['patient_zip3'].astype('category')

In [18]:
# 'metastatic_diagnosis_period' sütununda değerleri olanları 'train' DataFrame'ine ayır
train = df[df['metastatic_diagnosis_period'].notnull()]
train.reset_index(inplace=True, drop=True)

# 'metastatic_diagnosis_period' sütununda değerleri olmayanları 'test' DataFrame'ine ayır
test = df[df['metastatic_diagnosis_period'].isnull()]
test.drop('metastatic_diagnosis_period', axis=1, inplace=True)
test.reset_index(inplace=True, drop=True)

### Feature Selection

In [19]:
train = train[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
               'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type', 
               'PCA_AVG_2', 'Region', 'metastatic_diagnosis_period']]

test = test[['icd_9', 'patient_age', 'breast_cancer_diagnosis_desc',
             'metastatic_cancer_diagnosis_code', 'patient_state', 'payer_type',
             'PCA_AVG_2', 'Region']]

# 'PCA_1', 'PCA_2', 'PCA_AVG_1', 'PCA_AVG_2', 'Region', 'TSNE_1', 'TSNE_2', 'TSNE_AVG_1', 'TSNE_AVG_2', 'bmi_missing', 'age_group', 'icd_9'

In [20]:
%%time
from autogluon.tabular import TabularDataset, TabularPredictor

time_limit = 3600*10

automl = TabularPredictor(label='metastatic_diagnosis_period', problem_type='regression',
                          eval_metric='root_mean_squared_error'
                          )

automl.fit(train, presets='medium_quality', time_limit=time_limit, num_bag_folds=5, num_bag_sets=0, num_stack_levels=1, dynamic_stacking=True, 
            included_model_types=['XGB', 'CAT', 'XT', 'RF', 'GBM'], ag_args_fit={'num_gpus': 1, 'num_cpus': 8},
            hyperparameters = {
              'GBM': [
              # #     {'max_depth': 8, 'n_estimators': 100000, 'feature_fraction': 0.6},
              # #     {'max_depth': 7, 'n_estimators': 100000, 'feature_fraction': 0.6},
                  {'max_depth': 6, 'n_estimators': 100000},
                  {'max_depth': 5, 'n_estimators': 100000},
                  {'max_depth': 4, 'n_estimators': 100000},
                  {'max_depth': 3, 'n_estimators': 100000},
                  {'max_depth': 2, 'n_estimators': 100000},
              ],
              'XGB': [
                  # {'max_depth': 8, 'n_estimators': 100000, 'feature_fraction': 0.6},
                  # {'max_depth': 7, 'n_estimators': 100000, 'feature_fraction': 0.6},
                  {'max_depth': 6, 'n_estimators': 100000},
                  {'max_depth': 5, 'n_estimators': 100000},
                  {'max_depth': 4, 'n_estimators': 100000},
                  {'max_depth': 3, 'n_estimators': 100000},
                  {'max_depth': 2, 'n_estimators': 100000},
              ],
              'CAT': [
                  # {'depth': 8, 'iterations': 100000},
                  # {'depth': 7, 'iterations': 100000},
                  {'depth': 6, 'iterations': 100000},
                  {'depth': 5, 'iterations': 100000},
                  {'depth': 4, 'iterations': 100000},
                  {'depth': 3, 'iterations': 100000},
                  {'depth': 2, 'iterations': 100000},
              ],
              'XT': [
                  # {'max_depth': 8},
                  # {'max_depth': 7},
                  {'max_depth': 6},
                  {'max_depth': 5},
                  {'max_depth': 4},
                  {'max_depth': 3},
                  {'max_depth': 2},
              ],
              'RF': [
                  # {'max_depth': 8},
                  # {'max_depth': 7},
                  {'max_depth': 6},
                  {'max_depth': 5},
                  {'max_depth': 4},
                  {'max_depth': 3},
                  {'max_depth': 2},
              ],
            #   'KNN': [
            #       {'n_neighbors': 30},
            #       {'n_neighbors': 60},
            #       {'n_neighbors': 100},
            #   ],
            #   'LR': [
            #       {},  # Boş dictionary default değerleri kullanır
            #   ]
          },
            keep_only_best = False
          )

No path specified. Models will be saved in: "AutogluonModels\ag-20240603_094536"
Presets specified: ['medium_quality']
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 36000 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240603_094536\ds_sub_fit\sub_fit_ho.
Beginning AutoGluon training ... Time limit = 9000s
AutoGluon will save models to "AutogluonModels\ag-20240603_094536\ds_sub_fit\sub_fit_ho"
AutoGluon Version:  1.1.1b20240426
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:   

[1000]	valid_set's rmse: 81.173
[2000]	valid_set's rmse: 81.248


	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 83.7032


	-81.5422	 = Validation score   (-root_mean_squared_error)
	8.32s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: RandomForest_BAG_L2 ... Training model for up to 8397.34s of the 8397.3s of remaining time.
	-81.4593	 = Validation score   (-root_mean_squared_error)
	4.72s	 = Training   runtime
	0.27s	 = Validation runtime
Fitting model: RandomForest_2_BAG_L2 ... Training model for up to 8392.24s of the 8392.21s of remaining time.
	-81.5675	 = Validation score   (-root_mean_squared_error)
	4.03s	 = Training   runtime
	0.28s	 = Validation runtime
Fitting model: RandomForest_3_BAG_L2 ... Training model for up to 8387.82s of the 8387.79s of remaining time.
	-81.6761	 = Validation score   (-root_mean_squared_error)
	3.28s	 = Training   runtime
	0.28s	 = Validation runtime
Fitting model: RandomForest_4_BAG_L2 ... Training model for up to 8384.11s of the 8384.08s of remaining time.
	-81.8844	 = Validation score   (-root_mean_squared_error)
	2.57s	 = Training   runtime
	0.28s	

[1000]	valid_set's rmse: 81.1866


	Training S1F2 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F3 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F4 with GPU, note that this may negatively impact model quality compared to CPU training.
	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's rmse: 80.2364


	-81.2314	 = Validation score   (-root_mean_squared_error)
	7.51s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: RandomForest_BAG_L2 ... Training model for up to 34310.71s of the 34310.68s of remaining time.
	-80.998	 = Validation score   (-root_mean_squared_error)
	5.72s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: RandomForest_2_BAG_L2 ... Training model for up to 34304.55s of the 34304.52s of remaining time.
	-81.1266	 = Validation score   (-root_mean_squared_error)
	4.71s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: RandomForest_3_BAG_L2 ... Training model for up to 34299.42s of the 34299.39s of remaining time.
	-81.2648	 = Validation score   (-root_mean_squared_error)
	3.84s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: RandomForest_4_BAG_L2 ... Training model for up to 34295.13s of the 34295.1s of remaining time.
	-81.5873	 = Validation score   (-root_mean_squared_error)
	2.94s	 = Training   runtime


CPU times: total: 1h 16min 32s
Wall time: 35min 3s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2a94a816f20>

In [26]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-80.719,root_mean_squared_error,3.74,802.338,0.0,0.051,3,True,52
1,LightGBM_BAG_L2,-80.986,root_mean_squared_error,3.107,590.471,0.073,13.654,2,True,27
2,RandomForest_BAG_L2,-80.998,root_mean_squared_error,3.355,582.541,0.321,5.724,2,True,32
3,CatBoost_BAG_L2,-80.999,root_mean_squared_error,3.078,671.2,0.045,94.383,2,True,37
4,LightGBM_2_BAG_L2,-81.031,root_mean_squared_error,3.1,590.325,0.066,13.508,2,True,28
5,CatBoost_2_BAG_L2,-81.035,root_mean_squared_error,3.077,650.317,0.043,73.5,2,True,38
6,CatBoost_3_BAG_L2,-81.103,root_mean_squared_error,3.072,632.238,0.039,55.421,2,True,39
7,LightGBM_4_BAG_L2,-81.121,root_mean_squared_error,3.108,585.155,0.074,8.338,2,True,30
8,RandomForest_2_BAG_L2,-81.127,root_mean_squared_error,3.355,581.523,0.321,4.706,2,True,33
9,CatBoost_4_BAG_L2,-81.158,root_mean_squared_error,3.07,623.669,0.036,46.852,2,True,40


In [52]:
# Tahminleri bir liste içinde saklıyoruz
models = ['RandomForest_BAG_L2', 'CatBoost_BAG_L2']
weights = [0.5, 0.5]

# Tahminleri hesaplıyoruz ve ağırlıklandırıyoruz
predictions = sum(weight * automl.predict(test, model=model) for model, weight in zip(models, weights))

In [55]:
submission["metastatic_diagnosis_period"] = predictions

submission

Unnamed: 0,patient_id,metastatic_diagnosis_period
0,730681,220.000
1,334212,57.000
2,571362,204.000
3,907331,218.000
4,208382,45.000
...,...,...
5641,649011,43.000
5642,869024,74.000
5643,304800,230.000
5644,267406,67.000


In [56]:
submission.to_csv('submission.csv', index=False)