In [1]:
import numpy as np #🐷
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
# Echo the full path of every file found under the Kaggle input mount.
for found_path in (os.path.join(root, name)
                   for root, _, names in os.walk('/kaggle/input')
                   for name in names):
    print(found_path)

/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv
/kaggle/input/widsdatathon2024-challenge1/training.csv
/kaggle/input/widsdatathon2024-challenge1/test.csv
/kaggle/input/catboost/submission.csv
/kaggle/input/catboost/catboost_info/test_error.tsv
/kaggle/input/catboost/catboost_info/learn_error.tsv
/kaggle/input/catboost/catboost_info/catboost_training.json
/kaggle/input/catboost/catboost_info/time_left.tsv
/kaggle/input/catboost/catboost_info/learn/events.out.tfevents
/kaggle/input/catboost/catboost_info/test/events.out.tfevents
/kaggle/input/catboost2/submission.csv
/kaggle/input/catboost2/catboost_info/test_error.tsv
/kaggle/input/catboost2/catboost_info/learn_error.tsv
/kaggle/input/catboost2/catboost_info/catboost_training.json
/kaggle/input/catboost2/catboost_info/time_left.tsv
/kaggle/input/catboost2/catboost_info/learn/events.out.tfevents
/kaggle/input/catboost2/catboost_info/test/events.out.tfevents
/kaggle/input/v1-datat/submission.csv
/kaggle/input/v1-datat/pred

NN model/ submissions tweaking/ outliers/ Region & Division

In [2]:
from catboost import Pool, CatBoostClassifier as catboost
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgb, log_evaluation, early_stopping
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, IsolationForest
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix
from category_encoders import CatBoostEncoder
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.neighbors import LocalOutlierFactor
import optuna
from sklearn.utils import resample

In [3]:
# Disable pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

Load Data

In [4]:
path = '/kaggle/input/widsdatathon2024-challenge1'
# Competition files: labelled training set, unlabelled test set and the submission template.
training, test, submission = (
    pd.read_csv(f"{path}/{stem}.csv")
    for stem in ('training', 'test', 'sample_submission')
)
# Outputs of earlier runs, read from separately attached input datasets.
submission_best = pd.read_csv("/kaggle/input/v1-datat/submission.csv")
prediction_best = pd.read_csv("/kaggle/input/v1-datat/prediction.csv")
catboost_df = pd.read_csv("/kaggle/input/catboost2/submission.csv")

<span style="color:crimson;">Columns by category</span> patients, zipinfo, air

In [5]:
# Column names grouped by origin; fillbystate/filltest use zip_col and air_col
# to decide which numeric columns are imputed per state.

# Patient-level columns (identifier, demographics, diagnosis/treatment codes, geography).
pat_col = ['patient_id', 'patient_race', 'payer_type', 'patient_state', 'patient_zip3', 'patient_age', 'patient_gender', 'bmi','breast_cancer_diagnosis_code', 'breast_cancer_diagnosis_desc',
       'metastatic_cancer_diagnosis_code', 'metastatic_first_novel_treatment',
       'metastatic_first_novel_treatment_type', 'Region', 'Division']
# ZIP-code level ("zipinfo") columns.
zip_col = ['population', 'density', 'age_median', 'age_under_10', 'age_10_to_19',
       'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s',
       'age_over_80', 'male', 'female', 'married', 'divorced', 'never_married',
       'widowed', 'family_size', 'family_dual_income',
       'income_household_median', 'income_household_under_5',
       'income_household_5_to_10', 'income_household_10_to_15',
       'income_household_15_to_20', 'income_household_20_to_25',
       'income_household_25_to_35', 'income_household_35_to_50',
       'income_household_50_to_75', 'income_household_75_to_100',
       'income_household_100_to_150', 'income_household_150_over',
       'income_household_six_figure', 'income_individual_median',
       'home_ownership', 'housing_units', 'home_value', 'rent_median',
       'rent_burden', 'education_less_highschool', 'education_highschool',
       'education_some_college', 'education_bachelors', 'education_graduate',
       'education_college_or_above', 'education_stem_degree',
       'labor_force_participation', 'unemployment_rate', 'self_employed',
       'farmer', 'race_white', 'race_black', 'race_asian', 'race_native',
       'race_pacific', 'race_other', 'race_multiple', 'hispanic', 'disabled',
       'poverty', 'limited_english', 'commute_time', 'health_uninsured',
       'veteran']
# Air-quality columns.
# NOTE(review): 'N02' is spelled with a zero here; presumably this matches the CSV header — verify.
air_col = ['Ozone', 'PM25', 'N02']

Categorical data

In [6]:
# Sub-frame holding the object-dtype columns of `training`. This is a DataFrame,
# not a list of names: iterating it yields column labels, `.columns` gives the Index.
cat_col = training.select_dtypes('object')

In [7]:
# A column is "bad" when the test set holds at least one value never seen in training.
bad_label_col = {col for col in cat_col.columns if not set(test[col]) <= set(training[col])}
good_label_col = [col for col in cat_col.columns if col not in bad_label_col]
bad_label_col

{'breast_cancer_diagnosis_code', 'breast_cancer_diagnosis_desc'}

In [8]:
# Diagnosis codes that occur in the test set but never in training.
set(test['breast_cancer_diagnosis_code']).difference(training['breast_cancer_diagnosis_code'])

{'C50921', 'C50922'}

Numerical data

In [9]:
# Sub-frame holding the int64/float64 columns of `training` (a DataFrame, not a list of names).
# The target column is still part of `training` at this point, so it is included if numeric.
num_col = training.select_dtypes(include = ['int64','float64'])

In [10]:
# Absolute pairwise correlations between the numeric columns.
corr_matrix = num_col.corr().abs()
# Blank out the upper triangle and the diagonal so each pair is kept only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
reduced_corr_matrix = corr_matrix.mask(mask)
# Zero out weak correlations (< 0.5); the masked cells above remain NaN.
reduced_corr_matrix = reduced_corr_matrix.mask(reduced_corr_matrix<0.5, 0)
#pairwise
# Columns that have at least one partner with |corr| > 0.8 ...
correlated_features = [c for c in reduced_corr_matrix.columns if any(reduced_corr_matrix[c] > 0.8)]
# ... and, for each such column, the partner labels together with the correlation values.
correlated_pair = []
for c in correlated_features:
    idx = reduced_corr_matrix[reduced_corr_matrix[c] > 0.8].index.values.tolist()
    value = reduced_corr_matrix.loc[idx, c]
    correlated_pair.append([(idx, c), value])
# Hard-coded list of columns dropped from the "reduced" feature set. It is not
# derived programmatically from `correlated_pair`; presumably it was chosen by
# inspecting the result above — TODO confirm.
features_removed = ['population', 'male',
 'income_household_under_5',
 'income_household_5_to_10',
 'income_household_10_to_15',
 'income_household_15_to_20',
 'income_household_20_to_25',
 'income_household_25_to_35',
 'income_household_35_to_50',
 'income_household_100_to_150',
 'income_household_150_over']

**Identified discrepancies in diagnosis gender, zip codes**

## 1. Preprocessing

**Variables**

In [11]:
# Run switches shared by the model cells below.
# Recognised names: 'catboost','xgboost','linear','linear_ens','rf','lgb'
# Each model cell runs only if its name is listed here.
modelused = ['xgboost']
# Weighted test-set probability vectors; the ensemble cell sums this list.
predictions = []

# True  -> hold-out mode: fit on an 80/20 split with an evaluation set.
# False -> fit on the full training data with fixed iteration counts.
mtraining = False


✏️✏️✏️
- full: (X_raw, target), test_raw
- reduced: (X, target), test

In [12]:
# "full" frames keep every feature (X_raw, test_raw); "reduced" frames drop the
# columns listed in features_removed (X, test).
target = training['DiagPeriodL90D']
X_raw = training.drop('DiagPeriodL90D', axis = 1)
X = X_raw.drop(features_removed, axis = 1)
test_raw = test.copy()
# NOTE(review): `test` is rebound to its reduced version, so this cell cannot be
# re-run without reloading the data (the second drop would raise a KeyError).
test = test.drop(features_removed, axis = 1)

In [13]:
##catboost new
# Standalone model on the *full* feature frames: impute, ordinal-encode, keep a
# hand-picked column subset, then fit CatBoost. The encoded X_raw/test_raw and
# the hold-out split produced here are reused by the second XGBoost cell.
X_raw = X_raw.drop(columns=['patient_id'])
test_raw = test_raw.drop(columns=['patient_id'])

numerical_cols = X_raw.select_dtypes(exclude=['object']).columns
categorical_cols = X_raw.select_dtypes(include=['object']).columns
# Impute with statistics taken from the training frame only. Assign the result
# back instead of calling `df[col].fillna(..., inplace=True)`: the chained
# in-place form is deprecated and has no effect under pandas Copy-on-Write.
for col in categorical_cols:
    mode = X_raw[col].mode()[0]
    X_raw[col] = X_raw[col].fillna(mode)
    test_raw[col] = test_raw[col].fillna(mode)
for col in numerical_cols:
    median = X_raw[col].median()
    X_raw[col] = X_raw[col].fillna(median)
    test_raw[col] = test_raw[col].fillna(median)

# Encode on train + test together so both share one code table per column.
df = pd.concat([X_raw,test_raw])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Loop through each categorical column (patient_zip3 is numeric but treated as a label)
for col in categorical_cols.to_list()+['patient_zip3']:
    # Fit the encoder on the combined train + test values of this column
    encoder.fit(df[[col]])

    # Transform both training and test data
    df[col] = encoder.transform(df[[col]])

# Hand-picked features used by this model and by the second XGBoost cell.
cols = ['breast_cancer_diagnosis_code','metastatic_cancer_diagnosis_code','patient_zip3','patient_age','payer_type',
        'patient_state','breast_cancer_diagnosis_desc']

# Split the combined frame back by position: training rows first, test rows last.
X_raw = df[:len(X_raw)]
test_raw = df[-len(test_raw):]

X_raw = X_raw[cols]

params = {

    'depth':2,
    'random_state': 42,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'learning_rate':0.3,
    'iterations':157
}

model = catboost(**params)

# The hold-out split is not used by this fit; the second XGBoost cell uses it in hold-out mode.
trainX2, testX2, trainy2, testy2 = train_test_split(X_raw, target, test_size = 0.2, random_state = 40)
model.fit(X_raw, target)

# Make predictions
# preds = model.predict_proba(testX2)[:, 1]
preds_test = model.predict_proba(test_raw[cols])[:, 1]
# Weight 0: the entry keeps its place in `predictions` but contributes nothing to the sum.
predictions.append(preds_test * 0)

0:	total: 55.9ms	remaining: 8.71s
1:	total: 57.5ms	remaining: 4.46s
2:	total: 59.1ms	remaining: 3.03s
3:	total: 60.6ms	remaining: 2.32s
4:	total: 62.1ms	remaining: 1.89s
5:	total: 63.5ms	remaining: 1.6s
6:	total: 64.9ms	remaining: 1.39s
7:	total: 66.3ms	remaining: 1.24s
8:	total: 67.8ms	remaining: 1.11s
9:	total: 69.2ms	remaining: 1.02s
10:	total: 70.6ms	remaining: 937ms
11:	total: 72ms	remaining: 870ms
12:	total: 73.4ms	remaining: 813ms
13:	total: 74.8ms	remaining: 764ms
14:	total: 76.3ms	remaining: 722ms
15:	total: 77.6ms	remaining: 684ms
16:	total: 79ms	remaining: 651ms
17:	total: 80.4ms	remaining: 621ms
18:	total: 81.8ms	remaining: 594ms
19:	total: 83.3ms	remaining: 571ms
20:	total: 84.8ms	remaining: 549ms
21:	total: 86.3ms	remaining: 529ms
22:	total: 87.8ms	remaining: 511ms
23:	total: 89.2ms	remaining: 495ms
24:	total: 91ms	remaining: 480ms
25:	total: 92.5ms	remaining: 466ms
26:	total: 93.9ms	remaining: 452ms
27:	total: 95.3ms	remaining: 439ms
28:	total: 96.7ms	remaining: 427ms
29

In [14]:
def pre_feature(df):
    """Return a copy of ``df`` with two engineered columns appended.

    * ``breast_cancer_diagnosis_90D_converted`` – ``breast_cancer_diagnosis_code``
      with the codes listed in the remap table replaced by their mapped value;
      codes not in the table pass through unchanged.
    * ``uncovered`` – 1 where ``payer_type`` is ``'COMMERCIAL'`` and
      ``health_uninsured`` is at least 8, otherwise 0.

    The input frame is not modified.
    """
    # Presumably legacy / truncated codes mapped onto full diagnosis codes — verify with the data owner.
    code_remap = {
        '1744': 'C50419', '1745': 'C50519', '1749': 'C50919', '1759': 'C50421',
        '1741': 'C50119', '1743': 'C50319', '1742': 'C50219', '1746': 'C50619',
        '19881': 'C7981',
        'C50': 'C50919', 'C509': 'C50919', 'C5001': 'C50019',
    }
    converted_codes = df['breast_cancer_diagnosis_code'].replace(code_remap)

    is_commercial = df['payer_type'].eq('COMMERCIAL')
    high_uninsured_area = df['health_uninsured'].ge(8)

    return df.assign(
        breast_cancer_diagnosis_90D_converted=converted_codes,
        uncovered=(is_commercial & high_uninsured_area).astype(int),
    )

In [15]:
def fillbystate(df, columns=None):
    """Fill missing ZIP-level / air-quality values with the mean of the row's state.

    Parameters
    ----------
    df : DataFrame containing a ``patient_state`` column.
    columns : optional iterable of column names eligible for imputation.
        Defaults to the notebook-level ``zip_col`` and ``air_col`` lists. Only
        columns present in ``df`` with int64/float64 dtype are touched.

    Returns
    -------
    A copy of ``df`` (same index, same columns). Missing values in the eligible
    columns are replaced by the per-state mean. Rows whose ``patient_state`` is
    missing, or whose state has no observed value for a column, are left as they
    are.

    Notes
    -----
    The previous implementation assigned the result of ``groupby(...).apply``
    back by index. Because ``groupby`` drops rows with a missing key, every
    imputed column of a row without ``patient_state`` was overwritten with NaN,
    including values that were present. Filling from ``transform('mean')`` only
    touches cells that are actually missing.
    """
    df2 = df.copy()
    if columns is None:
        columns = set(zip_col) | set(air_col)
    eligible = set(columns)
    c_impute = [col for col in df2.columns
                if col in eligible and df2[col].dtype in ['int64', 'float64']]
    if not c_impute:
        return df2

    # Per-row state mean, aligned on df2's index (NaN where the state is unknown).
    state_means = df2.groupby(by='patient_state', sort=False)[c_impute].transform('mean')
    df2[c_impute] = df2[c_impute].fillna(state_means)
    return df2

def filltest(df, train=None, columns=None):
    """Fill missing ZIP-level / air-quality values in ``df`` with per-state means
    computed over ``train`` and ``df`` pooled together.

    Parameters
    ----------
    df : DataFrame to impute (test or validation rows); needs ``patient_state``.
    train : optional reference frame pooled with ``df`` when computing the state
        means. Defaults to the notebook-level reduced training frame ``X``.
    columns : optional iterable of column names eligible for imputation.
        Defaults to the notebook-level ``zip_col`` and ``air_col`` lists. Only
        columns present in ``df`` with int64/float64 dtype are touched.

    Returns
    -------
    A copy of ``df`` with its original index and columns. Missing values in the
    eligible columns are replaced by the pooled per-state mean. Rows with a
    missing ``patient_state``, or whose state has no observed value, are left
    unchanged.

    Notes
    -----
    Differences from the previous implementation, all of them fixes:
    * rows without ``patient_state`` no longer lose their observed values
      (``groupby`` drops missing keys, and the index-aligned assignment then
      wrote NaN over them);
    * the caller's index is kept instead of being replaced by positions in the
      pooled frame;
    * an empty ``df`` yields an empty frame (``iloc[-0:]`` returned every row).
    """
    df2 = df.copy()
    if train is None:
        train = X
    if columns is None:
        columns = set(zip_col) | set(air_col)
    eligible = set(columns)
    c_impute = [col for col in df2.columns
                if col in eligible and df2[col].dtype in ['int64', 'float64']]
    if not c_impute or df2.empty:
        return df2

    pooled = pd.concat([train, df2], axis=0, ignore_index=True)
    # One row of means per state; rows with a missing state are not part of any group.
    state_means = pooled.groupby(by='patient_state', sort=False)[c_impute].mean()
    # Look up each row's state mean by position (NaN for unknown or missing states).
    fill_values = state_means.reindex(df2['patient_state'].to_numpy()).to_numpy()
    observed = df2[c_impute]
    df2[c_impute] = observed.where(observed.notna(), fill_values)
    return df2

def fillCat(df):
    """Return a copy of ``df`` whose categorical (object-dtype) columns have
    missing values replaced by the literal string ``"NaN"``.

    Numeric columns are left untouched. The previous version called
    ``df.fillna("NaN")`` on the whole frame, which also wrote the string into
    numeric columns and silently turned them into object dtype. It also made a
    copy that was immediately discarded.
    """
    df2 = df.copy()
    text_cols = df2.select_dtypes(include=['object']).columns
    if len(text_cols):
        df2[text_cols] = df2[text_cols].fillna("NaN")
    return df2

def dropcolumn(df):
    """Return a new frame without the ``bmi`` column (KeyError if it is absent)."""
    return df.drop(columns=['bmi'])

# V3

In [16]:
def dropcolumn_cb(df):
    """Return a new frame without ``bmi`` and ``patient_id`` (KeyError if either is absent)."""
    return df.drop(columns=['bmi', 'patient_id'])

def dropcolumn2(df):
    """Return a new frame without the ``patient_id`` column (KeyError if it is absent)."""
    return df.drop(columns=['patient_id'])

**Pipelines for CB**

In [17]:
# Column groups of the reduced training frame, split by dtype. `cat_features`
# is reused by the CatBoost cells to locate the categorical column positions.
cat_features = [col for col in X.columns if X[col].dtype in ['object']]
num_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Numeric branch: mean-impute, then standardise.
num_transformer = Pipeline([('imputer',SimpleImputer(strategy='mean')), ('scaler',StandardScaler())])
# cat_transformer = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', CatBoostEncoder())])
# Applies the numeric branch to num_features and passes every other column through
# unchanged; returns a DataFrame with the original (un-prefixed) column names.
transform_cb = ColumnTransformer([('num', num_transformer, num_features)], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')
# transformer = ColumnTransformer([('imputer', SimpleImputer(strategy='most_frequent'), num_features), ('cat', cat_transformer, cat_features)], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

# CatBoost v1 preprocessing: fill missing values with "NaN", drop bmi (no scaling).
preprocess_cb = Pipeline([('fillCat', FunctionTransformer(fillCat)), ('dropcolumn', FunctionTransformer(dropcolumn))])
# CatBoost v3 preprocessing: impute + scale numerics, fill remaining missing values, drop bmi and patient_id.
preprocess_cbv3 = Pipeline([('num', transform_cb), ('fillCat', FunctionTransformer(fillCat)), ('dropcolumn', FunctionTransformer(dropcolumn_cb))])
# preprocessor = Pipeline([('transformer', transformer)]) #xgb
# preprocess_lgb = Pipeline([('transformer', transformer), ('dropcolumn2', FunctionTransformer(dropcolumn2))])

**Pipeline for XGB and LGB**

In [18]:
def preprocessing(X, y, test):
    """Encode and impute a train/test pair for the tree and linear models.

    Numeric (int64/float64) columns are imputed with their most frequent value.
    Object columns are imputed with their most frequent value and then encoded
    with ``CatBoostEncoder`` (fitted against ``y``). Any other column passes
    through unchanged. The transformer is fitted on ``X``/``y`` only and then
    applied to ``test``.

    Returns the transformed ``(X, test)`` pair as DataFrames.
    """
    object_cols = X.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    encode_categories = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', CatBoostEncoder()),
    ])
    column_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent'), numeric_cols),
        ('cat', encode_categories, object_cols),
    ]
    transformer = ColumnTransformer(
        column_steps,
        remainder='passthrough',
        verbose_feature_names_out=False,
    ).set_output(transform='pandas')
    #preprocessor = Pipeline([('fillna', FunctionTransformer(fillbystate)), ('transformer', transformer)])
    preprocessor = Pipeline([('transformer', transformer)])

    X_encoded = preprocessor.fit_transform(X, y)
    return X_encoded, preprocessor.transform(test)

In [19]:
preprocess_cb  # bare expression: shows the pipeline's representation as the cell output

In [20]:
#preprocessor

## 2. Feature Engineering

In [21]:
def add_feature(df):
    """Return a copy of ``df`` with multiplicative interaction columns appended.

    Each new column is the element-wise product, taken left to right, of the
    listed source columns. The sources must already be numeric (this function is
    called on the encoded frames). New columns are appended in the order given
    below; the input frame is not modified.
    """
    bc_code = 'breast_cancer_diagnosis_code'
    bc_desc = 'breast_cancer_diagnosis_desc'
    mc_code = 'metastatic_cancer_diagnosis_code'
    age = 'patient_age'

    interactions = [
        # pairwise products
        ('BCcode_MCcode', (bc_code, mc_code)),
        ('BCcode_Age', (bc_code, age)),
        ('BCcode_Disabled', (bc_code, 'disabled')),
        ('BCcode_Rent', (bc_code, 'rent_median')),
        ('BCcode_BCcode', (bc_code, bc_code)),
        ('age30_BC_code', ('age_30s', bc_code)),
        ('BCdesc_Age', (bc_desc, age)),
        ('BCcode_Poverty', (bc_code, 'poverty')),
        ('BCcode_PM25', (bc_code, 'PM25')),
        # higher-order products
        ('BC_code_MC_code_Age', (bc_code, mc_code, age)),
        ('BC_code_MC_code_MC_code', (bc_code, mc_code, mc_code)),
        ('BCcode_Disabled_PatientAge', (bc_code, 'disabled', age)),
        ('BCcode_Hispanic_Rent', (bc_code, 'hispanic', 'rent_median')),
        ('BCcode_State_Rent', (bc_code, 'patient_state', 'hispanic', 'rent_median')),
        ('BCcode_Disabled_Rent', (bc_code, 'disabled', 'rent_median')),
        ('BCcode_BCcode_Disabled', (bc_code, bc_code, 'disabled')),
        ('BCdesc_BCdesc_BCdesc', (bc_desc, bc_desc, bc_desc)),  # removed 0 importance
        ('BCcode_BCdesc_Disabled', (bc_code, bc_desc, 'disabled')),
    ]
    # Experiments currently switched off (kept here for reference):
    #   payer_poverty, BCcode_IncomeUnder5, BC_code_IncomeUnder5_MC_code
    #   regional:         div_BCcode, div_never_married, div_income
    #   health_uninsured: uninsured_education, BCcode_uninsured, uninsured_payer,
    #                     uninsured_poverty, uninsured_race
    #   payer_type:       payer_age, payer_race, BCcode_payer, payer_region

    df2 = df.copy()
    for feature_name, factors in interactions:
        product = df2[factors[0]]
        for factor in factors[1:]:
            product = product * df2[factor]
        df2[feature_name] = product
    return df2

- split: (trainX, trainy) (testX, testy), test_i

In [22]:
# Build every model-specific train/test frame from the reduced data (X, test).
# Per-state imputation of the ZIP-level and air-quality columns comes first.
X_na = fillbystate(X)
test_na = filltest(test)


#test data for cb
# v1: NaNs filled + bmi dropped; v3 ("cb2"): additionally scaled, patient_id dropped.
X_cb = preprocess_cb.fit_transform(X_na, target)
test_cb = preprocess_cb.transform(test_na)
X_cb2 = preprocess_cbv3.fit_transform(X_na, target)
test_cb2 = preprocess_cbv3.transform(test_na)

#test data for xgb
# Encoded frames; the "2" variants additionally drop patient_id.
X_xgb, test_xgb = preprocessing(X_na, target, test_na)
X_xgb2 = dropcolumn2(X_xgb)
test_xgb2 = dropcolumn2(test_xgb)
#test data for xgb with features prior to processing
# pre_feature runs on the raw reduced frames, before imputation and encoding.
Xf = pre_feature(X)
testf = pre_feature(test)
X_xgbf = fillbystate(Xf)
test_xgbf = filltest(testf)
X_xgbf, test_xgbf = preprocessing(X_xgbf, target, test_xgbf)
X_xgbf = dropcolumn2(X_xgbf)
test_xgbf = dropcolumn2(test_xgbf)

#test data for lgb
# Interaction features are computed on the encoded frames (all columns numeric).
X_lgb = add_feature(X_xgb2)
test_lgb = add_feature(test_xgb2)

# Hold-out mode: repeat the same preparation on an 80/20 split of the training data.
if mtraining:
    trainX, testX, trainy, testy = train_test_split(X, target, test_size = 0.2, random_state = 40)
    #if prefeature
    trainXf = pre_feature(trainX)
    testXf = pre_feature(testX)
    trainXf = fillbystate(trainXf)
    # filltest pools its argument with the global X, which also contains the hold-out rows.
    testXf = filltest(testXf)
    trainXf, testXf = preprocessing(trainXf, trainy, testXf)
    # NOTE(review): the full-data path above drops patient_id (dropcolumn2) at this
    # point, whereas this path drops bmi (dropcolumn) — confirm which is intended.
    trainXf = dropcolumn(trainXf)
    testXf = dropcolumn(testXf)
    
    trainX_na = fillbystate(trainX)
    testX_na = filltest(testX)
    
    trainX_cb = preprocess_cb.fit_transform(trainX_na, trainy)
    testX_cb = preprocess_cb.transform(testX_na)
    
    trainX_xgb, testX_xgb = preprocessing(trainX_na, trainy, testX_na)

    trainX_cb2 = preprocess_cbv3.fit_transform(trainX_na, trainy)
    testX_cb2 = preprocess_cbv3.transform(testX_na)
    
    trainX_xgb2 = dropcolumn2(trainX_xgb)
    testX_xgb2 = dropcolumn2(testX_xgb)
    
    trainX_lgb = add_feature(trainX_xgb2)
    testX_lgb = add_feature(testX_xgb2)
##############################################################################################################################


In [23]:
# # Separate the majority and minority classes
# majority_class = X_lgb[target == 1]
# minority_class = X_lgb[target == 0]

# # Randomly undersample the majority class to match the minority class size
# majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=48)
# majority_downsampled2 = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=40)
# balanced_df = pd.concat([majority_downsampled, minority_class])
# balanced_df2 = pd.concat([majority_downsampled2, minority_class])
# y = [1] * len(minority_class) + [0] * len(minority_class)
# balanced_y = pd.DataFrame(data = y)

# # Shuffle the rows in the balanced DataFrame to mix the classes
# balanced_df = balanced_df.sample(frac=1, random_state=40)
# balanced_df2 = balanced_df2.sample(frac=1, random_state=48)
# balanced_y2 = balanced_y.sample(frac=1, random_state=48)
# balanced_y = balanced_y.sample(frac=1, random_state=40)

Outliers

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.01)  # parked snippet: it sits outside any `In [ ]` cell in this export
y_pred = clf.fit_predict(X_xgbf)  # fit_predict labels outliers -1 and inliers 1
test_pred = clf.fit_predict(test_xgbf)  # refits the detector on the test features instead of reusing the training fit
outliers = training[y_pred == -1]
test_outliers = prediction_best[test_pred == -1]
X_xgbf = X_xgbf[y_pred != -1]  # NOTE(review): rebinding X_xgbf makes the snippet non-idempotent (a second run no longer lines up with `training`)
target_xgbf = target[y_pred != -1]  # filtered target; the Random Forest cell still fits X_xgbf against the unfiltered `target` — confirm

## 3. Model Fitting

## CatBoost

The following modes for processing missing values are supported:

"Forbidden" — Missing values are not supported, their presence is interpreted as an error.
"Min" — Missing values are processed as the minimum value (less than all other values) for the feature. **It is guaranteed that a split that separates missing values from all other values is considered when selecting trees.**
"Max" — Missing values are processed as the maximum value (greater than all other values) for the feature. **It is guaranteed that a split that separates missing values from all other values is considered when selecting trees.**

- takes cat features and applies one-hot encoding
- catboost_classifier: (X_cb, target), test_cb

In [24]:
#CB
# CatBoost on the v1 frames (missing values filled, bmi dropped). Runs only when enabled in `modelused`.
if 'catboost' in modelused:
    weight_cb = 0
    # Positions of the object-dtype columns, handed to CatBoost as categorical features.
    cat_idx = [pos for pos, name in enumerate(X_cb.columns) if name in cat_features]
    if not mtraining:
        # Full-data fit with a fixed iteration count.
        catboost_classifier = catboost(iterations=147, verbose=100, eval_metric='AUC', random_state=40) #285
        catboost_classifier.fit(X_cb, target, cat_features=cat_idx)
    else:
        # Hold-out fit: early stopping on the validation split, keeping the best iteration.
        catboost_classifier = catboost(iterations=1000, verbose=100, eval_metric='AUC', random_state=40)
        catboost_classifier.fit(trainX_cb, trainy, cat_features=cat_idx, eval_set=(testX_cb, testy), use_best_model=True, early_stopping_rounds=200)
    catboost_classifier.set_probability_threshold(0.4)
    prediction = catboost_classifier.predict_proba(test_cb)[:,1]
    # Weight 0 keeps the entry in the list without contributing to the ensemble sum.
    predictions.append(prediction * weight_cb)

In [25]:
#CBv3
# CatBoost on the v3 frames (numerics imputed and scaled, bmi and patient_id dropped).
if 'catboost' in modelused:
    weight_cb = 0
    # Positions of the object-dtype columns, handed to CatBoost as categorical features.
    cat_idx = [pos for pos, name in enumerate(X_cb2.columns) if name in cat_features]
    if not mtraining:
        # Full-data fit with a fixed iteration count.
        catboost_classifier = catboost(iterations=285, verbose=100, eval_metric='AUC', random_state=40) #285
        catboost_classifier.fit(X_cb2, target, cat_features=cat_idx)
    else:
        # Hold-out fit: early stopping on the validation split, keeping the best iteration.
        catboost_classifier = catboost(iterations=1000, verbose=100, eval_metric='AUC', random_state=40)
        catboost_classifier.fit(trainX_cb2, trainy, cat_features=cat_idx, eval_set=(testX_cb2, testy), use_best_model=True, early_stopping_rounds=200)
    catboost_classifier.set_probability_threshold(0.4)
    prediction = catboost_classifier.predict_proba(test_cb2)[:,1]
    # Weight 0 keeps the entry in the list without contributing to the ensemble sum.
    predictions.append(prediction * weight_cb)

improvement to 0.806 after dropping bmi
0.807 scaling/ 0.808 drop patient_id
lb not consistent

Learning rate set to 0.056407

0:	test: 0.7652067	best: 0.7652067 (0)	total: 72.3ms	remaining: 1m 12s
100:	test: 0.8031992	best: 0.8031992 (100)	total: 3.65s	remaining: 32.5s
200:	test: 0.8096645	best: 0.8101783 (183)	total: 6.88s	remaining: 27.3s
300:	test: 0.8106381	best: 0.8112263 (284)	total: 10.1s	remaining: 23.5s
400:	test: 0.8104217	best: 0.8112263 (284)	total: 13.4s	remaining: 20s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.811226306
bestIteration = 284

Shrink model to first 285 iterations.

## XGBoost

- takes missing values
- 'object' -> 'category'

- converted codes

In [26]:
# XGBoost on the encoded reduced frames (X_xgb / test_xgb).
if 'xgboost' in modelused:
    # Weight 0: the model is still fitted, but its probabilities are multiplied by
    # zero before being appended, so they add nothing to sum(predictions).
    weight_xgb = 0
    if mtraining:
        # best iteration 241
        # best iteration 240, 0.80816
        # best iteration 207, 0.80881
        # Hold-out mode: up to 800 rounds with early stopping on validation AUC.
        xgb_classifier = xgb(n_estimators=800, eval_metric='auc', random_state=40, learning_rate=0.01, early_stopping_rounds=200)
        # xgb_classifier = xgb(n_estimators=100, random_state=40, reg_alpha=0.09, reg_lambda=0.09) #early_stopping_rounds=200
        xgb_classifier.fit(trainX_xgb, trainy, eval_set = [(testX_xgb, testy)])
        # Confusion-matrix cells on the hold-out split.
        tn, fp, fn, tp = confusion_matrix(testy, xgb_classifier.predict(testX_xgb)).ravel()
        print(tn, fp, fn, tp)
    else:
        # Full-data fit with a fixed number of rounds.
        xgb_classifier = xgb(n_estimators=241, eval_metric='auc', random_state=40, learning_rate=0.01) #207 0.80881 #222 0.80999 v3 + 2 features learning rate 0.01
        xgb_classifier.fit(X_xgb, target)
    prediction2 = xgb_classifier.predict_proba(test_xgb)[:,1]
    predictions.append(prediction2 * weight_xgb)

In [27]:
# XGBoost on the ordinal-encoded column subset built in the "catboost new" cell
# (X_raw restricted to `cols`, test_raw[cols]). With weight 1 this is the only
# model that contributes to the final prediction in the current configuration.
if 'xgboost' in modelused:
    weight_xgb = 1
    if mtraining:
        # best iteration 241
        # best iteration 240, 0.80816
        # best iteration 207, 0.80881
        # Hold-out mode uses the split created in the "catboost new" cell (trainX2 / testX2).
        xgb_classifier = xgb(n_estimators=800, eval_metric='auc', random_state=40, learning_rate=0.01, early_stopping_rounds=200)
        # xgb_classifier = xgb(n_estimators=100, random_state=40, reg_alpha=0.09, reg_lambda=0.09) #early_stopping_rounds=200
        xgb_classifier.fit(trainX2, trainy2, eval_set = [(testX2, testy2)])
        # Confusion-matrix cells on the hold-out split.
        tn, fp, fn, tp = confusion_matrix(testy2, xgb_classifier.predict(testX2)).ravel()
        print(tn, fp, fn, tp)
    else:
        # Full-data fit with a fixed number of rounds.
        xgb_classifier = xgb(n_estimators=317, eval_metric='auc', random_state=40, learning_rate=0.01) #207 0.80881 #222 0.80999 v3 + 2 features learning rate 0.01 #216 0.81443
        xgb_classifier.fit(X_raw, target)
    # Rebinds prediction2, replacing the value left by the previous XGBoost cell.
    prediction2 = xgb_classifier.predict_proba(test_raw[cols])[:,1]
    predictions.append(prediction2 * weight_xgb)

parameters = {'n_estimators': [241], 'eval_metric': ['auc'], 'random_state': [40], 'learning_rate': [0.01], 'max_depth':[2,4,6,8]}  # only max_depth varies; every other entry has a single value

classifier = xgb()
grid = GridSearchCV(classifier, parameters)  # NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's default score rather than AUC — confirm intent
grid.fit(X_xgb, target)

print(grid.best_params_)
print(grid.best_score_)

Confusion_matrix (prior pipeline):

n_estimators=407, false_negative = 51, false_positive = 421

## Random Forest

- rf_classifier: (X_xgbf, target), test_xgbf

In [28]:
# Random forest on the pre-featured, encoded frames. Runs only when 'rf' is in `modelused`.
if 'rf' in modelused:
    weight_rf = 0.1
    rf_classifier = RandomForestClassifier(
        n_estimators=1000,
        oob_score=True,
        random_state=40,
    ).fit(X_xgbf, target)

    # Out-of-bag score, printed as a quick check without a separate hold-out split.
    print(rf_classifier.oob_score_)

    prediction_rf = rf_classifier.predict_proba(test_xgbf)[:,1]
    predictions.append(prediction_rf * weight_rf)

X_xgb: 0.801022780102278 X_xgb2: 0.800092980009298 X_xgbf: 0.8016426468309313 X_lgb: 0.8004803967147064

## Linear

- vlf: (X_xgb, target), test_xgb

In [29]:
# Ridge classifier on the encoded frames. Its output is kept in `test_res` only;
# nothing is appended to `predictions`, so it never reaches the ensemble.
if 'linear' in modelused:
    vlf = linear_model.RidgeClassifier()
    if mtraining:        
        # Hold-out mode: fit on the training split and predict both splits.
        vlf.fit(trainX_xgb, trainy)
        train_res = vlf.predict(trainX_xgb)
        test_scr = vlf.predict(testX_xgb)
    else:
        #scaler = StandardScaler()
        #X_li = scaler.fit_transform(X_li)
        vlf.fit(X_xgb, target)
        #test_li = scaler.transform(test_li)
        train_res = vlf.predict(X_xgb)
    # predict() returns class labels, not probabilities.
    test_res = vlf.predict(test_xgb)

#     residual = target.sub(train_res)
#     residual = residual.mean()
#     test_res2 = test_res + residual
    #submission_ser = {"patient_id": submission['patient_id'], "DiagPeriodL90D": test_res2}
    #df_linear = pd.DataFrame(data=submission_ser)

## LGB

In [30]:
# LightGBM on the encoded frames extended with the interaction features from add_feature.
if 'lgb' in modelused:
    # Weight 0: fitted, but contributes nothing to sum(predictions).
    weight_lgb = 0
    lgb_classifier = lgb(n_estimators=61, learning_rate=0.08, boosting_type='gbdt', objective='binary') #is_unbalance=True
    if mtraining:
        # NOTE(review): with only 61 boosting rounds, the 100-round early-stopping
        # patience can never elapse before training ends — confirm it is intended.
        lgb_classifier.fit(trainX_lgb, trainy, eval_set=[(testX_lgb, testy)], eval_metric='auc', callbacks=[log_evaluation(), early_stopping(100)])
        # Confusion-matrix cells on the hold-out split.
        tn, fp, fn, tp = confusion_matrix(testy, lgb_classifier.predict(testX_lgb)).ravel()
        print(tn, fp, fn, tp)
    else:
        lgb_classifier.fit(X_lgb, target, eval_metric='auc')
    prediction_lgb = lgb_classifier.predict_proba(test_lgb)[:,1]
    predictions.append(prediction_lgb * weight_lgb)
    
# if 'lgb' in modelused:
#     weight_lgb = 0.1

#     lgb_classifier = lgb(n_estimators=1000, learning_rate=0.08, boosting_type='gbdt', objective='binary')
#     lgb_classifier.fit(balanced_df, balanced_y, eval_set=[(balanced_df2, balanced_y2)], eval_metric='auc', callbacks=[log_evaluation(), early_stopping(100)])

# #     lgb_classifier2 = lgb(n_estimators=1000, learning_rate=0.08, boosting_type='gbdt', objective='binary')
# #     lgb_classifier2.fit(balanced_df2, balanced_y2, eval_set=[(balanced_df, balanced_y)], eval_metric='auc', callbacks=[log_evaluation(), early_stopping(100)])

#     prediction_lgb = lgb_classifier.predict_proba(test_lgb)[:,1]


#     predictions.append(prediction_lgb * weight_lgb)

n_estimators=61 (false_negative = 51, false_positive = 422)

Early stopping, best iteration is:
[60]	valid_0's auc: 0.807346	valid_0's binary_logloss: 0.459549
with new features and dropping patient_id

## 3. Ensemble

In [31]:
# Blend: every entry of `predictions` is already multiplied by its model weight,
# so the final score is simply their sum.
# Guard against an empty list: sum([]) is the scalar 0, which pandas would
# broadcast into an all-zero submission without raising any error.
if not predictions:
    raise ValueError("No model predictions were collected; check `modelused` before building the submission.")
prediction_final = sum(predictions) #+ catboost_df['DiagPeriodL90D'] * 0.3 #* 0.25 + submission_best['DiagPeriodL90D'] * 0.75

#prediction_xgbv3 = prediction2 * 0.1 + prediction * 0.2 + prediction_lgb * 0.7
# Pair the blended scores with the ids from the sample submission, by position.
submission_se = {"patient_id": submission['patient_id'], "DiagPeriodL90D": prediction_final}
df_final = pd.DataFrame(data=submission_se)

#indices = [74, 179, 301, 421, 440, 457, 521, 570, 572, 573, 670, 1007, 1023, 1045, 1326, 1338, 1364, 1531, 1586, 1683, 1756, 1843, 1909, 1969, 2042, 2045, 2133, 2266, 2324, 2525, 2612, 2733, 2861, 2921, 2973, 2984, 3011, 3012, 3115, 3134, 3260, 3587, 3795, 3855, 4064, 4235, 4511, 4753, 5171, 5177, 5215, 5422, 5458, 5459, 5559, 5561, 5582, 5758]
# df_trim = df_final.copy()
#df_final.loc[indices, 'DiagPeriodL90D'] = prediction_final[indices]
# df_trim.to_csv('submission.csv', index=None)

# index=False is the documented way to omit the row index from the file.
df_final.to_csv('submission.csv', index=False)

**prediction [xgb with features]**

## 4. Evaluation

Mean of Predictions

In [32]:
# print(prediction_proba.mean(), prediction2.mean(), prediction_rf.mean(), test_res.mean(), prediction_lgb.mean())

Feature Importances

In [33]:
# cat_features = list(zip(catboost_classifier.feature_names_, catboost_classifier.feature_importances_))
# cat_features.sort(key=lambda x: x[1], reverse=True)
# print(cat_features)
# lgb_features = list(zip(lgb_classifier.feature_name_, lgb_classifier.feature_importances_))
# lgb_features.sort(key=lambda x: x[1], reverse=True)
# lgb_features

[('BC_code_MC_code_Age', 90),
 ('breast_cancer_diagnosis_code', 79),
 ('payer_type', 70),
 ('metastatic_cancer_diagnosis_code', 58),
 ('patient_age', 55),
 ('metastatic_first_novel_treatment', 52),
 ('patient_gender', 51),
 ('BCcode_Age', 44),
 ('patient_race', 42),
 ('Region', 41),
 ('patient_state', 39),
 ('BC_code_MC_code_MC_code', 38),
 ('BCcode_Disabled_Rent', 37),
 ('BCcode_BCcode_Disabled', 36),
 ('BCcode_MCcode', 34),
 ('income_household_75_to_100', 33),
 ('BCcode_Disabled_PatientAge', 33),
 ('health_uninsured', 32),
 ('age30_BC_code', 31),
 ('Division', 30),
 ('BCcode_Poverty', 30),
 ('education_highschool', 24),

Tracking Submissions

In [34]:
# Record this run's scores as a new column next to the earlier submissions.
submission_best['V73_newxgb'] = df_final["DiagPeriodL90D"]
# The tracking file used to be written with its row index, so every reload added
# an "Unnamed: N" column. Drop any that are present and stop writing the index.
stray_index_cols = [col for col in submission_best.columns if str(col).startswith('Unnamed:')]
submission_best = submission_best.drop(columns=stray_index_cols)
submission_best.to_csv('submission_best.csv', index=False)
# prediction_best['compare'] = df_final["DiagPeriodL90D"] - prediction_best["DiagPeriodL90D"]

# prediction_best[cat_col.columns][prediction_best['compare']>0].describe()

In [35]:
submission_best.head()

Unnamed: 0,patient_id,DiagPeriodL90D,V73_newxgb
0,573710,0.808359,0.769741
1,593679,0.737129,0.803307
2,184532,0.761459,0.761241
3,447383,0.773402,0.79289
4,687972,0.80177,0.788245


In [36]:
# prediction_best[(prediction_best['compare']<0)&(prediction_best['Division']=='Pacific')].describe()

In [37]:
# prediction_best[prediction_best['Division']=='Pacific'].describe()

target lowers for Division=Pacific (lower individual income)

In [38]:
# df_final.loc[prediction_best['compare']<-0.02,'DiagPeriodL90D'] = df_final['DiagPeriodL90D'].min()


Region & Division

In [39]:
#df_final.quantile(0.25)

In [40]:
# test_outliers.index

Index([  74,  179,  301,  421,  440,  457,  521,  570,  572,  573,  670, 1007,
       1023, 1045, 1326, 1338, 1364, 1531, 1586, 1683, 1756, 1843, 1909, 1969,
       2042, 2045, 2133, 2266, 2324, 2525, 2612, 2733, 2861, 2921, 2973, 2984,
       3011, 3012, 3115, 3134, 3260, 3587, 3795, 3855, 4064, 4235, 4511, 4753,
       5171, 5177, 5215, 5422, 5458, 5459, 5559, 5561, 5582, 5758],
      dtype='int64')

In [41]:
# df_final.loc[test_outliers.index].describe()

In [42]:
# outliers[cat_col.columns].describe()

Mode for outliers: Pacific, Region (unchanged): South

In [43]:
# outliers.describe()

ensemble weight tuning

In [44]:
# pipeline_cb = make_pipeline(preprocess_cb, catboost(iterations=185, verbose=100, eval_metric='AUC', random_state=40, learning_rate=0.056, cat_features=cat_idx))
# pipeline_xgb = make_pipeline(preprocessor, xgb(n_estimators=241, eval_metric='auc', random_state=40, learning_rate=0.01))
#pipeline_linear


In [45]:
# # # Define the parameter search space
# def objective(trial):
#     STEP_SIZE = 0.10
#     weights = []
#     all_models_predictions = []
#     upper_limit = 1
#     weight_cb = trial.suggest_float('weight_cb', 0, upper_limit, step=STEP_SIZE)
#     weights.append(weight_cb)
#     upper_limit -= weight_cb
#     weight_xgb = 100 - sum(weights) # Adjust the range based on your expectations
#     weights.append(weight_xgb)
# #     weight_rf = 100 - sum(weights)
# #     weights.append(weight_rf)

#     ensemble_model = VotingClassifier(
#         estimators=[
#             ('cb', pipeline_cb),
#             ('xgb', pipeline_xgb)
# #            ('rf', rf_classifier)
#         ], voting='soft', weights=weights)

#     # Assuming 'trainX' and 'trainy' are your training data
#     cv = cross_val_score(ensemble_model, X, target, cv=skf, scoring='roc_auc').mean()

#     return cv

# # Assuming 'skf' is your StratifiedKFold object
# #skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=40)

# # Use the 'sampler' parameter for parallelization
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=2024))
# study.optimize(objective, n_trials=15)

# best_weights = study.best_params
# print("Best Weights for Ensemble:", best_weights)