# __Team Codeaholics__ –  WIDS Jupyter Notebook #

## __HEADER FILES__  ##

In [None]:
import joblib
import numpy as np
import pandas as pd
from skimpy import skim
import category_encoders as ce
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesClassifier, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor

#### _Setting up the environment_ ####

In [None]:

# Lift pandas' row/column truncation and widen cell text so frames print in full.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 200),
):
    pd.set_option(_option, _value)

# Seed passed as `random_state` to the splitters and models in this notebook.
seed = 42

#### _Citations and Credits_ ####


Code has been adapted from these sources:
- [simonagradinaru](https://www.kaggle.com/code/simonagradinaru/wids-24-2-embracing-diversity-robustness)
- [ogwalakello](https://www.kaggle.com/code/ogwalakello/wids-datathon-2024-1st-place-solution)
- [sid4ds](https://www.kaggle.com/code/sid4ds/wids-2-00-overview-cv-setup)

## __LOADING THE DATA__  ##

In [None]:
# Read the three competition files from the current working directory.
training = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
solution_template = pd.read_csv('solution_template.csv')

# The original message dropped the name of the third dataset.
print("Train, test, and solution template datasets loaded successfully.")

### __Train Dataset__ ###

In [None]:
# Preview the first rows of the raw training frame.
training.head()

### __Test Dataset__ ###

In [None]:
# Preview the first rows of the raw test frame.
test.head()

### __Solution Template__ ###

In [None]:
# Preview the first rows of the submission template.
solution_template.head()

### _Statistics of the raw data_ ###

In [None]:
# Summarise the raw training frame with skimpy before any cleaning is applied.
skim(training)

## __DATA PREPROCESSING__  ##

### _Removing unreliable columns_ ###

In [None]:

# Columns removed from both frames before any further processing.
columns_to_remove = ['patient_gender', 'breast_cancer_diagnosis_desc']

training = training.drop(columns=columns_to_remove)
test = test.drop(columns=columns_to_remove)


### _Removing unreliable rows_ ###

In [None]:
# Keep only the training rows that have a recorded family_size.
training = training[training['family_size'].notna()]

### _Correcting outliers and errors_ ###

In [None]:
# Override the recorded state for two zip3 prefixes: 630 -> 'MO', 864 -> 'AZ'.
training['patient_state'] = np.where(
    training['patient_zip3'] == 630,
    'MO',
    np.where(training['patient_zip3'] == 864, 'AZ', training['patient_state']),
)

# Recode the diagnosis codes the original note calls "male" onto the listed
# replacement codes (training frame only).
male_to_female_codes = {
    'C50122': 'C50112',
    'C50221': 'C50211',
    'C50421': 'C50411',
    'C509': 'C5091',
    'C50922': 'C50912',
}
training['breast_cancer_diagnosis_code'] = (
    training['breast_cancer_diagnosis_code'].replace(male_to_female_codes)
)

# Recode one category in the test frame only.
# NOTE(review): the training and test recodes are not mirrored onto the other
# frame — presumably each code only occurs in one file; confirm against the data.
test['breast_cancer_diagnosis_code'] = (
    test['breast_cancer_diagnosis_code'].replace({'C5021': 'C50219'})
)

# Population columns: every column from 'population' through 'veteran'
# (label-based slice, both ends included).
pop_cols = training.loc[:, 'population':'veteran'].columns.to_list()

# Overwrite the population values of three patients with those of a donor
# patient. `.values` strips the donor's index so the assignment is positional.
for outlier_id, donor_id in (
    (441322, 982003),
    (271422, 271245),
    (714510, 636245),
):
    donor_values = training.loc[training.patient_id == donor_id, pop_cols].values
    training.loc[training.patient_id == outlier_id, pop_cols] = donor_values


### _Filling Nan values in Categorical Data_ ###

In [None]:
# Placeholder written into each categorical column wherever the value is missing.
categorical_fill_values = {
    'payer_type': 'None',
    'patient_race': 'Not Recorded',
    'metastatic_first_novel_treatment': 'Not Recorded',
    'metastatic_first_novel_treatment_type': 'Not Recorded',
}

# Apply the same placeholder to the training and the test frame.
for column, placeholder in categorical_fill_values.items():
    training[column] = training[column].fillna(placeholder)
    test[column] = test[column].fillna(placeholder)

### _Custom encoding of the Division and Age attributes_ ###

In [None]:
# Hand-assigned integer code per Division value. The original note says the
# codes come from a graph, which is not part of this file.
custom_mapping1 = {
    'East South Central': 1,
    'Pacific': 2,
    'South Atlantic': 3,
    'East North Central': 4,
    'West South Central': 5,
    'West North Central': 6,
    'Middle Atlantic': 7,
    'Mountain': 8,
}

# NOTE(review): any Division value that is not a key above (or is missing)
# becomes NaN in div_encoded — confirm every division in the data is listed.
for frame in (training, test):
    frame['div_encoded'] = frame['Division'].map(custom_mapping1)

# Ten-year age bins, closed on the left ([0, 10), [10, 20), ...), with one
# hand-picked label per bin.
bins = list(range(0, 101, 10))
labels = [1, 2, 3, 4, 5, 6, 7, 0, -1, -2]

# NOTE(review): ages outside [0, 100) or missing fall in no bin, and the
# integer cast would then fail — presumably no such ages exist; confirm.
for frame in (training, test):
    frame['age_encoded'] = pd.cut(
        frame['patient_age'], bins=bins, labels=labels, right=False
    ).astype(int)


### _Standard Imputation Function_ ###
- Mean for numerical data
- Mode for the remaining categorical data

In [None]:
# Replace missing values
def _group_mode(values):
    """Return the first mode of ``values``, or NaN when the group has no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def mixed_imputation(df, group_col):
    """Fill missing values in ``df`` using per-group statistics.

    Every column other than ``group_col`` is processed: numeric columns are
    filled with the mean of the row's group, all other columns with the
    group's first mode. Values stay missing when the group offers no
    statistic (for example, every value in the group is missing) or when the
    row's own ``group_col`` value is missing.

    Args:
        df: DataFrame to impute. It is modified in place.
        group_col: Name of the column whose values define the groups.

    Returns:
        The same ``df`` object, after imputation.
    """
    for column in df.columns:
        if column == group_col:
            continue
        # Nothing to fill: skip the group-by entirely.
        if not df[column].isna().any():
            continue

        grouped = df.groupby(group_col)[column]
        # is_numeric_dtype covers every numeric width (float32, int32, ...).
        # The previous exact comparison against np.dtype('float_') /
        # np.dtype('int_') missed those, and 'float_' is no longer a valid
        # dtype name on NumPy 2.
        if pd.api.types.is_numeric_dtype(df[column]):
            fill_by_group = grouped.mean()
        else:
            fill_by_group = grouped.apply(_group_mode)

        # Map each row's group key to its statistic, then fill only the gaps.
        df[column] = df[column].fillna(df[group_col].map(fill_by_group))

    return df

### _Imputing the population columns_ ###

In [None]:
# One row per distinct (zip3, state, population profile), ordered by zip3.
pop_key_cols = ['patient_zip3', 'patient_state']
training_pop = (
    training[pop_key_cols + pop_cols]
    .drop_duplicates()
    .sort_values(by='patient_zip3')
    .reset_index(drop=True)
)

# Impute within each zip3 first, then within each state for whatever is
# still missing.
# NOTE(review): mixed_imputation fills every column except the grouping
# column, so the first pass can also fill a missing patient_state. Training
# rows whose state is missing would then no longer match on the merge keys —
# confirm whether any such rows exist.
for group_col in pop_key_cols:
    training_pop = mixed_imputation(df=training_pop, group_col=group_col)

# Rows that differed only by their missing values can share a key after
# imputation. Keep one row per key so the later left merge on
# (zip3, state) cannot multiply patient rows.
training_pop = (
    training_pop
    .drop_duplicates(subset=pop_key_cols)
    .reset_index(drop=True)
)

print(training_pop.shape)
training_pop.head()


### _Imputing the temperature columns_ ###

In [None]:

# Temperature columns are all columns whose name starts with 'Average'.
temp_key_cols = ['patient_zip3', 'patient_state']
avg_cols = training.columns[training.columns.str.startswith('Average')].tolist()

# One row per distinct (zip3, state, temperature profile), ordered by zip3.
training_avg = (
    training[temp_key_cols + avg_cols]
    .drop_duplicates()
    .sort_values(by='patient_zip3')
    .reset_index(drop=True)
)

print(training_avg.shape)

# Long format: one row per (zip3, state, monthly column).
training_avg_melt = pd.melt(training_avg, id_vars=temp_key_cols)

# The month is the last six characters of the column name (e.g. 'Jan-13').
training_avg_melt['month'] = pd.to_datetime(
    training_avg_melt['variable'].str[-6:], format='%b-%y'
)

# Chronological order within each key so the fills below run through time.
training_avg_melt = training_avg_melt.sort_values(by=temp_key_cols + ['month'])

# Fill gaps forward in time, then backward for leading gaps.
# dropna=False keeps rows whose key contains a missing value in their own
# group. With the default, the fill returns NaN for those rows, which would
# overwrite temperatures that were actually observed.
training_avg_melt['value'] = (
    training_avg_melt.groupby(temp_key_cols, dropna=False)['value'].ffill()
)
training_avg_melt['value'] = (
    training_avg_melt.groupby(temp_key_cols, dropna=False)['value'].bfill()
)
training_avg_melt.head()


### _Adding new features derived from the temperature subset_ ###

In [None]:

# Back to wide format: one row per (zip3, state), with the monthly columns
# restored to the order held in avg_cols.
id_cols = ['patient_zip3', 'patient_state']
training_avgs = (
    training_avg_melt
    .drop('month', axis=1)
    .pivot(index=id_cols, columns='variable', values='value')
    .reset_index()
)
training_avgs = training_avgs[id_cols + avg_cols]

# Yearly mean for 2013-2018, taken over the columns that sit between that
# year's 'Jan' and 'Dec' columns (label slice, both ends included).
for yy in range(13, 19):
    training_avgs[f'Avg-{yy}'] = training_avgs.loc[
        :, f'Average of Jan-{yy}':f'Average of Dec-{yy}'
    ].mean(axis=1)

print(training_avgs.shape)
training_avgs.head()


### _Combine the subset datasets_ ###

In [None]:

# Swap the raw population and temperature columns for their imputed versions
# by left-joining both lookup tables on (zip3, state).
merge_keys = ['patient_zip3', 'patient_state']
training_full = training.drop(pop_cols + avg_cols, axis=1)
training_full = training_full.merge(training_pop, how='left', on=merge_keys)
training_full = training_full.merge(training_avgs, how='left', on=merge_keys)

### _Extracting new features_ ###

In [None]:

# Age bucket: [0, 30) -> 0, then one bucket per decade up to [90, inf) -> 7.
age_edges = [0, 30, 40, 50, 60, 70, 80, 90, np.inf]
training_full['age_group'] = pd.cut(
    training_full['patient_age'],
    bins=age_edges,
    labels=list(range(8)),
    right=False,
).astype(int)

# Flag diagnosis codes that start with '17'.
training_full['icd_9'] = (
    training_full['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)
)

# BMI: a missing-value indicator plus a coarse band
# (0 = missing, 1 = < 18.5, 2 = < 25, 3 = < 30, 4 = everything else).
bmi = training_full['bmi']
training_full['bmi_missing'] = bmi.isna().astype(int)
training_full['bmi_recoded'] = np.select(
    [bmi.isna(), bmi < 18.5, bmi < 25, bmi < 30],
    [0, 1, 2, 3],
    default=4,
)

# Normalise column names: spaces become underscores, hyphens are removed.
training_full.columns = training_full.columns.str.replace(' ', '_').str.replace('-', '')


print(training_full.shape)
training_full.head()


### _Same techniques are applied on the test dataset_ ###

In [None]:

# Override the recorded state for two zip3 prefixes: 630 -> 'MO', 864 -> 'AZ'.
test['patient_state'] = np.where(
    test['patient_zip3'] == 630,
    'MO',
    np.where(test['patient_zip3'] == 864, 'AZ', test['patient_state']),
)

test_key_cols = ['patient_zip3', 'patient_state']

# Distinct (zip3, state, temperature profile) rows in long format.
# avg_cols is the temperature column list built from the training frame.
df_avg_melt_test = pd.melt(
    test[test_key_cols + avg_cols]
    .drop_duplicates()
    .sort_values(by='patient_zip3')
    .reset_index(drop=True),
    id_vars=test_key_cols,
)

# The month is the last six characters of the column name (e.g. 'Jan-13').
df_avg_melt_test['month'] = pd.to_datetime(
    df_avg_melt_test['variable'].str[-6:], format='%b-%y'
)
df_avg_melt_test = df_avg_melt_test.sort_values(by=test_key_cols + ['month'])

# Fill gaps forward in time, then backward for leading gaps.
# dropna=False keeps rows whose key contains a missing value in their own
# group. With the default, the fill returns NaN for those rows, which would
# overwrite temperatures that were actually observed.
df_avg_melt_test['value'] = (
    df_avg_melt_test.groupby(test_key_cols, dropna=False)['value'].ffill()
)
df_avg_melt_test['value'] = (
    df_avg_melt_test.groupby(test_key_cols, dropna=False)['value'].bfill()
)

# Back to wide format with the monthly columns in avg_cols order.
df_avgs_test = (
    df_avg_melt_test
    .drop('month', axis=1)
    .pivot(index=test_key_cols, columns='variable', values='value')
    .reset_index()
)
df_avgs_test = df_avgs_test[test_key_cols + avg_cols]

# Yearly mean for 2013-2018 over the columns between that year's 'Jan' and
# 'Dec' columns (label slice, both ends included).
for yy in range(13, 19):
    df_avgs_test[f'Avg-{yy}'] = df_avgs_test.loc[
        :, f'Average of Jan-{yy}':f'Average of Dec-{yy}'
    ].mean(axis=1)

# Replace the raw temperature columns with the filled ones.
# NOTE(review): unlike the training frame, the population columns of the test
# frame are not imputed in this cell — confirm this is intended.
df_test_full = test.drop(avg_cols, axis=1).merge(
    df_avgs_test, how='left', on=test_key_cols
)

# Age bucket: [0, 30) -> 0, then one bucket per decade up to [90, inf) -> 7.
df_test_full['age_group'] = pd.cut(
    df_test_full['patient_age'],
    right=False,
    bins=[0, 30, 40, 50, 60, 70, 80, 90, np.inf],
    labels=[0, 1, 2, 3, 4, 5, 6, 7],
).astype(int)

# Flag diagnosis codes that start with '17'.
df_test_full['icd_9'] = (
    df_test_full['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)
)

# BMI: a missing-value indicator plus a coarse band
# (0 = missing, 1 = < 18.5, 2 = < 25, 3 = < 30, 4 = everything else).
test_bmi = df_test_full['bmi']
df_test_full['bmi_missing'] = test_bmi.isna().astype(int)
df_test_full['bmi_recoded'] = np.select(
    [test_bmi.isna(), test_bmi < 18.5, test_bmi < 25, test_bmi < 30],
    [0, 1, 2, 3],
    default=4,
)

# Normalise column names: spaces become underscores, hyphens are removed.
df_test_full.columns = df_test_full.columns.str.replace(' ', '_').str.replace('-', '')

print(df_test_full.shape)
df_test_full.head()

### _Statistics of the cleaned data_ ###

In [None]:
# Summarise the cleaned, feature-enriched training frame with skimpy.
skim(training_full)

### ___Preparing the data for modeling___ ###

In [None]:
# The target is metastatic_diagnosis_period. The feature matrix leaves out
# the target, the patient identifier and the raw bmi column.
target_col = 'metastatic_diagnosis_period'
y = training_full[target_col]
X = training_full.drop(columns=['patient_id', 'bmi', target_col])

print(X.shape, y.shape)
X.head()

In [None]:
# Columns with dtype 'object' are the ones that get ordinal-encoded.
cols_categorical = X.columns[X.dtypes == 'object'].to_list()

# Categories not seen during fit are encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit(X[cols_categorical])

# Give the encoded frame X's own index. pd.concat(axis=1) aligns on index
# labels, so a default RangeIndex would silently misalign rows (and introduce
# NaN rows) whenever X's index is not exactly 0..n-1.
X_categorical = pd.DataFrame(
    encoder.transform(X[cols_categorical]),
    columns=cols_categorical,
    index=X.index,
)
X_enc = pd.concat(
    [X[X.columns[~X.columns.isin(cols_categorical)]], X_categorical],
    axis=1,
)

# 80/20 hold-out split, stratified on the target values.
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, random_state=seed, stratify=y, test_size=.2)
print('Training size: ', X_train.shape)
print('Testing size: ', X_test.shape)

X_train.head()

## __Recursive Feature Elimination using CatBoost Regression__ ##

Code has been adapted from these sources:
- [catboost](https://github.com/catboost/catboost/blob/master/catboost/tutorials/feature_selection/select_features_tutorial.ipynb)


In [None]:
# Baseline CatBoost model on all features. The hold-out split is the eval set
# and use_best_model=True keeps the iteration that scores best on it.
ctb_full_params = dict(random_state=seed, verbose=False, eval_metric='RMSE')
ctb_full = CatBoostRegressor(**ctb_full_params).fit(
    X_train, y_train, eval_set=(X_test, y_test), use_best_model=True
)


In [None]:
# Set to True to recompute the selection; otherwise the stored list below is used.
rerun_rfe = False
if rerun_rfe:
    rfe_ctb_full = ctb_full.select_features(
        X=X_train,
        y=y_train,
        eval_set=(X_test, y_test),
        features_for_select=X_train.columns.to_list(),  # every feature is a candidate
        num_features_to_select=10,                      # stop once this many features remain
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        steps=20,                                       # number of elimination rounds
        verbose=False,                                  # do not print model iterations
        train_final_model=False,                        # skip training a final model afterwards
        plot=True,                                      # plot the loss after the run
    )

    # Position of the lowest loss value in the loss graph; that many of the
    # first-eliminated features are dropped and the rest are kept.
    loss_by_step = rfe_ctb_full['loss_graph']['loss_values']
    n_todrop = np.argmin(loss_by_step)
    cols_to_keep = X.drop(rfe_ctb_full['eliminated_features_names'][:n_todrop], axis=1).columns.to_list()
else:
    # Stored feature list, presumably the result of an earlier run — TODO confirm.
    cols_to_keep = [
        'patient_age',
        'self_employed',
        'Average_of_Apr13',
        'Average_of_Sep13',
        'Average_of_Aug14',
        'Average_of_Aug16',
        'Average_of_May18',
        'age_group',
        'bmi_missing',
        'bmi_recoded',
        'payer_type',
        'breast_cancer_diagnosis_code',
        'metastatic_cancer_diagnosis_code',
        'icd_9',
        'Region',
        'age_encoded',
        'age_40s',
        'Division',
        'education_highschool',
        'Avg14',
        'education_college_or_above',
        'metastatic_first_novel_treatment',
        'metastatic_first_novel_treatment_type',
    ]

# Restrict the full, training and hold-out matrices to the selected features.
X_short = X_enc[cols_to_keep]
X_train_short = X_train[cols_to_keep]
X_test_short = X_test[cols_to_keep]

print(X_short.shape)
X_short.head()

## __TRAINING AN ENSEMBLE MODEL__ 

### _The following models were used_ ###
- CATBOOST
- LIGHTGBM
- LIGHTGBM WITH TWEEDIE
- XGB REGRESSOR
- RANDOM FOREST
- GRADIENT BOOSTING
- ADABOOST
- EXTRA TREES
- K NEIGHBOURS

In [None]:
# 1. CatBoost regression ("hypertuned" per the original note). The hold-out
#    split is the eval set; use_best_model=True keeps the best iteration on it.
ctb_params = dict(
    learning_rate=0.029143404630341967,
    depth=6,
    l2_leaf_reg=2.0599682627368536,
    bagging_temperature=0.13525392267548214,
    eval_metric='RMSE',
    random_state=seed,
    verbose=False,
)
ctb = CatBoostRegressor(**ctb_params).fit(
    X_train_short, y_train, eval_set=(X_test_short, y_test), use_best_model=True
)

In [None]:
# 2. LightGBM regression ("hypertuned" per the original note).
lgbm_params = dict(
    learning_rate=.06,
    max_depth=4,
    random_state=seed,
    verbose=-1,
)
lgbm = LGBMRegressor(**lgbm_params).fit(X_train_short, y_train)


In [None]:
# 3. LightGBM with the Tweedie objective ("hypertuned" per the original note).
lgbm_tw = LGBMRegressor(
    random_state=seed,
    objective="tweedie",
    # LightGBM's parameter is `tweedie_variance_power`. The previous
    # `tweedie_variance` is not a recognised name, so the value 1.1 never
    # reached the objective and the library default applied.
    tweedie_variance_power=1.1,
    n_estimators=40,
    metric="rmse",
    verbosity=-1,
).fit(X_train_short, y_train)


In [None]:
# 4. XGBoost regression ("hypertuned" per the original note).
xgb_params = dict(
    n_estimators=132,
    learning_rate=0.05075565490876331,
    max_depth=5,
    min_child_weight=3,
    gamma=0.000162007012716049,
    subsample=0.8103835140746048,
    colsample_bytree=0.6747747924854386,
    reg_alpha=0.0007688985669753765,
    reg_lambda=0.000177315807077408,
    random_state=seed,
)
xgb = XGBRegressor(**xgb_params).fit(X_train_short, y_train)


In [None]:
# 5. Random forest regression ("hypertuned" per the original note).
#    bootstrap=False: no bootstrap resampling of the rows.
rf_params = dict(
    n_estimators=81,
    max_depth=9,
    min_samples_split=17,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=False,
    random_state=seed,
)
rf = RandomForestRegressor(**rf_params).fit(X_train_short, y_train)



In [None]:
# 6. Gradient boosting regression ("hypertuned" per the original note).
gb_params = dict(
    n_estimators=170,
    learning_rate=0.05352169500172268,
    max_depth=4,
    min_samples_split=17,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=seed,
)
gb = GradientBoostingRegressor(**gb_params).fit(X_train_short, y_train)


In [None]:
# 7. AdaBoost regression ("hypertuned" per the original note).
ada_params = dict(
    n_estimators=100,
    learning_rate=0.00053678329477655296,
    loss='linear',
    random_state=seed,
)
ada = AdaBoostRegressor(**ada_params).fit(X_train_short, y_train)


In [None]:
# 8. Extra-trees regression ("hypertuned" per the original note).
et_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='log2',
    bootstrap=False,
    random_state=seed,
)
et = ExtraTreesRegressor(**et_params).fit(X_train_short, y_train)

In [None]:
# 9. K-nearest-neighbours regression ("hypertuned" per the original note).
# NOTE(review): leaf_size is presumably unused with algorithm='brute' — confirm.
kn_params = dict(
    n_neighbors=47,
    weights='uniform',
    algorithm='brute',
    leaf_size=72,
    p=2,
)
kn = KNeighborsRegressor(**kn_params).fit(X_train_short, y_train)

## __EVALUATE THE MODEL__ ###

### _Function Declarations_ ###

In [None]:
def rmse(y_true, y_pred):
    """Return the root of the mean squared error between the two inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def CV_predict(X, y, newdata, model, custom_cv, stratify_col=None, verbose=True, use_best_model=False):
    """Cross-validate ``model`` and collect out-of-fold and new-data predictions.

    For every fold produced by ``custom_cv.split(X, stratify_col)`` the model
    is refitted on the training part, scored with RMSE on the validation part,
    and used to predict ``newdata``. The ``model`` object passed in is refitted
    in place and is left fitted on the last fold.

    Args:
        X: Feature frame, indexed positionally through ``.iloc``.
        y: Target series with the same row order as ``X``.
        newdata: Feature frame predicted once per fold.
        model: Estimator exposing ``fit`` and ``predict``.
        custom_cv: Splitter exposing ``split(X, y)``.
        stratify_col: Labels passed to the splitter as its ``y`` argument.
        verbose: Print the score of each fold when true.
        use_best_model: When true, fit with the validation fold as
            ``eval_set`` and ``use_best_model=True``. The model's ``fit``
            must accept those keywords.

    Returns:
        A pair ``(oof_preds, test_preds)``. ``oof_preds`` is a Series of
        validation predictions keyed by positional row number and sorted by
        it. ``test_preds`` is a DataFrame with one ``fold{i}`` column per fold
        plus a ``mean`` column holding their row-wise average.
    """
    oof_preds, test_preds = {}, {}
    scores = []

    for fold, (train_ids, val_ids) in enumerate(custom_cv.split(X, stratify_col)):
        X_tr, y_tr = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        if use_best_model:
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                use_best_model=True,
                verbose=False)
        elif isinstance(model, XGBRegressor):
            # XGBoost takes `verbose` in fit(); keep it quiet.
            model.fit(X_tr, y_tr, verbose=False)
        else:
            model.fit(X_tr, y_tr)

        val_preds = model.predict(X_val)
        oof_preds.update(zip(val_ids, val_preds))
        test_preds[f'fold{fold}'] = model.predict(newdata)

        score = rmse(y_val, val_preds)
        scores.append(score)
        if verbose:
            # Only CatBoost models report a round count. The attribute can be
            # None, which cannot be formatted with a width spec, so fall back
            # to the plain line in that case.
            best_iter = None
            if isinstance(model, CatBoostRegressor):
                best_iter = getattr(model, 'best_iteration_', None)
            if best_iter is not None:
                print(f'Fold #{fold:>2}: {score:.5f} ({best_iter:>4} rounds)')
            else:
                print(f'Fold #{fold:>2}: {score:.5f}')

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions

    # Sorting by positional row number puts the predictions back in the row
    # order of y (assumes each row lands in exactly one validation fold).
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} ± {np.std(scores):.5f}')
    print(f'OOF score: {rmse(y, oof_preds):.5f}')

    return oof_preds, test_preds

### _Stratified K Fold Validation_

In [None]:
# Ten shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Stratification label: 1 when the diagnosis code starts with '17', else 0.
icd = training_full['breast_cancer_diagnosis_code'].str.startswith('17').astype(int)

# Encode the test frame with the encoder fitted on the training features.
# The encoded frame reuses df_test_full's index: pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex would silently misalign rows whenever
# that index is not exactly 0..n-1.
test_categorical = pd.DataFrame(
    encoder.transform(df_test_full[cols_categorical]),
    columns=cols_categorical,
    index=df_test_full.index,
)
df_test_enc = pd.concat(
    [
        df_test_full[df_test_full.columns[~df_test_full.columns.isin(cols_categorical)]],
        test_categorical,
    ],
    axis=1,
)

# Same columns, in the same order, as the training matrix used by the models.
df_test_short = df_test_enc[X_short.columns]

In [None]:

# Arguments shared by every cross-validation run below.
cv_args = dict(X=X_short, y=y, newdata=df_test_short, custom_cv=cv, stratify_col=icd)

# CatBoost is the only model run with the validation fold as eval set.
oof_preds_ctb, test_preds_ctb = CV_predict(model=ctb, use_best_model=True, **cv_args)
oof_preds_xgb, test_preds_xgb = CV_predict(model=xgb, **cv_args)
oof_preds_rf, test_preds_rf = CV_predict(model=rf, **cv_args)
oof_preds_gb, test_preds_gb = CV_predict(model=gb, **cv_args)
oof_preds_ada, test_preds_ada = CV_predict(model=ada, **cv_args)
oof_preds_et, test_preds_et = CV_predict(model=et, **cv_args)
oof_preds_kn, test_preds_kn = CV_predict(model=kn, **cv_args)
oof_preds_lgbm_tw, test_preds_lgbm_tw = CV_predict(model=lgbm_tw, **cv_args)
oof_preds_lgbm, test_preds_lgbm = CV_predict(model=lgbm, **cv_args)


### PREPPING THE TEST DATASET ###

In [None]:
# (out-of-fold predictions, per-fold test predictions) for each base model,
# in the column order used by the stacking frames.
stacked_models = {
    'model1': (oof_preds_ctb, test_preds_ctb),
    'model2': (oof_preds_xgb, test_preds_xgb),
    'model3': (oof_preds_lgbm, test_preds_lgbm),
    'model4': (oof_preds_rf, test_preds_rf),
    'model5': (oof_preds_gb, test_preds_gb),
    'model6': (oof_preds_ada, test_preds_ada),
    'model7': (oof_preds_et, test_preds_et),
    'model8': (oof_preds_lgbm_tw, test_preds_lgbm_tw),
    'model9': (oof_preds_kn, test_preds_kn),
}

# One column of out-of-fold predictions per model.
oof_preds_combined = pd.DataFrame(
    {name: oof for name, (oof, _) in stacked_models.items()}
)

# One column per model holding the fold-averaged prediction on the test set.
test_preds_combined = pd.DataFrame(
    {name: fold_preds['mean'] for name, (_, fold_preds) in stacked_models.items()}
)
test_preds_combined.head(5)


In [None]:
# Check rmse: out-of-fold RMSE against y, computed separately for each model column.
oof_preds_combined.apply(lambda x: rmse(y, x))

### _Ridge Cross Validation_

In [None]:
# Candidate regularisation strengths for the stacking meta-model.
ridge_alphas = [.001, .01, .05, .1, 1, 5, 10, 20, 50, 100]
meta_cv = RidgeCV(alphas=ridge_alphas, scoring='neg_root_mean_squared_error', cv=5)

# Fit the meta-model on the base models' out-of-fold predictions.
meta_cv.fit(oof_preds_combined, y)

# Both figures are computed on the same rows the meta-model was fitted on.
print('R2 =', meta_cv.score(oof_preds_combined, y))
print('RMSE =', rmse(y, meta_cv.predict(oof_preds_combined)))
meta_cv.coef_

### _Ridge Regression_

In [None]:
# Re-run a plain Ridge, using the alpha RidgeCV selected, through the same
# folds to get out-of-fold and test-set predictions for the stacked model.
ridge = Ridge(alpha=meta_cv.alpha_, random_state=seed)
oof_preds_final, test_preds_final = CV_predict(
    X=oof_preds_combined,
    y=y,
    newdata=test_preds_combined,
    model=ridge,
    custom_cv=cv,
    stratify_col=icd,
)

### _Creating a new CSV file to upload_

In [None]:
# Start from the template and fill in the fold-averaged stacked prediction,
# with negative predictions raised to 0.
sub_final = solution_template.copy()
sub_final['metastatic_diagnosis_period'] = test_preds_final['mean'].clip(lower=0)
sub_final.to_csv('upload.csv', index=False)
sub_final.head()

In [None]:
# Two-column view (patient id and race) of the cleaned training frame.
df = training_full[['patient_id', 'patient_race']]


In [None]:
# Display the id/race frame built in the previous cell.
df