# Libraries and Configuration

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Whether the external stroke data set is appended to the training split.
    add_extra: bool = True
    # Random seed handed to every splitter and model that accepts one.
    seed: int = 4121995

In [None]:
# Competition files: training set, test set and the sample submission.
BASE_DIR = '/kaggle/input/playground-series-s3e2/'
train, test, sub = (
    pd.read_csv(BASE_DIR + filename)
    for filename in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# External stroke data set, optionally appended to the training split later
# (see CFG.add_extra).
extra = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
from IPython.display import display

# Data

In [None]:
# Summary statistics (default describe()) for each of the three data sets.
for frame in (train, test, extra):
    display(frame.describe())

In [None]:
# Summary of the object-dtype columns for each of the three data sets.
for frame in (train, test, extra):
    display(frame.describe(include='object'))

In [None]:
# Number of missing values per column in the external data set.
extra.isna().sum()

# Split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 hold-out split of the competition training data.
ssf = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=CFG.seed)

# Target and feature frame ('id' and the target are not features).
y = train['stroke'].copy()
X = train.drop(columns=['id', 'stroke']).copy()

# n_splits=1, so the first (and only) split is the one we want.
train_idx, val_idx = next(ssf.split(X, y))

X_train = X.iloc[train_idx]
X_val = X.iloc[val_idx]
y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]

# Sanity check: split sizes and positive rate on each side.
X_train.shape, X_val.shape, y_train.mean(), y_val.mean()

In [None]:
# Optionally append the external stroke data set to the training split only
# (the validation split stays untouched).
if CFG.add_extra:
    # ignore_index=True gives the combined frames a fresh 0..n-1 index.
    # Without it the row labels of `extra` collide with labels already present
    # in the training split, and any later label-based lookup or alignment on
    # X_train / y_train becomes ambiguous. Both objects are concatenated in
    # the same order, so features and targets stay aligned row by row.
    X_train = pd.concat(
        [X_train, extra.drop(['id', 'stroke'], axis=1)],
        axis=0,
        ignore_index=True,
    )
    y_train = pd.concat([y_train, extra.stroke], axis=0, ignore_index=True)

# Train

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from category_encoders.leave_one_out import LeaveOneOutEncoder

In [None]:
# First rows of the training features, as a quick visual check.
X_train.head()

In [None]:
# Columns treated as categorical; every other feature column is numeric.
cat_feats = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
feats = list(X.columns)
num_feats = [name for name in feats if name not in cat_feats]

In [None]:
# (rows, columns) of the training features at this point in the notebook.
X_train.shape

In [None]:
# Preprocessing: leave-one-out target encoding of the categorical columns,
# followed by imputation (SimpleImputer defaults) and standardisation of every
# column.
cat_tfms = LeaveOneOutEncoder(cols=cat_feats, random_state=CFG.seed, sigma=0.05)
feat_tfms = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

pl = Pipeline([
    ('cat', cat_tfms),
    ('all', feat_tfms)
])

# Fit on the full competition training set; X_pp feeds the cross-validation
# loops and test_pp the submissions.
X_pp = pl.fit_transform(X, y)
test_pp = pl.transform(test.drop('id', axis=1))

# Hold-out matrices used by the RandomForest / CatBoost / LightGBM /
# LogisticRegression cells further down, which reference X_train_pp and
# X_val_pp but never define them. They go through the same fitted pipeline as
# test_pp so all models share one feature space.
# NOTE(review): the encoder was fitted on all of X/y, which includes the
# validation rows, so scores on X_val_pp are somewhat optimistic.
# reset_index guards against duplicate row labels when extra data was appended.
X_train_pp = pl.transform(X_train.reset_index(drop=True))
X_val_pp = pl.transform(X_val)

In [None]:
# cat_tfms = OneHotEncoder()

# na_feats = ['bmi']

# cont_feats = ['age', 'avg_glucose_level']

# na_tfms = Pipeline([
#     ('mean_imputer', SimpleImputer()),
# #     ('scaler', StandardScaler()),
#     ('indicator', MissingIndicator()),
# ])

# # cont_tfms = StandardScaler()

# pp = ColumnTransformer(transformers=[
#     ('cat', cat_tfms, cat_feats),
#     ('na', na_tfms, na_feats),
# #     ('cont', cont_tfms, cont_feats)
# ], remainder='passthrough')

# pl = Pipeline(steps=[('preprocessor', pp)])

# X_pp = pl.fit_transform(X)


In [None]:
def score(clf, X, y):
    """Return the ROC AUC of ``clf`` on ``(X, y)``.

    The ranking score is column 1 of ``clf.predict_proba(X)``.
    """
    positive_proba = clf.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

## Lasso Regression

In [None]:
def norm(x):
    """Min-max scale ``x`` to the range [0, 1].

    Parameters
    ----------
    x : array-like with ``min``/``max`` methods (NumPy array, pandas Series)
        Values to rescale.

    Returns
    -------
    Same type as ``x``: ``(x - min) / (max - min)``. When every value is
    identical the range is zero; in that case all zeros are returned instead
    of the NaNs a 0/0 division would produce.
    """
    lowest = x.min()
    span = x.max() - lowest
    if span == 0:
        # Constant input: nothing to scale, avoid dividing by zero.
        return x - lowest
    return (x - lowest) / span

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from tqdm.notebook import tqdm


# Outer 5-fold CV produces out-of-fold (OOF) predictions and averaged test
# predictions; the inner repeated 10x10 CV is handed to LassoCV so it can pick
# alpha on each outer training fold.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=CFG.seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CFG.seed)
n_folds = skf.get_n_splits()

test_preds = np.zeros(len(test_pp), dtype=np.float32)
oof_preds = np.zeros(len(y), dtype=np.float32)
scores = []
scores_norm = []

# Fold-local names (X_tr, y_tr, ...) are used on purpose: reusing X_train /
# X_val / y_train / y_val here would overwrite the hold-out split that the
# later model cells still rely on.
for fold_train_idx, fold_val_idx in tqdm(skf.split(X_pp, y), total=n_folds):

    X_tr, X_va = X_pp[fold_train_idx], X_pp[fold_val_idx]
    # The splitter yields positions, so index the Series positionally.
    y_tr, y_va = y.iloc[fold_train_idx], y.iloc[fold_val_idx]

    # `normalize` is no longer accepted by LassoCV; the old value (False) was
    # the default behaviour, so dropping it changes nothing.
    model_llcv = LassoCV(precompute="auto",
                         fit_intercept=True,
                         max_iter=1000,
                         verbose=False,
                         eps=1e-04,
                         cv=list(rskf.split(X_tr, y_tr)),
                         n_alphas=1000,
                         n_jobs=8)

    model_llcv.fit(X_tr, y_tr)

    # Lasso is a regressor: clip its output into the probability range.
    val_preds = np.clip(model_llcv.predict(X_va), 0, 1)

    oof_preds[fold_val_idx] += val_preds

    # Named fold_score so the module-level score() helper is not overwritten.
    fold_score = roc_auc_score(y_va, val_preds)
    scores.append(fold_score)

    # Fold score before normalization
    print('OOF Score before norm:', fold_score)

    # Fold score after min-max normalization of the fold predictions
    val_preds_norm = norm(val_preds)
    fold_score_norm = roc_auc_score(y_va, val_preds_norm)
    scores_norm.append(fold_score_norm)

    print('OOF Score after norm:', fold_score_norm)

    # Average the test predictions over the outer folds
    test_preds += model_llcv.predict(test_pp) / n_folds

In [None]:
import matplotlib.pyplot as plt

# Histogram of the min-max scaled out-of-fold predictions.
plt.hist(norm(oof_preds))

In [None]:
# Histogram of the min-max scaled, fold-averaged test predictions.
plt.hist(norm(test_preds))

In [None]:
# Write the min-max scaled, fold-averaged predictions as a submission file.
scaled_test_preds = norm(test_preds)
sub['stroke'] = scaled_test_preds
sub.to_csv('ll_submission.csv', index=False)

In [None]:
# Diagnostics of the most recently fitted LassoCV model (last outer fold).
print(f" Best alpha value: {model_llcv.alpha_:.10f}")
print(f" Intercept: {model_llcv.intercept_:.10f}")
print(f" LassoCV score: {model_llcv.score(X_pp, y):.10f}")

In [None]:
# ROC AUC of the last-fitted LassoCV model on the whole training matrix.
# This includes rows the model was trained on, so it is not an unbiased estimate.
roc_auc_score(y, np.clip(model_llcv.predict(X_pp), 0, 1))

## Random Forest

In [None]:
# Shallow, heavily regularised forest with balanced class weights.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_leaf=50,
    max_depth=10,
    max_samples=None,
    class_weight='balanced',
    random_state=CFG.seed,
).fit(X_train_pp, y_train)

# (train AUC, validation AUC)
score(rf, X_train_pp, y_train), score(rf, X_val_pp, y_val)

## Boosting

In [None]:
import lightgbm as lgbm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier

In [None]:
from scipy.misc import derivative
import xgboost as xgb
def focal_loss(alpha, gamma):
    """Build a focal-loss objective for XGBoost's scikit-learn estimators.

    Parameters
    ----------
    alpha : float
        Weight of the positive class; negatives are weighted ``1 - alpha``.
    gamma : float
        Focusing exponent applied to ``1 - p_t``.

    Returns
    -------
    callable
        ``loss_func(y_true, y_pred) -> (grad, hess)`` where ``y_pred`` holds
        raw margins (pre-sigmoid). The argument order follows the scikit-learn
        style objective contract, in which labels are passed first.

    Notes
    -----
    Gradient and Hessian are central finite differences of the loss, computed
    locally so the objective does not depend on ``scipy.misc.derivative``.
    """
    def loss_func(y_true, y_pred):
        a, g = alpha, gamma
        # Work in float64: with a 1e-6 step, float32 inputs would lose almost
        # all precision in the finite differences below.
        labels = np.asarray(y_true, dtype=np.float64)
        margins = np.asarray(y_pred, dtype=np.float64)

        def get_loss(z):
            # log(sigmoid(z)) and log(1 - sigmoid(z)) via logaddexp stay finite
            # for large |z|, where the naive log(p) / log(1 - p) hit -inf.
            log_p = -np.logaddexp(0, -z)
            log_not_p = -np.logaddexp(0, z)
            p = np.exp(log_p)
            class_weight = a * labels + (1 - a) * (1 - labels)
            modulating = (1 - (labels * p + (1 - labels) * (1 - p))) ** g
            log_likelihood = labels * log_p + (1 - labels) * log_not_p
            return -class_weight * modulating * log_likelihood

        dx = 1e-6
        loss_up = get_loss(margins + dx)
        loss_mid = get_loss(margins)
        loss_down = get_loss(margins - dx)
        grad = (loss_up - loss_down) / (2 * dx)
        hess = (loss_up - 2 * loss_mid + loss_down) / dx ** 2
        return grad, hess
    return loss_func

# xgb = xgb.XGBClassifier(objective=focal_loss(alpha=0.25, gamma=1))

In [None]:
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# 10-fold out-of-fold predictions for a default XGBoost classifier.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=CFG.seed)

oof_preds = np.zeros(len(y), dtype=np.float32)

# Fold-local names keep the hold-out X_train / X_val / y_train / y_val intact
# for the cells that evaluate on the hold-out split. The former third argument
# to split() (groups) is dropped: it was just a copy of y.
for fold_train_idx, fold_val_idx in tqdm(skf.split(X_pp, y), total=skf.get_n_splits()):
    X_tr, X_va = X_pp[fold_train_idx], X_pp[fold_val_idx]
    # The splitter yields positions, so index the Series positionally.
    y_tr = y.iloc[fold_train_idx]

    xgb = XGBClassifier(n_estimators=100, seed=CFG.seed)
    xgb.fit(X_tr, y_tr)

    oof_preds[fold_val_idx] += xgb.predict_proba(X_va)[:, 1]

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Probability calibration of a default XGBoost classifier via 7-fold CV.
skf = StratifiedKFold(
    n_splits=7,
    shuffle=True,
    random_state=CFG.seed,
)

# ll = Lasso(alpha=0.0002216157, random_state=CFG.seed)
# lr = LogisticRegression()
xgb = XGBClassifier(seed=CFG.seed)
calib = CalibratedClassifierCV(xgb, cv=skf)

calib.fit(X_pp, y)

In [None]:
# ROC AUC of the calibrated model on the data it was fitted on (in-sample).
score(calib, X_pp, y)

In [None]:
from sklearn.model_selection import GridSearchCV


# Candidate values for XGBoost's `gamma` parameter.
params = {'gamma': [1.0, 1.5, 2.0, 2.5, 3.0]}

# The search object is only constructed here; it is not fitted in this cell.
gridsearch = GridSearchCV(estimator=XGBClassifier(), param_grid=params)

In [None]:
# ROC AUC of the current out-of-fold predictions against the training target.
# NOTE(review): oof_preds is whatever the most recently run CV loop produced
# (the XGBoost loop if cells are executed top to bottom).
roc_auc_score(y, oof_preds)

In [None]:
# XGBoost trained on the full training matrix with the custom focal-loss
# objective (alpha=0.25, gamma=1).
focal_objective = focal_loss(alpha=0.25, gamma=1)
xgb = XGBClassifier(objective=focal_objective, seed=CFG.seed).fit(X_pp, y)

# In-sample AUC
score(xgb, X_pp, y)#, score(xgb, X_val_pp, y_val)

In [None]:
# CatBoost on the hold-out training split.
cat = CatBoostClassifier(
    iterations=100,
    verbose=0,
    random_state=CFG.seed,
).fit(X_train_pp, y_train)

# (train AUC, validation AUC)
score(cat, X_train_pp, y_train), score(cat, X_val_pp, y_val)

In [None]:
# LightGBM on the hold-out training split.
# NOTE: this rebinds the name `lgbm`, which was the lightgbm module alias.
lgbm = LGBMClassifier(n_estimators=15, random_state=CFG.seed).fit(X_train_pp, y_train)

# (train AUC, validation AUC)
score(lgbm, X_train_pp, y_train), score(lgbm, X_val_pp, y_val)

In [None]:
# Equal-weight blend of the four tree models on the validation split.
blend = [(rf, 0.25), (xgb, 0.25), (cat, 0.25), (lgbm, 0.25)]
ens_preds = sum(
    weight * model.predict_proba(X_val_pp)[:, 1]
    for model, weight in blend
)

roc_auc_score(y_val, ens_preds)

## Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression baseline on the hold-out training split.
lr = LogisticRegression(solver='liblinear', max_iter=10000).fit(X_train_pp, y_train)

# (train AUC, validation AUC)
score(lr, X_train_pp, y_train), score(lr, X_val_pp, y_val)

In [None]:
# Weighted blend of all five models on the validation split.
blend = [(rf, 0.1), (xgb, 0.35), (cat, 0.25), (lgbm, 0.25), (lr, 0.05)]
ens_preds = sum(
    weight * model.predict_proba(X_val_pp)[:, 1]
    for model, weight in blend
)

roc_auc_score(y_val, ens_preds)

# Submission

In [None]:
# The pipeline was fitted on features without 'id', so drop it here as well.
test_pp = pl.transform(test.drop('id', axis=1))

# Weighted blend of the five classifiers on the test set.
# NOTE(review): the expression originally ended with a dangling "+ 0.5*"
# (a syntax error) whose operand was never written. The unfinished term is
# left out; the remaining weights sum to 0.5, which does not affect a
# rank-based metric such as ROC AUC. Add the intended sixth model here once it
# is known.
test_preds = 0.05*rf.predict_proba(test_pp)[:, 1] \
            + 0.2*xgb.predict_proba(test_pp)[:, 1] \
            + 0.1*cat.predict_proba(test_pp)[:, 1] \
            + 0.1*lgbm.predict_proba(test_pp)[:, 1] \
            + 0.05*lr.predict_proba(test_pp)[:, 1]

sub['stroke'] = test_preds
sub.to_csv('submission.csv', index=False)

In [None]:
# Submission from the last-fitted LassoCV model, clipped into [0, 1].
# Note: this writes to the same file name as the earlier normalised Lasso
# submission.
test_features = test.drop(columns='id')
test_pp = pl.transform(test_features)

lasso_test_preds = model_llcv.predict(test_pp)
sub['stroke'] = np.clip(lasso_test_preds, 0, 1)
sub.to_csv('ll_submission.csv', index=False)

In [None]:
# Submission from the calibrated XGBoost model (positive-class probability).
test_features = test.drop(columns='id')
test_pp = pl.transform(test_features)

calibrated_proba = calib.predict_proba(test_pp)
sub['stroke'] = calibrated_proba[:, 1]
sub.to_csv('xgb_submission.csv', index=False)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the Lasso test predictions after a power transform (1/1.3).
# The raw regression output can be negative, and a fractional power of a
# negative number is NaN, so clip into [0, 1] first, as is done wherever else
# these predictions are used.
plt.hist(np.clip(model_llcv.predict(test_pp), 0, 1)**(1/1.3));