In [None]:
# def warn(*args, **kwargs):
#     pass
# import warnings
# warnings.warn = warn
import pandas as pd
import seaborn as sns
import numpy as np


In [None]:
# Read the competition's training table into memory.
train_csv_path = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Display the loaded training frame as the cell output.
df

In [None]:
# Print the frame summary produced by DataFrame.info().
df.info()

In [None]:
# Target is the "Class" column; the feature frame is everything else.
Y = df["Class"]
X = df.drop(columns="Class")

In [None]:
# Display the feature frame (df without the "Class" column).
X

# Preparing Custom Model 

In [None]:
from sklearn.metrics import log_loss
def get_metric(true,pred):
    """Balanced binary log loss.

    The negative log-likelihood is averaged separately over the rows labelled
    0 (using column 0 of ``pred``) and the rows labelled 1 (using column 1 of
    ``pred``); the two class averages are then averaged with equal weight, so
    each class contributes half of the score regardless of its size.

    Parameters
    ----------
    true : array-like of shape (n_samples,)
        Ground-truth labels. Only entries equal to 0 or 1 are scored.
    pred : array-like of shape (n_samples, >=2)
        Predicted probabilities; column 0 is used for class 0 and column 1
        for class 1. Values are clipped to [1e-5, 1 - 1e-5] before the log.

    Returns
    -------
    numpy.float64
        ``0.5 * mean(-log p0 | true == 0) + 0.5 * mean(-log p1 | true == 1)``.

    Raises
    ------
    ValueError
        If ``pred`` is not 2-D with at least two columns, if ``true`` and
        ``pred`` have different lengths, or if either class has no rows
        (the per-class mean would be undefined).
    """
    lower = 10 ** -5
    upper = 1 - 10 ** -5

    labels = np.asarray(true).ravel()
    probs = np.asarray(pred, dtype=float)

    if probs.ndim != 2 or probs.shape[1] < 2:
        raise ValueError(
            "pred must be 2-D with one column per class, got shape %s" % (probs.shape,)
        )
    if labels.shape[0] != probs.shape[0]:
        raise ValueError(
            "true and pred disagree on the number of rows: %d vs %d"
            % (labels.shape[0], probs.shape[0])
        )

    # Clip once, vectorised, instead of one Python lambda call per row.
    clipped = np.clip(probs[:, :2], lower, upper)
    class0_probs = clipped[labels == 0, 0]
    class1_probs = clipped[labels == 1, 1]

    # An empty class makes its mean undefined; fail with a readable message
    # rather than an unpacking error or a silent NaN.
    if class0_probs.size == 0 or class1_probs.size == 0:
        raise ValueError(
            "balanced log loss needs at least one sample of each class "
            "(class 0: %d, class 1: %d)" % (class0_probs.size, class1_probs.size)
        )

    return -np.log(class0_probs).mean() / 2 + -np.log(class1_probs).mean() / 2

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import  SVMSMOTE,SMOTE
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import QuantileTransformer
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA
from sklearn.base import ClassifierMixin,BaseEstimator
# Columns that SimpleModel selects from the input frame.
# NOTE(review): 'CD ' and 'FD ' carry a trailing space, presumably to match
# the CSV headers exactly -- confirm before "cleaning" them.
# A smaller subset tried earlier:
# 'AB', 'AF', 'BQ', 'CR', 'DA', 'DE', 'DF', 'DH', 'GF', 'GL'
_measurement_columns = [
    'AB', 'AF', 'AM', 'AR', 'BC',
    'BN', 'BQ', 'CC', 'CD ', 'CR',
    'DA', 'DE', 'DF', 'DH', 'DI',
    'DL', 'DU', 'EB', 'EE', 'EH',
    'FD ', 'FE', 'FL', 'GF', 'GL',
]
# 'EJ' is one-hot encoded and 'Id' is dropped inside SimpleModel.
feat = [*_measurement_columns, 'EJ', 'Id']
# feat =  ['AB', 'CR', 'DU', 'EH', 'EU', 'AY', 'EB',"Id","EJ"]
class SimpleModel(BaseEstimator, ClassifierMixin):
    """Preprocessing, resampling and a classifier bundled as one estimator.

    ``fit`` selects the module-level ``feat`` columns, one-hot encodes ``EJ``,
    drops ``EJ`` and ``Id``, median-imputes, quantile-transforms to a normal
    output distribution, under-samples with ``ClusterCentroids``, over-samples
    with ``SVMSMOTE`` and finally fits ``model(**model_params)``.
    ``predict``/``predict_proba`` replay the fitted preprocessing (without any
    resampling) and return the wrapped classifier's ``predict_proba`` output.

    Parameters
    ----------
    model : callable
        Classifier class (or factory); called as ``model(**model_params)``.
        The resulting object must provide ``fit`` and ``predict_proba``.
    smote_params : dict, optional
        Resampling settings. Recognised keys: ``"k_neighbors"`` (forwarded to
        ``SVMSMOTE``) and ``"sampling_strategy"`` (forwarded to
        ``ClusterCentroids``). A key that is absent leaves the corresponding
        library default in place instead of raising ``KeyError``.
    model_params : dict, optional
        Keyword arguments for ``model``.

    Attributes set by ``fit``: ``features_used``, ``train_df``, ``train_y``.
    Attribute set by ``predict``/``predict_proba``: ``test_df``.
    """

    def __init__(self,model,smote_params=None,model_params=None):
        # Constructor arguments are stored untouched so that get_params()
        # hands back exactly what was passed in (sklearn's clone() checks this).
        self.model = model
        self.smote_params = smote_params
        self.model_params = model_params

        resampling = smote_params if smote_params is not None else {}
        estimator_kwargs = model_params if model_params is not None else {}

        # Forward only the keys that were supplied; anything missing falls
        # back to the resampler's own default.
        smote_kwargs = {}
        if "k_neighbors" in resampling:
            smote_kwargs["k_neighbors"] = resampling["k_neighbors"]
        under_kwargs = {}
        if "sampling_strategy" in resampling:
            under_kwargs["sampling_strategy"] = resampling["sampling_strategy"]

        self.oh = OneHotEncoder(sparse_output=False)
        self.si = SimpleImputer(strategy="median")
        self.sm = SVMSMOTE(**smote_kwargs)
        self.under = ClusterCentroids(**under_kwargs)
        self.std = QuantileTransformer(output_distribution="normal")
        self.rnd = model(**estimator_kwargs)

    def _prepare(self, frame, fitting):
        """Apply the shared preprocessing; fit each transformer first when ``fitting`` is true."""
        # Explicit copy: the caller's frame is never modified and pandas has
        # no reason to emit SettingWithCopyWarning on the assignments below.
        data = frame[feat].copy()

        ej_column = data["EJ"].values.reshape(-1, 1)
        encoded = self.oh.fit_transform(ej_column) if fitting else self.oh.transform(ej_column)
        data = data.drop(columns=["EJ", "Id"])
        # One "oh_<i>" column per encoded category (oh_0, oh_1 for a
        # two-category EJ) instead of assuming exactly two categories.
        for position in range(encoded.shape[1]):
            data[f"oh_{position}"] = encoded[:, position]

        # Imputation, then quantile transform; rebuild the frame after each
        # step so column names and the row index are preserved.
        for step in (self.si, self.std):
            values = step.fit_transform(data) if fitting else step.transform(data)
            data = pd.DataFrame(values, columns=data.columns, index=data.index)
        return data

    def fit(self,train_x,train_y):
        """Fit preprocessing, resample the training data and fit the classifier.

        Returns ``self`` so calls can be chained, as sklearn estimators do.
        """
        data = self._prepare(train_x, fitting=True)
        self.features_used = data.columns.values
        # Under-sample first, then over-sample with SVMSMOTE.
        data, train_y = self.under.fit_resample(data, train_y)
        data, train_y = self.sm.fit_resample(data, train_y)
        self.train_df = data
        self.train_y = train_y
        self.rnd.fit(data, train_y)
        return self

    def predict(self,test):
        """Return class probabilities (not labels).

        Kept identical to ``predict_proba`` because the rest of the notebook
        feeds ``predict`` output straight into ``get_metric``.
        """
        return self.predict_proba(test)

    def predict_proba(self,test):
        """Preprocess ``test`` with the fitted transformers and return ``self.rnd.predict_proba``."""
        data = self._prepare(test, fitting=False)
        self.test_df = data
        return self.rnd.predict_proba(data)


# Hyperparameter Optimization (Optuna)

In [None]:
import optuna
# Optuna study for the SimpleModel search; lower objective values are better.
study = optuna.create_study(
    study_name="optimise_simple_model",
    direction="minimize",
)
def custom_loss(a,b):
    """Wrap ``get_metric`` as a ``(name, value)`` pair.

    Note the argument swap: ``b`` is forwarded as the labels and ``a`` as the
    predictions, i.e. the result is ``("Balanced_log_loss", get_metric(b, a))``.
    """
    score = get_metric(b, a)
    return "Balanced_log_loss", score
def objective(trial):
    """Optuna objective: mean balanced log loss over 5 stratified folds.

    Samples XGBoost hyper-parameters and the SVMSMOTE ``k_neighbors`` from
    ``trial``, fits a ``SimpleModel(XGBClassifier, ...)`` on each training
    fold of the module-level ``X``/``Y`` and scores the held-out fold with
    ``get_metric``.

    Parameters
    ----------
    trial : optuna trial object
        Used only through ``suggest_float`` / ``suggest_int``.

    Returns
    -------
    float
        Mean of the per-fold losses (``pandas.Series.mean``).
    """
    param_tuning = {
        'learning_rate': trial.suggest_float("learning_rate",0.01, 0.05),
        'max_depth': trial.suggest_int("max_depth",10, 20),
        'min_child_weight':  trial.suggest_int("min_child_weight",1, 5),
        'subsample': trial.suggest_float("subsample",0.5, 0.7),
        'colsample_bytree': trial.suggest_float("colsample_bytree",0.5, 0.7),
        'n_estimators' : trial.suggest_int("n_estimators",700, 1000),
        'scale_pos_weight': trial.suggest_int("scale_pos_weight",30, 150),
        # 'feval' was dropped: it is a keyword of xgboost.train, not an
        # XGBClassifier parameter, and only duplicated 'eval_metric'.
        # NOTE(review): SimpleModel.fit passes no eval_set, so this metric is
        # presumably never evaluated during training -- confirm.
        'disable_default_eval_metric': True,
        'eval_metric': custom_loss
        }
    smote_k = trial.suggest_int("k_neighbors",3,7)
    # SimpleModel reads both keys; "sampling_strategy" was missing here, which
    # made every trial fail with KeyError. 0.25 matches the training cell.
    resampling = {"k_neighbors": smote_k, "sampling_strategy": 0.25}

    folds = StratifiedKFold(5)
    fold_losses = []
    for train_idx, test_idx in folds.split(X, Y):
        X_train, y_train = X.iloc[train_idx], Y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], Y.iloc[test_idx]
        md = SimpleModel(XGBClassifier, resampling, {**param_tuning})
        md.fit(X_train, y_train)
        # SimpleModel.predict returns class probabilities.
        fold_losses.append(get_metric(y_test, md.predict(X_test)))
    return pd.Series(fold_losses).mean()

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study.optimize(objective, n_trials=100,n_jobs=-1)

# Training Model

In [None]:
from sklearn.model_selection import train_test_split
# Train the XGBoost SimpleModel on a stratified 90/10 split and record the
# balanced log loss on both parts. X_train/X_test/y_train/y_test, k and md
# stay in the notebook namespace for the cells below.
results_df = []
for _ in range(1):
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns="Class"),
        df["Class"],
        stratify=df["Class"],
        test_size=0.1,
    )

    # Hand-picked parameters (alternative: best = study.best_params).
    best = {
        'learning_rate': 0.01539347781839023,
        'max_depth': 4,
        'min_child_weight': 3,
        'subsample': 0.592744410208758,
        'colsample_bytree': 0.5058950894912562,
        'reg_lambda': 0.3,
        'n_estimators': 300,
        'k_neighbors': 4,
    }
    # Other candidate settings kept for reference:
    #   {'learning_rate': 0.041644644359684846, 'max_depth': 4, 'min_child_weight': 3,
    #    'subsample': 0.6948901434704582, 'colsample_bytree': 0.5423729152386366,
    #    'n_estimators': 393, 'scale_pos_weight': 13, 'k_neighbors': 3}
    #   {'learning_rate': 0.05227123248453593, 'max_depth': 30, 'min_child_weight': 5,
    #    'subsample': 0.6456166254107153, 'colsample_bytree': 0.50388824472474,
    #    'n_estimators': 406, 'scale_pos_weight': 10, 'k_neighbors': 5}
    #   {'learning_rate': 0.07586433101301165, 'max_depth': 15, 'min_child_weight': 5,
    #    'subsample': 0.6740763398779215, 'colsample_bytree': 0.6493030093729031,
    #    'n_estimators': 322, 'scale_pos_weight': 32, 'k_neighbors': 7}

    # k_neighbors configures the resampler, not the classifier: take it out
    # of the dict before the rest is passed on as model parameters.
    k = best.pop('k_neighbors')
    md = SimpleModel(XGBClassifier, {"k_neighbors": k, 'sampling_strategy': 0.25}, best)
    md.fit(X_train, y_train)

    # Notes from earlier runs (parameter set, then the remark that followed it):
    #   {'learning_rate': 0.06739940833623097, 'max_depth': 21, 'min_child_weight': 1,
    #    'subsample': 0.6557470933567124, 'colsample_bytree': 0.6038479944539996,
    #    'n_estimators': 245, 'k_neighbors': 3}
    #   "best gave me 0.2 on test set"
    #   {'learning_rate': 0.09158459955906706, 'max_depth': 7, 'min_child_weight': 1,
    #    'subsample': 0.637028962750457, 'colsample_bytree': 0.5875347274927317,
    #    'n_estimators': 169, 'scale_pos_weight': 33, 'k_neighbors': 3}
    #   "gave me 0.12 on test"
    #   {'learning_rate': 0.05883578995073209, 'max_depth': 30, 'min_child_weight': 5,
    #    'subsample': 0.5942208622115294, 'colsample_bytree': 0.5725763835733881,
    #    'n_estimators': 473, 'scale_pos_weight': 115, 'k_neighbors': 4}
    #   "0.11 -- consistent"

    # SimpleModel.predict returns probabilities, which get_metric expects.
    test_loss = get_metric(y_test, md.predict(X_test))
    train_loss = get_metric(y_train, md.predict(X_train))
    results_df.append({"test": test_loss, "train": train_loss})

# Long format: one row per (split, loss) pair, columns "index" and 0.
distribution = (
    pd.DataFrame(results_df)
    .stack()
    .to_frame()
    .reset_index(0, drop=True)
    .reset_index()
)
sns.histplot(data=distribution, x=0, hue="index", kde=True)

In [None]:
# print(get_metric(y_test,md.predict(X_test)),get_metric(y_train,md.predict(X_train)))

In [None]:
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
# Random-forest flavour of the same pipeline; reuses k from the XGBoost cell.
rf_resampling = {"k_neighbors": k, 'sampling_strategy': 0.25}
rf_params = {"n_estimators": 300, "max_depth": 20}
brnd = SimpleModel(RandomForestClassifier, rf_resampling, rf_params)


In [None]:
# Fit the random-forest model, then show (random forest, XGBoost) held-out losses.
brnd.fit(X_train, y_train)
rf_test_loss = get_metric(y_test, brnd.predict(X_test))
xgb_test_loss = get_metric(y_test, md.predict(X_test))
rf_test_loss, xgb_test_loss

# Evaluating using Voting Classifier

In [None]:
# Soft-voting ensemble of both models; the random forest gets twice the weight.
ensemble_members = [('xgb', md), ('rnd', brnd)]
vt = VotingClassifier(ensemble_members, voting="soft", weights=[1, 2])
vt.fit(X_train, y_train)
get_metric(y_test, vt.predict_proba(X_test))

In [None]:
# Refit the ensemble on every labelled row before predicting the test file.
all_features = df.drop(columns=["Class"])
all_labels = df["Class"]
vt.fit(all_features, all_labels)

In [None]:
# Score the test file and write submission.csv, indexed by the test "Id" column.
df_pred = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
# Column 1 of the ensemble's predict_proba output.
y_p = pd.DataFrame(vt.predict_proba(df_pred))[1]
submission = pd.DataFrame(
    {"class_0": (1 - y_p).values, "class_1": y_p.values},
    index=df_pred["Id"],
)
submission.to_csv("submission.csv")

In [None]:
# sns.barplot(x=md.features_used,y=vt.model.feature_importances_)
# pd.DataFrame({'x':md.features_used,'y':md.rnd.feature_importances_}).sort_values("y",ascending=False).iloc[0:8,:].x.values