In [1]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7
[0m

In [2]:
import optuna
from optuna.samplers import TPESampler

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.pipeline import Pipeline

# Models
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, RepeatedMultilabelStratifiedKFold
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier

In [3]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
directory = '/kaggle/input'
file_names = []  # stems of the CSV files loaded below, in load order

# Load every CSV found under `directory` into a notebook-level DataFrame named
# after the file stem (e.g. ".../train.csv" -> `train`). Non-CSV files are
# skipped because pd.read_csv would fail on them or parse them into garbage.
for dirpath, dirnames, filenames in os.walk(directory):
    dirnames.sort()  # in-place sort makes os.walk's traversal order deterministic
    for filename in sorted(filenames):
        file_name, extension = os.path.splitext(filename)
        if extension.lower() != '.csv':
            continue
        globals()[file_name] = pd.read_csv(os.path.join(dirpath, filename))
        file_names.append(file_name)

# Fail with an explicit message instead of a bare NameError further down.
_missing = [name for name in ('train', 'test', 'mixed_desc') if name not in globals()]
if _missing:
    raise FileNotFoundError(
        f"Expected CSV file(s) for {_missing} under {directory!r}; loaded: {file_names}"
    )

# Drop identifier columns that are not used downstream.
train = train.drop(columns=["id"])
test = test.drop(columns=["id"])
mixed_desc = mixed_desc.drop(columns=["CIDs"])

# `mixed_desc` stores the six EC labels as one underscore-joined string column;
# split it into one integer column per label (EC1 ... EC6).
col = "EC1_EC2_EC3_EC4_EC5_EC6"
ec_cols = col.split("_")
mixed_desc[ec_cols] = mixed_desc[col].str.split('_', expand=True).astype(int)
mixed_desc = mixed_desc.drop(columns=col)

# Align the extra rows to the training schema and append them to `train`.
original = mixed_desc[train.columns]
train = pd.concat([train, original]).reset_index(drop=True)

# Keep only the first two labels (EC1, EC2); drop EC3 ... EC6.
train = train.drop(columns=ec_cols[2:])




In [4]:
# Preview the first rows of the training frame built in the previous cell.
train.head()

Unnamed: 0,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,EState_VSA2,...,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2
0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.0,11.938294,...,0.0,0.0,17.744066,0.0,4.794537,35.527357,0,0,1,1
1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,0.0,...,0.0,0.0,7.822697,30.705892,13.825658,44.70731,0,0,0,1
2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,6.606882,...,53.378235,0.0,15.645394,73.143616,17.964475,45.66012,0,0,1,1
3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,0.0,...,0.0,6.420822,15.645394,62.107304,31.961948,87.509997,0,0,1,1
4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.03645,0.727664,17.980451,12.841643,...,19.3864,0.0,11.938611,18.883484,9.589074,33.333333,2,2,1,0


In [5]:
def generate_features(train, test, cat_cols, num_cols):
    """Add value-count and group-mean features computed over train and test together.

    The two frames are stacked (train rows first) so that every statistic is
    computed on the combined data, then split back at ``len(train)``.

    Generated columns, in order:
      * ``count_<c>`` for every ``c`` in ``cat_cols + num_cols``: number of
        stacked rows sharing the row's value of ``c``.
      * ``mean_<n>_per_<c>`` for every ``c`` in ``cat_cols`` and ``n`` in
        ``num_cols``: mean of ``n`` within the row's ``c`` group.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column in ``cat_cols`` and ``num_cols``.
        Neither input is modified.
    cat_cols, num_cols : sequence of str
        Columns used as group keys / aggregated values respectively.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Independent copies of the augmented train and test frames, each keeping
        its original row index.
    """
    cat_cols, num_cols = list(cat_cols), list(num_cols)
    n_train = len(train)
    combined = pd.concat([train, test], axis=0)

    # Build all new columns first and attach them in a single concat: inserting
    # hundreds of columns one at a time fragments the frame and triggers
    # pandas' PerformanceWarning.
    generated = {}
    for key in cat_cols + num_cols:
        counts = combined.groupby(key)[key].transform('count')
        generated[f'count_{key}'] = counts.to_numpy()

    for key in cat_cols:
        grouped = combined.groupby(key)
        for value_col in num_cols:
            means = grouped[value_col].transform('mean')
            generated[f'mean_{value_col}_per_{key}'] = means.to_numpy()

    if generated:
        # Replace (rather than duplicate) columns left over from an earlier
        # call, so re-running the function on its own output stays stable.
        stale = [name for name in generated if name in combined.columns]
        new_block = pd.DataFrame(generated, index=combined.index)
        combined = pd.concat([combined.drop(columns=stale), new_block], axis=1)

    return combined.iloc[:n_train].copy(), combined.iloc[n_train:].copy()

In [6]:
# Columns to predict, and columns that must never be fed to the models.
target_cols = ['EC1', 'EC2']
cols_to_drop = ['id']

# Everything in `train` that is neither a target nor an excluded column is a feature
# (column order of `train` is preserved).
features = [name for name in train.columns
            if not (name in target_cols or name in cols_to_drop)]

# Features used as group-by keys when building the aggregate features.
cat_cols = [
    'EState_VSA2', 'HallKierAlpha', 'NumHeteroatoms',
    'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8',
    'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3',
    'fr_COO', 'fr_COO2',
]

# The remaining features are the ones aggregated within each `cat_cols` group.
num_cols = [name for name in features if name not in cat_cols]

In [7]:
# Separate model inputs from targets; the test frame only provides inputs.
X_train, Y_train = train[features], train[target_cols]
X_test = test[features]

In [8]:
# Augment both frames with the count / group-mean features; the statistics are
# computed over train and test stacked together (see generate_features).
# NOTE(review): this rebinds X_train / X_test to the augmented frames.
X_train, X_test = generate_features(X_train, X_test, cat_cols, num_cols)

In [9]:
y  = Y_train
X  = X_train
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
#from sklearn.model_selection import RepeatedMultilabelStratifiedKFold
import numpy as np

# XGBoost classifier parameters
xgb_params = {'n_estimators': 100,
              'tree_method': 'hist',
              'max_depth': 4,
              'min_child_weight': 1,
              'gamma': 2.5705812096617772e-05,
              'learning_rate': 0.07132617944894756,
              'colsample_bytree': 0.11664298814833247,
              'colsample_bynode': 0.9912092923877247,
              'colsample_bylevel': 0.29178614622079735,
              'subsample': 0.7395301853144935,
              'random_state': 42
              }

# LightGBM classifier parameters
lgbm_params = {'n_estimators': 200,
 'boosting_type': 'gbdt',
 'max_depth': 10,
 'min_child_samples': 15,
 'subsample': 0.5182995486972547,
 'learning_rate': 0.027352422199502537,
 'colsample_bytree': 0.2257179878033366,
 'colsample_bynode': 0.7098194984886731,
 'random_state': 84315}

# CatBoost classifier parameters
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 8,
    'l2_leaf_reg': 3,
    'border_count': 32,
    'random_strength': 1,
    'bagging_temperature': 1,
    'random_seed': 42,
    'verbose': 0
}


def _positive_class_proba(model, data):
    """Return an (n_samples, n_targets) array of column-1 probabilities.

    ``predict_proba`` on the multi-output pipelines yields one probability
    array per target; they are stacked, column 1 is kept (presumably the
    positive class -- assumes binary 0/1 targets) and the result is transposed
    so rows are samples and columns are targets.
    """
    return np.array(model.predict_proba(data))[:, :, 1].T


def _pooled_auc(y_true, y_score):
    """ROC AUC over all (sample, target) pairs flattened into one vector."""
    return roc_auc_score(np.ravel(y_true), np.ravel(y_score))


# Define the classifiers (one independent binary model per target column)
xgb_classifier = MultiOutputClassifier(XGBClassifier(**xgb_params))
lgbm_classifier = MultiOutputClassifier(LGBMClassifier(**lgbm_params))
catboost_classifier = MultiOutputClassifier(CatBoostClassifier(**catboost_params))

# Create the pipelines
xgb_clf = Pipeline([('classifier', xgb_classifier)])
lgbm_clf = Pipeline([('classifier', lgbm_classifier)])
catboost_clf = Pipeline([('classifier', catboost_classifier)])

# Fit order inside each fold follows this dict: xgb, lgbm, catboost.
models = {'xgb': xgb_clf, 'lgbm': lgbm_clf, 'catboost': catboost_clf}

# Out-of-fold predictions, one (n_train, n_targets) array per model
oof_preds_xgb = np.zeros(y.shape)
oof_preds_lgbm = np.zeros(y.shape)
oof_preds_catboost = np.zeros(y.shape)

# Fold-averaged test predictions, one (n_test, n_targets) array per model
test_preds_xgb = np.zeros((X_test.shape[0], y.shape[1]))
test_preds_lgbm = np.zeros((X_test.shape[0], y.shape[1]))
test_preds_catboost = np.zeros((X_test.shape[0], y.shape[1]))

# Per-fold validation AUC of each individual model
oof_losses_xgb = []
oof_losses_lgbm = []
oof_losses_catboost = []

# Name-keyed views of the accumulators above; the loop updates them in place,
# so the individual variables (used by later cells) stay in sync.
oof_preds = {'xgb': oof_preds_xgb, 'lgbm': oof_preds_lgbm, 'catboost': oof_preds_catboost}
test_preds = {'xgb': test_preds_xgb, 'lgbm': test_preds_lgbm, 'catboost': test_preds_catboost}
oof_losses = {'xgb': oof_losses_xgb, 'lgbm': oof_losses_lgbm, 'catboost': oof_losses_catboost}

n_splits = 5
n_repeats = 1  # NOTE: with n_repeats > 1 the OOF arrays keep only the last repeat
kf = RepeatedMultilabelStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
# Total number of fitted folds; dividing by this makes test_preds_* a true mean.
n_folds = n_splits * n_repeats

# Kept for compatibility; per-model train scores are not recorded.
train_losses_xgb = []
train_losses_lgbm = []
train_losses_catboost = []

# Per-fold AUC of the equal-weight average of the three models
over_train=[]
over_valid=[]

# Loop over folds
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold:', fn)
    # Fold-local names: rebinding X_train here would make a re-run of this cell
    # start from a fold subset (X = X_train at the top) that no longer matches y.
    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    fold_train_preds = []
    fold_val_preds = []
    for name, model in models.items():
        model.fit(X_trn, y_trn)

        fold_train_preds.append(_positive_class_proba(model, X_trn))

        val_pred = _positive_class_proba(model, X_val)
        fold_val_preds.append(val_pred)
        oof_preds[name][val_idx] = val_pred
        oof_losses[name].append(_pooled_auc(y_val, val_pred))

        # In-place accumulation keeps test_preds_<name> bound to the same array.
        test_acc = test_preds[name]
        test_acc += _positive_class_proba(model, X_test) / n_folds

    # Equal-weight average of the three models for this fold
    overall_train_preds = sum(fold_train_preds) / len(fold_train_preds)
    overall_train_loss = _pooled_auc(y_trn, overall_train_preds)
    overall_valid_preds = sum(fold_val_preds) / len(fold_val_preds)
    overall_valid_loss = _pooled_auc(y_val, overall_valid_preds)
    over_train.append(overall_train_loss)
    over_valid.append(overall_valid_loss)
    # The values printed as "loss" are ROC AUC scores (higher is better).
    print("overall_train_loss", overall_train_loss)
    print("overall_valid_loss", overall_valid_loss)

print("over_train", np.mean(over_train))
print("over_valid", np.mean(over_valid))


Starting fold: 0
overall_train_loss 0.9757272683173904
overall_valid_loss 0.6930149502497583
Starting fold: 1
overall_train_loss 0.9760149298054044
overall_valid_loss 0.6922902441255746
Starting fold: 2
overall_train_loss 0.9758875404016103
overall_valid_loss 0.6764456039524097
Starting fold: 3
overall_train_loss 0.9759512327686974
overall_valid_loss 0.6854850398265779
Starting fold: 4
overall_train_loss 0.9755510145107673
overall_valid_loss 0.6872175255302643
over_train 0.9758263971607739
over_valid 0.686890672736917


In [10]:
# Weighted blend of the three models' fold-averaged test predictions
# (0.5 XGBoost, 0.3 LightGBM, 0.2 CatBoost), written into every column of
# sample_submission after the first one.
# NOTE(review): the weights are hard-coded here; their origin is not shown.
sample_submission.iloc[:,1:] = test_preds_xgb*0.5 + test_preds_lgbm*0.3 + test_preds_catboost*0.2

In [11]:
# Write the blended predictions to submission.csv without the DataFrame index.
sample_submission.to_csv("submission.csv",index=False)



In [12]:
from IPython.display import FileLink

# Display a download link for the submission.csv file written above
# (the path is relative to the notebook's working directory).
FileLink('submission.csv')
