In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_path in (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e18/sample_submission.csv
/kaggle/input/playground-series-s3e18/train.csv
/kaggle/input/playground-series-s3e18/test.csv


In [15]:
from scipy.stats import pearsonr

# Load the competition training and test tables.
train = pd.read_csv('/kaggle/input/playground-series-s3e18/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e18/test.csv')

# Pearson correlation between the EC1 and EC2 labels (the p-value is not used).
corr, _ = pearsonr(train['EC1'], train['EC2'])
# Show the computed value so the independence assumption below can be checked
# against the actual data instead of a number typed into a comment.
print('Pearson correlation between EC1 and EC2: {:.3f}'.format(corr))

# The author observed a low correlation (-0.146), so the two labels are treated
# as independent and modelled separately.
# Features: everything except the row id and the six EC label columns.
X = train.drop(columns=['id', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6'])
y1 = train['EC1']
y2 = train['EC2']

test.head()


Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,PEOE_VSA14,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2
0,14838,344.632371,7.283603,4.473966,5.834958,3.412257,4.65153,2.096558,1.116433,49.458581,...,13.512441,0.0,0.0,0.0,26.809272,24.5398,4.794537,47.304082,1,1
1,14839,1432.410201,10.663869,7.079026,8.065215,5.297097,5.297097,3.924155,2.569694,0.0,...,0.0,34.947374,98.323987,9.606882,0.0,53.378235,0.0,43.166667,0,0
2,14840,83.352608,3.931852,1.774215,1.774215,1.073446,1.073446,0.46783,0.170838,5.969305,...,5.969305,0.0,0.0,6.420822,11.75255,13.344559,9.589074,24.666667,1,1
3,14841,150.255712,5.91279,3.548812,3.548812,2.595128,2.595128,1.642813,0.694113,0.0,...,59.935299,0.0,0.0,0.0,17.744066,32.290168,4.794537,26.778866,0,0
4,14842,1817.276351,24.91094,15.540529,20.047314,12.535886,17.730988,11.979618,4.431173,84.554972,...,23.468091,25.609359,0.0,37.099,69.141353,38.70413,50.697492,102.583333,0,0


In [8]:
from sklearn.model_selection import train_test_split

# Split into training and validation set.
# A fixed random_state makes the split identical on every run; without it each
# kernel restart produces a different validation set, so tuning results
# obtained on one split cannot be reproduced on the next.
SPLIT_SEED = 42
X1_train, X1_val, y1_train, y1_val = train_test_split(X, y1, random_state=SPLIT_SEED)
X2_train, X2_val, y2_train, y2_val = train_test_split(X, y2, random_state=SPLIT_SEED)

In [9]:
!pip install optuna

[0m

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Flag for optimisation loop: the `if flag:` blocks below only create and run
# their Optuna studies (20 trials each) when this is True. It is left False
# here, so the hyperparameter searches are skipped.
flag = False

# Use optuna to find optimal no. of features to extract, and best XGBoost hyperparams
import optuna

# For EC1
def objective(trial):
    """Optuna objective for EC1: validation ROC AUC of a feature-selected XGBoost model.

    Samples the number of features to keep and the XGBoost hyperparameters from
    ``trial``, fits ``SelectKBest(f_classif)`` and an ``XGBClassifier`` on the
    EC1 training split (``X1_train`` / ``y1_train``), and scores the model on
    the EC1 validation split (``X1_val`` / ``y1_val``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float or int
        ROC AUC on the validation split, or 0 if ``roc_auc_score`` raises
        ``ValueError`` (for example when the validation labels contain a
        single class).
    """
    num_features = trial.suggest_int('num_features', 10, 31)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 15)
    learning_rate = trial.suggest_float('learning_rate', 0.0001, 0.01, log=True)
    gamma = trial.suggest_float('gamma', 0, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1)

    # Select top num_features most relevant features (ANOVA F-value ranking).
    # The selector is fitted on the training split only and reused on validation.
    fvalue_selector1 = SelectKBest(f_classif, k=num_features)
    X_kbest1 = pd.DataFrame(fvalue_selector1.fit_transform(X1_train, y1_train))

    # `eta` is XGBoost's parameter name for the learning rate.
    XGB = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, eta=learning_rate, objective='binary:logistic', gamma=gamma, subsample=subsample)
    XGB.fit(X_kbest1, y1_train)

    # Transform validation set with the already-fitted selector.
    X_kbestval = pd.DataFrame(fvalue_selector1.transform(X1_val))

    # ROC AUC ranks continuous scores, so use the positive-class probability
    # rather than hard 0/1 labels.
    val_scores = XGB.predict_proba(X_kbestval)[:, 1]

    try:
        # roc_auc_score expects (y_true, y_score) in that order.
        return roc_auc_score(y1_val, val_scores)
    except ValueError:
        # Keep the study running if AUC is undefined for this split.
        return 0


if flag:
    # Seed the sampler so the search proposes the same sequence of trials on
    # every run (given the same data split); an unseeded sampler makes the
    # reported best parameters impossible to regenerate.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20)
    trial1 = study.best_trial

    # Report the best validation score and the parameters that produced it.
    print('EC1 Best auc: {}'.format(trial1.value))
    print("Best hyperparameters: {}".format(trial1.params))


In [11]:
# For EC2
def objective(trial):
    """Optuna objective for EC2: validation ROC AUC of a feature-selected XGBoost model.

    Samples the number of features to keep and the XGBoost hyperparameters from
    ``trial``, fits ``SelectKBest(f_classif)`` and an ``XGBClassifier`` on the
    EC2 training split (``X2_train`` / ``y2_train``), and scores the model on
    the EC2 validation split (``X2_val`` / ``y2_val``).

    Note: this definition reuses the name ``objective``, so once this cell has
    run it replaces any earlier function of the same name.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float or int
        ROC AUC on the validation split, or 0 if ``roc_auc_score`` raises
        ``ValueError`` (for example when the validation labels contain a
        single class).
    """
    num_features = trial.suggest_int('num_features', 10, 15)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.0001, 0.01, log=True)
    gamma = trial.suggest_float('gamma', 0, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1)

    # Select top num_features most relevant features (ANOVA F-value ranking).
    # The selector is fitted on the training split only and reused on validation.
    fvalue_selector2 = SelectKBest(f_classif, k=num_features)
    X_kbest2 = pd.DataFrame(fvalue_selector2.fit_transform(X2_train, y2_train))

    # `eta` is XGBoost's parameter name for the learning rate.
    XGB = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, eta=learning_rate, objective='binary:logistic', gamma=gamma, subsample=subsample)
    XGB.fit(X_kbest2, y2_train)

    # Transform validation set with the already-fitted selector.
    X_kbestval = pd.DataFrame(fvalue_selector2.transform(X2_val))

    # ROC AUC ranks continuous scores, so use the positive-class probability
    # rather than hard 0/1 labels.
    val_scores = XGB.predict_proba(X_kbestval)[:, 1]

    try:
        # roc_auc_score expects (y_true, y_score) in that order.
        return roc_auc_score(y2_val, val_scores)
    except ValueError:
        # Keep the study running if AUC is undefined for this split.
        return 0

if flag:
    # Seed the sampler so the search proposes the same sequence of trials on
    # every run (given the same data split); an unseeded sampler makes the
    # reported best parameters impossible to regenerate.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20)
    trial2 = study.best_trial

    # Report the best validation score and the parameters that produced it.
    print('EC2 Best auc: {}'.format(trial2.value))
    print("Best hyperparameters: {}".format(trial2.params))

In [12]:

# EC1: tuned settings; per the original note, all 31 available features are used.
num_features_EC1 = 31
ec1_params = dict(
    n_estimators=309,
    max_depth=3,
    eta=0.007895285421241401,
    objective='binary:logistic',
    gamma=2.0683433342683584,
    subsample=0.9913940145908703,
)
XGB1 = XGBClassifier(**ec1_params)

# EC2: tuned settings; only the 15 highest-ranked features are kept.
num_features_EC2 = 15
ec2_params = dict(
    n_estimators=991,
    max_depth=8,
    eta=0.002164036382215584,
    objective='binary:logistic',
    gamma=2.2947040482736365,
    subsample=0.9010902054341825,
)
XGB2 = XGBClassifier(**ec2_params)

In [16]:
# Test features: same table as `test` minus the row id.
X_test = test.drop(['id'], axis = 1)

# Final model and predictions.
# EC1 is fit on every feature of the full training set.
XGB1.fit(X, y1)
# Submit the positive-class probability rather than hard 0/1 labels: the
# notebook tunes on ROC AUC, which ranks scores, and thresholded labels throw
# that ranking information away.
# NOTE(review): confirm the competition also scores submissions by ROC AUC.
pred_EC1 = XGB1.predict_proba(X_test)[:, 1]

# EC2 keeps only the top num_features_EC2 features by ANOVA F-value; the
# selector is fitted on the training data and reused unchanged on the test set.
fvalue_selector = SelectKBest(f_classif, k=num_features_EC2)
X_kbest = pd.DataFrame(fvalue_selector.fit_transform(X, y2))
XGB2.fit(X_kbest, y2)

X_kbest_test = pd.DataFrame(fvalue_selector.transform(X_test))
pred_EC2 = XGB2.predict_proba(X_kbest_test)[:, 1]

# One row per test id with the predicted score for each label.
output = pd.DataFrame({'id': test.id, 'EC1': pred_EC1, 'EC2': pred_EC2})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
