In [55]:
# score: 0.82666


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [56]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import optuna

# Get data

In [57]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

X = train_data.copy()
X.drop(['id', 'target'], axis=1, inplace=True)

y = train_data['target']

no_id_test_data = test_data.drop('id', axis=1)

display(X.head())
display(y.head())

Unnamed: 0,gravity,ph,osmo,cond,urea,calc
0,1.013,6.19,443,14.8,124,1.45
1,1.025,5.4,703,23.6,394,4.18
2,1.009,6.13,371,24.5,159,9.04
3,1.021,4.91,442,20.8,398,6.63
4,1.021,5.53,874,17.8,385,2.21


0    0
1    0
2    0
3    1
4    1
Name: target, dtype: int64

# Split Data

In [58]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

# Missing Data

In [59]:
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

print(f'There are {len(cols_with_missing)} columns with missing values')

for i in cols_with_missing:
    print(i)

There are 0 columns with missing values


# Feature Engineering

In [60]:
display(X.columns)

def prepXy(df,X_label,y_label=None):
    selected_feat = []
    data = df.copy()
    
    # Feature Engineering
    
    
    # Ion product of calcium and urea
    data["ion_product"] = data["calc"] * data["urea"]

    # Calcium-to-urea ratio
    data["calcium_to_urea_ratio"] = data["calc"] / data["urea"]

    # Electrolyte balance
    data["electrolyte_balance"] = data["cond"] / (10 ** (-data["ph"]))

    # Osmolality-to-specific gravity ratio
    data["osmolality_to_sg_ratio"] = data["osmo"] / data["gravity"]

    # The product of osmolarity and density is created as a new property
    data['osmo_density'] = data['osmo'] * data['gravity']
    
    # ******************************************************************
    
    # Calculate ammonium concentration (assuming ammonium is not directly measured)
    data['ammonium_concentration'] = (data['gravity'] - 1.010) * (140 - (2 * data["ph"])) * 1.2

    # Calculate phosphate concentration (assuming phosphate is not directly measured)
    data['phosphate_concentration'] = (data['gravity'] - 1.010) * (32 - (0.06 * data["ph"]))

    # *****************************************************************
    
    data["gravity/ph"] = data["gravity"] / data["ph"]
    data["osmo/cond"] = data["osmo"] / data["cond"]
    data["gravity*ph"] = data["gravity"] * data["ph"]
    data["osmo*urea"] = data["osmo"] * data["urea"]
    data["cond_urea_ph"] = data["cond"] * data["urea"] / data["ph"]
    data["ph*osmo"] = data["ph"] * data["osmo"]
    data["cond*calc"] = data["cond"] * data["calc"]
    data["gravity/calc"] = data["gravity"] / data["calc"]

    # Split into features and target
    X_ret = data[X_label].values
    
    if y is None:
        y_ret = np.zeros(data.shape[0])
    else:
        y_ret = np.ravel(y)

    display(data.columns)

    # Scaling dataset
    # scaler = MinMaxScaler()
    scaler = StandardScaler()
    
    X_ret = scaler.fit_transform(X_ret)

    return data,X_ret,y_ret

def feature_engineering(X, y=None):
    # data.drop('ph', axis=1, inplace=True)
    # display(data.columns)
    
    new_X, X_ret, y_ret = prepXy(X, y);
    
    return new_X, y_ret



print(X.count());

Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc'], dtype='object')

gravity    414
ph         414
osmo       414
cond       414
urea       414
calc       414
dtype: int64


# Remove Outliers

In [61]:
# Credit: KLYUSHNIK-ALEXSANDR

def outlier_removal(data , i):
    q1 = np.quantile(data[i] ,0.25)
    q3 = np.quantile(data[i] , 0.75)
    iqr = q3-q1
    lower_tail = q1 - 1.5*iqr
    upper_tail = q3 + 1.5*iqr
    
    
    for j in range(0, data[i].count()):
        if(data[i][j]>upper_tail or data[i][j]<lower_tail):
            data[i][j]=np.nan
            
    return data[i]

# Aggregate Features

In [62]:
def aggregate_features(X):
    
    outlier_list = ['gravity', 'osmo', 'cond', 'urea', 'calc']
    for i in outlier_list:
        X[i]=outlier_removal(X ,i)
    
    my_imputer = SimpleImputer()
    X = pd.DataFrame(my_imputer.fit_transform(X))
    return X

# Preprocess Data

In [63]:
def preprocess_data(X, y=None):
    
    # X = aggregate_features(X);
    
    print(X.columns)
    return prepXy(X, X.columns, y)


'''preprocess data here'''
new_df_train, X, y = preprocess_data(X, y)
new_df_test, no_id_test_data, _ = preprocess_data(no_id_test_data)

display(X)

# cols_with_missing = [col for col in X.columns if X[col].isnull().any()]

# print(f'There are {len(cols_with_missing)} columns with missing values')

# for i in cols_with_missing:
#     print(i)
    
    
# print(X.count());



Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc'], dtype='object')


Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc', 'ion_product',
       'calcium_to_urea_ratio', 'electrolyte_balance',
       'osmolality_to_sg_ratio', 'osmo_density', 'ammonium_concentration',
       'phosphate_concentration', 'gravity/ph', 'osmo/cond', 'gravity*ph',
       'osmo*urea', 'cond_urea_ph', 'ph*osmo', 'cond*calc', 'gravity/calc'],
      dtype='object')

Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc'], dtype='object')


Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc', 'ion_product',
       'calcium_to_urea_ratio', 'electrolyte_balance',
       'osmolality_to_sg_ratio', 'osmo_density', 'ammonium_concentration',
       'phosphate_concentration', 'gravity/ph', 'osmo/cond', 'gravity*ph',
       'osmo*urea', 'cond_urea_ph', 'ph*osmo', 'cond*calc', 'gravity/calc'],
      dtype='object')

array([[-0.73403157,  0.36562268, -0.8897276 , -0.88438794, -1.13486936,
        -0.82913603],
       [ 1.06590369, -0.86589692,  0.21952069,  0.28805926,  0.84638411,
         0.02033832],
       [-1.33400999,  0.27208954, -1.19690404,  0.40796863, -0.8780402 ,
         1.53258937],
       ...,
       [ 0.01594146,  0.50592238, -0.83853152,  0.10153357, -0.06352489,
         1.109408  ],
       [-1.48400459,  1.81538625, -1.39315566, -1.17749974, -1.49443017,
        -0.9598244 ],
       [-1.03402078,  0.27208954, -1.22676842, -1.53722786, -0.8780402 ,
        -1.1963081 ]])

# Hyperparameter Tuning

In [64]:
def optuna_study(X, y):

    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 10, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0)
        max_depth = trial.suggest_int('max_depth', 1, 30)
        
        rf = XGBRegressor(n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        max_depth=max_depth)
        
        # # score = StratifiedKFold(rf, X, y)
        
        # kfold = RepeatedStratifiedKFold(n_splits=40, n_repeats=6, random_state=69);
        
        # scores = []
        
        # for train_idx, test_idx in kfold.split(X, y):
        #     X_train, X_test = X[train_idx], X[test_idx]
        #     y_train, y_test = y[train_idx], y[test_idx]
        #     rf.fit(X_train, y_train)
        #     y_pred = rf.predict(X_test)
        #     scores.append(roc_auc_score(y_test, y_pred))
        
        # return np.mean(scores)

        score = cross_val_score(rf, X, y, n_jobs=4, cv=5)
        return score.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=200)
    
    return study.best_params

# Study 1 (no ph + remove outlier + aggregate values, 100 trials): 0.1564147933997004
# Study 2 (no preprocessing, 100 trials): 0.12094767907088448
# Study 3 (preprocessing kudos to MOHAMMAD RAZEGHI, 200 trials): 0.18433600629551264
# Study 4 (same as 3 but max_depth allowed to be 1, 
# {'n_estimators': 31, 'learning_rate': 0.15241724886836167, 'max_depth': 1}): 0.22565032154969472


best_params = optuna_study(X, y)

display(best_params)

[32m[I 2023-04-15 16:56:47,231][0m A new study created in memory with name: no-name-7ddd4b39-2468-40b4-8aa5-c7b0aadeccbc[0m
[32m[I 2023-04-15 16:56:47,610][0m Trial 0 finished with value: -0.004504193719279348 and parameters: {'n_estimators': 509, 'learning_rate': 0.26002292048122694, 'max_depth': 24}. Best is trial 0 with value: -0.004504193719279348.[0m
[32m[I 2023-04-15 16:56:47,864][0m Trial 1 finished with value: -0.09039170856031184 and parameters: {'n_estimators': 423, 'learning_rate': 0.7836567555675614, 'max_depth': 19}. Best is trial 0 with value: -0.004504193719279348.[0m
[32m[I 2023-04-15 16:56:48,227][0m Trial 2 finished with value: -0.24148920382539005 and parameters: {'n_estimators': 758, 'learning_rate': 0.779598156255552, 'max_depth': 4}. Best is trial 0 with value: -0.004504193719279348.[0m
[32m[I 2023-04-15 16:56:48,575][0m Trial 3 finished with value: -0.21945829092812952 and parameters: {'n_estimators': 942, 'learning_rate': 0.9682080277387198, 'max_d

{'n_estimators': 12, 'learning_rate': 0.3309328967885372, 'max_depth': 1}

# Submission

In [65]:
final_model = XGBRegressor(
    n_estimators = best_params['n_estimators'], 
    max_depth = best_params['max_depth'], 
    learning_rate = best_params['learning_rate'],
    n_jobs=4)

final_model.fit(X, y)
predictions = final_model.predict(no_id_test_data)

submission = pd.DataFrame({'id': test_data.id, 'target': predictions})

display(submission.head())

submission.to_csv(f'submission6.csv', index=False)

print('Successful')

Unnamed: 0,id,target
0,414,0.182349
1,415,0.447463
2,416,0.793484
3,417,0.447463
4,418,0.259589


Successful
