In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [16]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import optuna

# Get data

In [17]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

X = train_data.copy()
X.drop(['id', 'target'], axis=1, inplace=True)

y = train_data['target']

no_id_test_data = test_data.drop('id', axis=1)

display(X.head())
display(y.head())

Unnamed: 0,gravity,ph,osmo,cond,urea,calc
0,1.013,6.19,443,14.8,124,1.45
1,1.025,5.4,703,23.6,394,4.18
2,1.009,6.13,371,24.5,159,9.04
3,1.021,4.91,442,20.8,398,6.63
4,1.021,5.53,874,17.8,385,2.21


0    0
1    0
2    0
3    1
4    1
Name: target, dtype: int64

# Split Data

In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

# Missing Data

In [19]:
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

print(f'There are {len(cols_with_missing)} columns with missing values')

for i in cols_with_missing:
    print(i)

There are 0 columns with missing values


# Feature Engineering

In [20]:
display(X_train.columns)

X_train.drop('ph', axis=1, inplace=True)
no_id_test_data.drop(['ph'], axis=1, inplace=True)

display(X_train.columns)

print(X_train.count());

Index(['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc'], dtype='object')

Index(['gravity', 'osmo', 'cond', 'urea', 'calc'], dtype='object')

gravity    414
osmo       414
cond       414
urea       414
calc       414
dtype: int64


# Remove Outliers

In [21]:
# Credit: KLYUSHNIK-ALEXSANDR

def outlier_removal(data , i):
    q1 = np.quantile(data[i] ,0.25)
    q3 = np.quantile(data[i] , 0.75)
    iqr = q3-q1
    lower_tail = q1 - 1.5*iqr
    upper_tail = q3 + 1.5*iqr
    
    
    for j in range(0, data[i].count()):
        if(data[i][j]>upper_tail or data[i][j]<lower_tail):
            data[i][j]=np.nan
            
    return data[i]

# Aggregate Features

In [22]:
def aggregate_features(data):
    
    outlier_list = ['gravity', 'osmo', 'cond', 'urea', 'calc']
    for i in outlier_list:
        data[i]=outlier_removal(data ,i)
    
    my_imputer = SimpleImputer()
    data = pd.DataFrame(my_imputer.fit_transform(data))
    return data

# Preprocess Data

In [23]:
def preprocess_data(data):
    return aggregate_features(data)


'''preprocess data here'''
X_train = preprocess_data(X_train)
no_id_test_data = preprocess_data(no_id_test_data)

cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

print(f'There are {len(cols_with_missing)} columns with missing values')

for i in cols_with_missing:
    print(i)
    
    
print(X_train.count());



There are 0 columns with missing values
0    414
1    414
2    414
3    414
4    414
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i][j]=np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i][j]=np.nan


# Score

In [24]:
def get_score(n_estimators_=100, learning_rate_=0.5, max_depth_=6):
    model = XGBRegressor(n_estimators=n_estimators_, max_depth=max_depth_, learning_rate=learning_rate_, 
                         n_jobs=4)
#     model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=
#              False)
#     predictions = model.predict(X_valid)
    scores = -1*cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
    return scores.mean()

# shittiest parameter tuning i've ever seen

In [25]:
# display(X_train)
# display(y_train)

# best_score=100000000
# best_n_estimators=100
# best_learning_rate=0.1
# best_max_depth=6

# for n_estimators in range(100, 401, 100):
#     score = get_score(n_estimators_=n_estimators)
    
#     if score<best_score:
#         best_score=score
#         best_n_estimators=n_estimators
        
# for learning_rate in [0.05, 0.01, 0.025, 0.05, 0.1]:
#     score = get_score(n_estimators_=best_n_estimators, learning_rate_=learning_rate);
    
#     if score<best_score:
#         best_score=score
#         best_learning_rate=learning_rate
        
# for max_depth in [2, 4, 6, 8, 10]:
#     score = get_score(n_estimators_=best_n_estimators, learning_rate_=learning_rate, 
#                       max_depth_=max_depth)
    
#     if score<best_score:
#         best_score=score
#         best_max_depth=max_depth
        
# print(f'best_score={best_score}, best_n_estimators={best_n_estimators}, best_learning_rate={best_learning_rate}, best_max_depth={best_max_depth}')

# Optuna!!!! OMG i love optuna so much

In [26]:
def optuna_study(X_train, y_train):

    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 10, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0)
        max_depth = trial.suggest_int('max_depth', 5, 30)
        
        rf = XGBRegressor(n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        max_depth=max_depth)
        
        score = cross_val_score(rf, X_train, y_train, n_jobs=4, cv=5)
        accuracy = score.mean()
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=200)
    
    return study.best_params

# Study 1 (no ph + remove outlier + aggregate values, 100 trials): 0.1564147933997004
# Study 2 (no preprocessing, 100 trials): 0.12094767907088448

best_params = optuna_study(X_train, y_train)

[32m[I 2023-04-15 08:52:17,798][0m A new study created in memory with name: no-name-755e5739-b116-4978-9918-a3d86b87edbd[0m
[32m[I 2023-04-15 08:52:18,178][0m Trial 0 finished with value: -0.10963875178313684 and parameters: {'n_estimators': 632, 'learning_rate': 0.36805845266711473, 'max_depth': 13}. Best is trial 0 with value: -0.10963875178313684.[0m
[32m[I 2023-04-15 08:52:18,578][0m Trial 1 finished with value: -0.10730284637245853 and parameters: {'n_estimators': 702, 'learning_rate': 0.317562515485912, 'max_depth': 26}. Best is trial 1 with value: -0.10730284637245853.[0m
[32m[I 2023-04-15 08:52:19,120][0m Trial 2 finished with value: -0.09563575976093312 and parameters: {'n_estimators': 992, 'learning_rate': 0.20313876121388066, 'max_depth': 23}. Best is trial 2 with value: -0.09563575976093312.[0m
[32m[I 2023-04-15 08:52:19,594][0m Trial 3 finished with value: -0.10771987875769129 and parameters: {'n_estimators': 573, 'learning_rate': 0.3555187351047406, 'max_dep

# GridSearchCV

In [27]:
display(X_train.columns)

# model = XGBRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth, 
#                      learning_rate=best_learning_rate, n_jobs=4)

def grid_search_cv(X_train, y_train):

    model = XGBRegressor(random_state=69)
    to_search = {'n_estimators': [100, 500, 1000],
                'max_depth': [2, 6, 10], 
                'learning_rate': [0.01, 0.05, 0.1]}

    GS = GridSearchCV(estimator=model,
                            param_grid=to_search,
                            scoring=["r2", "neg_root_mean_squared_error"],
                            refit="r2",
                            cv=5,
                            verbose=False)

    GS.fit(X_train, y_train)

    return GS.best_params_

# best_params = grid_search_cv(X, y)

# display(best_params)

RangeIndex(start=0, stop=5, step=1)

# Model Accuracy

In [None]:
def model_accuracy(model, X_valid, y_valid):
    score = cross_val_score()

# Submission

In [28]:
final_model = XGBRegressor(
    n_estimators = best_params['n_estimators'], 
    max_depth = best_params['max_depth'], 
    learning_rate = best_params['learning_rate'],
    n_jobs=4)

final_model.fit(X, y)
predictions = final_model.predict(no_id_test_data)

submission = pd.DataFrame({'id': test_data.id, 'target': predictions})

display(submission.head())

submission.to_csv('submission3.csv', index=False)

print('Successful')

Unnamed: 0,id,target
0,414,0.199863
1,415,0.560994
2,416,0.55738
3,417,0.504529
4,418,0.418669


Successful
