In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [5]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

# Get data

In [6]:
# Load the training and test tables from the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Label column, and the feature matrix: every training column except the id and the label.
y = train_data['target']
X = train_data.drop(columns=['id', 'target'])

# Test features: the test table without its id column.
no_id_test_data = test_data.drop(columns='id')

# Preview the first rows of the features and of the label.
display(X.head(), y.head())

Unnamed: 0,gravity,ph,osmo,cond,urea,calc
0,1.013,6.19,443,14.8,124,1.45
1,1.025,5.4,703,23.6,394,4.18
2,1.009,6.13,371,24.5,159,9.04
3,1.021,4.91,442,20.8,398,6.63
4,1.021,5.53,874,17.8,385,2.21


0    0
1    0
2    0
3    1
4    1
Name: target, dtype: int64

# Split Data

In [7]:
# Hold out a validation set. The fixed seed makes the split repeatable across kernel
# restarts (the same seed is used for the model in the grid-search cell), and stratifying
# on y keeps the proportion of each target value the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=69, stratify=y)

# Missing Data

In [8]:
# Flag each training column that contains at least one null, then keep the flagged names.
column_has_null = X_train.isnull().any()
cols_with_missing = column_has_null[column_has_null].index.tolist()

print(f'There are {len(cols_with_missing)} columns with missing values')

# List the affected columns one per line (prints nothing when there are none).
for column_name in cols_with_missing:
    print(column_name)

There are 0 columns with missing values


# Score

In [9]:
def get_score(n_estimators_=100, learning_rate_=0.5, max_depth_=6):
    """Return the mean 5-fold cross-validated MAE of an XGBRegressor.

    The regressor is built from the given number of trees, learning rate and
    maximum tree depth, and is scored on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    n_estimators_ : int
        Passed to XGBRegressor as ``n_estimators``.
    learning_rate_ : float
        Passed to XGBRegressor as ``learning_rate``.
    max_depth_ : int
        Passed to XGBRegressor as ``max_depth``.

    Returns
    -------
    float
        Mean absolute error averaged over the five folds (lower is better).
    """
    regressor = XGBRegressor(
        n_estimators=n_estimators_,
        max_depth=max_depth_,
        learning_rate=learning_rate_,
        n_jobs=4,
    )
    # Cross-validation is used here rather than a single hold-out fit with early stopping.
    # The scorer yields negated MAE per fold; flip the sign to get plain MAE.
    fold_neg_mae = cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_absolute_error")
    fold_mae = -1 * fold_neg_mae
    return fold_mae.mean()

# Parameter Tuning

In [10]:
# display(X_train)
# display(y_train)

# best_score=100000000
# best_n_estimators=100
# best_learning_rate=0.1
# best_max_depth=6

# for n_estimators in range(100, 401, 100):
#     score = get_score(n_estimators_=n_estimators)
    
#     if score<best_score:
#         best_score=score
#         best_n_estimators=n_estimators
        
# for learning_rate in [0.05, 0.01, 0.025, 0.05, 0.1]:
#     score = get_score(n_estimators_=best_n_estimators, learning_rate_=learning_rate);
    
#     if score<best_score:
#         best_score=score
#         best_learning_rate=learning_rate
        
# for max_depth in [2, 4, 6, 8, 10]:
#     score = get_score(n_estimators_=best_n_estimators, learning_rate_=best_learning_rate,
#                       max_depth_=max_depth)
    
#     if score<best_score:
#         best_score=score
#         best_max_depth=max_depth
        
# print(f'best_score={best_score}, best_n_estimators={best_n_estimators}, best_learning_rate={best_learning_rate}, best_max_depth={best_max_depth}')

# GridSearchCV

In [11]:
# Exhaustive grid search over tree count, tree depth and learning rate, scored with
# 5-fold cross-validation on the full training data.
model = XGBRegressor(random_state=69)
to_search = {'n_estimators': [100, 500, 1000],
             'max_depth': [2, 6, 10],
             'learning_rate': [0.01, 0.05, 0.1]}

# The winning combination is chosen on ROC AUC, the same metric the submission cell
# reports for the final model, so tuning and evaluation optimise the same objective.
# r2 and RMSE are still recorded in GS.cv_results_ for inspection.
GS = GridSearchCV(estimator=model,
                  param_grid=to_search,
                  scoring=["roc_auc", "r2", "neg_root_mean_squared_error"],
                  refit="roc_auc",
                  cv=5,
                  verbose=0)

GS.fit(X, y)

# Hyperparameters of the best-scoring candidate; consumed by the submission cell.
best_params = GS.best_params_

display(best_params)

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 100}

# Submission

In [13]:
# Build the final model from every tuned hyperparameter. Unpacking best_params keeps this
# cell in sync with the search grid (a parameter added to the grid is no longer silently
# ignored here), and random_state matches the estimator that was tuned.
# NOTE: best_params must not itself contain random_state or n_jobs, or this call would
# receive the keyword twice.
final_model = XGBRegressor(**best_params, random_state=69, n_jobs=4)

# ROC AUC per cross-validation fold (cross_val_score's default splitter); the mean is reported.
score = cross_val_score(final_model, X, y, scoring='roc_auc')
print(f'best score: {score.mean()}')

# Refit on all training rows before predicting the test set.
final_model.fit(X, y)
predictions = final_model.predict(no_id_test_data)

# Pair each test id with its predicted target value.
submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})

display(submission.head())

submission.to_csv('submission.csv', index=False)

print('Successful')

best score: 0.7841179297067237


Unnamed: 0,id,target
0,414,0.289249
1,415,0.474077
2,416,0.707502
3,417,0.472462
4,418,0.384963


Successful
