In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

from xgboost import XGBRegressor
# import lightgbm as lgb
# from sklearn.ensemble import GradientBoostingRegressor

# Use matplotlib's 'ggplot' style sheet for the figures in this notebook.
plt.style.use('ggplot')

In [None]:
# Load the training set, the test set and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [None]:
# Report the (rows, columns) of both splits.
shape_report = f'Train Shape: {train.shape}\nTest Shape: {test.shape}'
print(shape_report)

In [None]:
# Columns stored with the object dtype are treated as categorical features.
is_object = train.dtypes == 'object'
cat_cols = train.columns[is_object].tolist()
cat_cols

In [None]:
# Every non-object column except the row id and the label is a continuous feature.
non_feature_cols = ('id', 'target')
cont_cols = [
    name
    for name, dtype in train.dtypes.items()
    if dtype != 'object' and name not in non_feature_cols
]
cont_cols

In [None]:
# Summary statistics of the continuous features.
train.loc[:, cont_cols].describe()

In [None]:
# One histogram per continuous feature, before any rescaling.
hist_kwargs = dict(figsize=(20, 12), color='g')
train[cont_cols].hist(**hist_kwargs)
plt.show()

In [None]:
# Pairwise correlation of the continuous features, rendered as a colour-graded table.
corr = train[cont_cols].corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in later releases;
# Styler.format(precision=...) is the supported way to round the displayed values.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

In [None]:
# Min-max scale the continuous features. The scaler is fitted on the training
# data and kept in `trans` so the identical mapping can be applied elsewhere.
trans = MinMaxScaler()
train_scaled = trans.fit_transform(train[cont_cols])
# Write the scaled array back positionally. Wrapping it in a fresh DataFrame
# would make pandas align on index labels, which silently produces NaN as soon
# as `train` no longer carries the default 0..n-1 index.
train[cont_cols] = train_scaled
# summarize
train[cont_cols].describe()


In [None]:
# Histograms of the continuous features as they currently stand in `train`.
hist_kwargs = dict(figsize=(20, 12), color='g')
train[cont_cols].hist(**hist_kwargs)
plt.show()

In [None]:
# Apply the already-fitted scaler `trans` to the test features (transform only,
# no re-fitting). The array is assigned directly so the values are written by
# position rather than aligned on index labels.
test[cont_cols] = trans.transform(test[cont_cols])

In [None]:
# Print the class frequencies of every categorical column, separated by a rule.
separator = '*' * 50
for cat in cat_cols:
    class_counts = train[cat].value_counts()
    print(cat)
    print(class_counts)
    print(separator)

## Discretize some categorical features

Some categorical features are dominated by a single class that has by far the highest frequency.

In [None]:
# Replace each listed categorical column with a 0/1 flag marking its most
# frequent training class; the chosen class per column is kept in `top_class`.
OH_cat = ['cat2', 'cat3', 'cat4', 'cat6', 'cat7']
top_class = {}
for cat in OH_cat:
    # value_counts() is sorted by frequency, so its arg-max is the top class
    top_class[cat] = train[cat].value_counts().idxmax()
    is_top = train[cat] == top_class[cat]
    train[f'{cat}_{top_class[cat]}'] = is_top.astype(int)

# the raw columns are dropped once all flags have been created
train.drop(columns=OH_cat, inplace=True)

train.head()

In [None]:
# Build the same 0/1 flags on the test set, reusing the classes stored in
# `top_class`, then drop the raw columns.
OH_cat = ['cat2', 'cat3', 'cat4', 'cat6', 'cat7']
for cat in OH_cat:
    is_top = test[cat] == top_class[cat]
    test[f'{cat}_{top_class[cat]}'] = is_top.astype(int)

test.drop(columns=OH_cat, inplace=True)

In [None]:
# Recompute both column lists from the current dtypes of `train`.
cat_cols = train.columns[train.dtypes == 'object'].tolist()
non_feature_cols = ('id', 'target')
cont_cols = [
    name
    for name, dtype in train.dtypes.items()
    if dtype != 'object' and name not in non_feature_cols
]

## remove outliers

In [None]:
# Distribution of the target (10 bins is the pandas default).
train['target'].hist(bins=10)

In [None]:
# Drop rows whose target lies more than three standard deviations from the mean.
mean, std = train['target'].mean(), train['target'].std()
cut_off = 3 * std
lower = mean - cut_off
upper = mean + cut_off

# boolean mask of the rows outside [lower, upper]
outside_band = (train['target'] > upper) | (train['target'] < lower)
outliers = train[outside_band]
train.drop(index=outliers.index.to_list(), inplace=True)
train.shape

In [None]:
# Drop rows whose target lies more than 1.5 * IQR below the first quartile or
# above the third quartile of the current training targets.
q25, q75 = np.percentile(train['target'], [25, 75])
iqr = q75 - q25

# calculate the outlier cutoff (this computation used to be duplicated verbatim)
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

outliers = train[(train['target'] > upper) | (train['target'] < lower)]
# dropping outliers
train.drop(outliers.index.to_list(), inplace=True)
train.shape

In [None]:
# Distribution of the target as it currently stands (10 bins is the pandas default).
train['target'].hist(bins=10)

## check missing values

In [None]:
# Per-column NaN counts; only columns with at least one missing value are shown.
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print('Number of missing values in trainset: ', train_missing[train_missing > 0])
print('Number of missing values in testset: ', test_missing[test_missing > 0])

## encode categorical features


In [None]:
# Learn the category -> ordinal code mapping from the training data, then apply
# that same mapping to both splits.
ordinal_encoder = OrdinalEncoder().fit(train[cat_cols])
train[cat_cols] = ordinal_encoder.transform(train[cat_cols])
test[cat_cols] = ordinal_encoder.transform(test[cat_cols])

## train model

In [None]:
# Hyper-parameters of the XGBoost regressor.
params = dict(
    n_estimators=3000,
    max_depth=10,
    learning_rate=0.1426009017992351,
    gamma=3,
    min_child_weight=190,
    subsample=0.8,
    colsample_bytree=0.08825496788241148,
    reg_alpha=25,
    reg_lambda=40,
)
xgb_reg = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    **params,
)


In [None]:
# Model inputs are all columns except the row id and the label.
target = train['target']
features = train.drop(['id', 'target'], axis='columns')

In [None]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
final_preds = []

for train_index, valid_index in kf.split(features):
    X_train, X_valid = features.iloc[train_index], features.iloc[valid_index] 
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    xgb_reg.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)
#     lgbm_reg.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)
    # validation prediction
    preds_valid = pd.DataFrame(index=X_valid.index)
    preds_valid['xgb'] = xgb_reg.predict(X_valid)

    rmse_scores.append(mean_squared_error(y_valid, preds_valid['xgb'], squared=False))
    
    # test prediction
    preds_test = pd.DataFrame(test['id'])
    preds_test['xgb'] = xgb_reg.predict(test.drop(columns=['id']))


    final_preds.append(preds_test['xgb'])
print(rmse_scores)


In [None]:
# Average validation RMSE over the folds. np.mean on the list of scores gives
# the scalar directly; stacking scalars into a 1x5 matrix first only produced a
# one-element array.
print(np.mean(rmse_scores))

In [None]:
# xgb_reg.fit(features, target, verbose=1000)

# # test prediction
# preds_test = pd.DataFrame(test['id'])
# preds_test['xgb'] = xgb_reg.predict(test.drop(columns=['id']))


## submit 

In [None]:
# Stack the entries of `final_preds` as columns and use the row-wise mean as
# the submitted target.
fold_matrix = np.column_stack(final_preds)
sample_submission['target'] = fold_matrix.mean(axis=1)

In [None]:
# Preview the first five rows of the submission frame.
sample_submission.head(5)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission_path = 'submission.csv'
sample_submission.to_csv(submission_path, index=False)