In [17]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

from xgboost import XGBRegressor

import optuna

In [18]:
# Shared settings for every model in this notebook.
random_state = 42      # seed handed to the KFold splitters
KF_split_optuna = 5    # folds used by the tuning objective in optuna_tuning_fitting
KF_split = 7           # folds used when refitting with the tuned parameters
optuna_trials = 15     # number of trials per Optuna study

# Show Optuna warnings and errors only, which hides the per-trial result lines.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [19]:
# Raw competition data; the first CSV column becomes the index of each frame.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Frames that collect predictions for blending.
# Validation frame: the target plus the former index as a column, on a fresh 0..n-1 index.
y_valid_pred_blnd = df_train[['target']].reset_index()
# Test frame: starts with no columns, only the test index.
y_test_pred_blnd = pd.DataFrame(index=df_test.index)

In [4]:
# Preview the first rows of the training frame.
df_train.head()

In [5]:
# Pipelines that ordinal-encode the categorical columns.
_ORDINAL_MODELS = ('model_1', 'model_2', 'model_3', 'model_4')
# Pipelines that additionally standardise the numeric columns.
_SCALED_MODELS = ('model_1', 'model_2')
# Pipelines that replace each category by the training-fold mean of the target.
_TARGET_MEAN_MODELS = ('model_5',)


def _prepare_fold(model_name, X_fit, y_fit, X_apply, cat_col, num_col):
    """Fit the preprocessing selected by ``model_name`` on one training fold and apply it.

    Args:
        model_name: One of 'model_1' ... 'model_5'.
        X_fit: Training-fold features; every transformer is fitted on this frame only.
        y_fit: Training-fold target, row-aligned with ``X_fit`` (used by 'model_5').
        X_apply: List of further feature frames (validation fold, test set) that
            receive the already-fitted transformation.
        cat_col: Names of the categorical columns.
        num_col: Names of the numeric columns.

    Returns:
        tuple: ``(X_fit_prepared, [X_apply_prepared, ...])``. All frames are copies;
        the inputs are left untouched.

    Raises:
        ValueError: If ``model_name`` is not a known pipeline.
    """
    X_fit = X_fit.copy()
    X_apply = [frame.copy() for frame in X_apply]

    if model_name in _ORDINAL_MODELS:
        encoder = preprocessing.OrdinalEncoder()
        X_fit[cat_col] = encoder.fit_transform(X_fit[cat_col])
        for frame in X_apply:
            frame[cat_col] = encoder.transform(frame[cat_col])

        if model_name in _SCALED_MODELS:
            scaler = preprocessing.StandardScaler()
            X_fit[num_col] = scaler.fit_transform(X_fit[num_col])
            for frame in X_apply:
                frame[num_col] = scaler.transform(frame[num_col])

    elif model_name in _TARGET_MEAN_MODELS:
        for col in cat_col:
            # Group the target alone: averaging the whole frame is wasted work and
            # fails on the remaining string columns in recent pandas versions.
            map_dict = y_fit.groupby(X_fit[col]).mean().to_dict()
            X_fit[col] = X_fit[col].map(map_dict)
            for frame in X_apply:
                # Categories absent from the training fold map to NaN.
                frame[col] = frame[col].map(map_dict)

    else:
        raise ValueError(f'Unknown model_name: {model_name!r}')

    return X_fit, X_apply


def _fit_xgb(params, seed, X_fit, y_fit, X_valid, y_valid):
    """Fit a GPU XGBRegressor, using the validation fold as early-stopping set."""
    model = XGBRegressor(**params,
                         random_state=seed,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    model.fit(X_fit, y_fit,
              eval_set=[(X_valid, y_valid)],
              early_stopping_rounds=300,
              verbose=False)
    return model


def optuna_tuning_fitting(model_name):
    """Tune an XGBRegressor with Optuna, refit it with K-fold CV and store predictions.

    Reads the notebook globals ``X_train``, ``y_train``, ``df_test``, ``cat_col``,
    ``num_col``, ``KF_split``, ``KF_split_optuna``, ``optuna_trials`` and
    ``random_state``. Nothing is returned: the out-of-fold predictions are written
    to column ``model_name`` of ``y_valid_pred_blnd`` and the fold-averaged test
    predictions to column ``model_name`` of ``y_test_pred_blnd``.

    Args:
        model_name: Preprocessing pipeline to use, 'model_1' ... 'model_5'.

    Raises:
        ValueError: If ``model_name`` is not a known pipeline.
    """
    if model_name not in _ORDINAL_MODELS + _TARGET_MEAN_MODELS:
        raise ValueError(f'Unknown model_name: {model_name!r}')

    # Optimizing Using Optuna
    def objective(trial):
        """Return the validation RMSE of one trial, averaged over all tuning folds."""
        # params = {
        #     'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
        #     'learning_rate': trial.suggest_loguniform('learning_rate',0.005,0.5),
        #     'max_depth': trial.suggest_int('max_depth', 1, 7),
        #     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
        #     'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
        #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        #     'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
        #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        #     'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
        # }
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
            'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
        }

        kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)

        fold_rmse = []
        for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
            y_train_f = y_train.iloc[train_idx]
            y_valid_f = y_train.iloc[valid_idx]

            # The test set plays no part in scoring a trial, so only the
            # validation fold is transformed here.
            X_train_f, (X_valid_f,) = _prepare_fold(
                model_name, X_train.iloc[train_idx], y_train_f,
                [X_train.iloc[valid_idx]], cat_col, num_col)

            model = _fit_xgb(params, fold, X_train_f, y_train_f, X_valid_f, y_valid_f)
            y_pred_f = model.predict(X_valid_f)
            fold_rmse.append(metrics.mean_squared_error(y_valid_f, y_pred_f, squared=False))

        # Return only after every fold has been scored. Returning from inside the
        # loop would rate each trial on the first fold alone.
        return float(np.mean(fold_rmse))

    # Optimizing
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=optuna_trials)

    print(f'Best score: {study.best_value:.5f}')
    print(f'Best Params: {study.best_params}')

#-----------------------------------------------------------------------------------------------

    # Fitting the tuned Model
    y_test_pred = []  # one array of test predictions per fold

    kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
        y_train_f = y_train.iloc[train_idx]
        y_valid_f = y_train.iloc[valid_idx]

        X_train_f, (X_valid_f, X_test) = _prepare_fold(
            model_name, X_train.iloc[train_idx], y_train_f,
            [X_train.iloc[valid_idx], df_test], cat_col, num_col)

        model = _fit_xgb(study.best_params, fold, X_train_f, y_train_f, X_valid_f, y_valid_f)

        y_pred_f = model.predict(X_valid_f)
        rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
        print(f'fold-{fold} rmse : {rmse:.5f}')

        y_test_pred.append(model.predict(X_test))

        # Updating Blending DataFrame. valid_idx holds row positions, which match
        # the labels of y_valid_pred_blnd because that frame was reset to a 0..n-1 index.
        y_valid_pred_blnd.loc[valid_idx, model_name] = y_pred_f

    # Average the per-fold test predictions into one column.
    y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
    y_test_pred_blnd.loc[:, model_name] = y_test_model


In [6]:
# Model 1: start from freshly loaded files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

optuna_tuning_fitting(model_name='model_1')

In [7]:
# Model 2: start from freshly loaded files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

optuna_tuning_fitting(model_name='model_2')


In [8]:
# Model 3: start from freshly loaded files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Column groups are picked by name: 'cont' marks numeric, 'cat' marks categorical.
num_col = [name for name in df_train.columns if 'cont' in name]
cat_col = [name for name in df_train.columns if 'cat' in name]

y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
# X_test is the same object as df_test, so the transform below also changes df_test,
# which is the frame optuna_tuning_fitting reads.
X_test = df_test

# Log transformation of all numeric columns at once: log(1 + x)
X_train[num_col] = np.log1p(X_train[num_col])
X_test[num_col] = np.log1p(X_test[num_col])

optuna_tuning_fitting(model_name='model_3')


In [9]:
# Model 4: start from freshly loaded files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

X_train = df_train.drop('target', axis=1).copy()
y_train = df_train.target
X_test = df_test  # keeps pointing at the raw test frame after df_test is rebuilt below

num_col = [col for col in X_train.columns if 'cat' not in col]
cat_col = [col for col in X_train.columns if 'cat' in col]

# Polynomials: the original numeric columns plus their pairwise products
# (degree 2, interactions only, no bias column).
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

train_poly = poly.fit_transform(X_train[num_col])
X_train_poly = pd.DataFrame(train_poly,
                            columns=[f'poly_{i}' for i in range(train_poly.shape[1])],
                            index=X_train.index)  # keep the index so the concatenation aligns rows
# Only cat_col is kept from the original frame: the numeric columns are already
# part of the polynomial output.
X_train = pd.concat([X_train[cat_col], X_train_poly], axis=1)

# Reuse the transformer fitted on the training data; the test set is only transformed.
test_poly = poly.transform(df_test[num_col])
df_test_poly = pd.DataFrame(test_poly,
                            columns=[f'poly_{i}' for i in range(test_poly.shape[1])],
                            index=X_test.index)
# df_test is rebound because optuna_tuning_fitting reads the test features from it.
df_test = pd.concat([X_test[cat_col], df_test_poly], axis=1)

optuna_tuning_fitting(model_name='model_4')


In [10]:
# Model 5: start from freshly loaded files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target']).copy()
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

optuna_tuning_fitting(model_name='model_5')

In [11]:
# Show the validation blending frame (target plus one prediction column per fitted model).
y_valid_pred_blnd

In [12]:
# Same display as the previous cell: the validation blending frame.
y_valid_pred_blnd

In [13]:
# Show the test blending frame (one fold-averaged prediction column per fitted model).
y_test_pred_blnd

In [14]:
# Same display as the previous cell: the test blending frame.
y_test_pred_blnd

In [15]:
# Extra names for the two blending frames. These are references, not copies, so
# later writes to y_valid_pred_blnd / y_test_pred_blnd are visible through a / b.
a, b = y_valid_pred_blnd, y_test_pred_blnd

In [16]:
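# NOTE(review): this cell contained only placeholder text ("df sd fsd fsd fsdf"), which is not valid Python and stops Run All with a SyntaxError. If it was meant as a deliberate stop marker, raise an explicit exception with a message instead.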

## Model #1 

In [20]:
# Model 1 inputs, loaded fresh from the CSV files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

In [21]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for Model #1: mean cross-validated RMSE of an XGBRegressor.

    Categorical columns are ordinal-encoded and numeric columns standardised, with
    both transformers fitted on the training fold only. Reads the notebook globals
    ``X_train``, ``y_train``, ``cat_col``, ``num_col``, ``KF_split_optuna`` and
    ``random_state``.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` and ``max_depth``.

    Returns:
        float: Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    # params = {
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
    #     'learning_rate': trial.suggest_loguniform('learning_rate',0.005,0.5),
    #     'max_depth': trial.suggest_int('max_depth', 1, 7),
    #     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
    #     'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    #     'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
    #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    #     'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
    # }
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
        'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
    }

    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)

    fold_rmse = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

        # Training and validation folds (the test set is not needed to score a trial)
        X_train_f = X_train.iloc[train_idx].copy()
        y_train_f = y_train.iloc[train_idx]

        X_valid_f = X_train.iloc[valid_idx].copy()
        y_valid_f = y_train.iloc[valid_idx]

        # Encoding Categorical variables
        encoder = preprocessing.OrdinalEncoder()
        X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
        X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])

        # Scaling Features
        scaler = preprocessing.StandardScaler()
        X_train_f[num_col] = scaler.fit_transform(X_train_f[num_col])
        X_valid_f[num_col] = scaler.transform(X_valid_f[num_col])

        # Modeling: the fold number doubles as the model seed
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        fold_rmse.append(metrics.mean_squared_error(y_valid_f, y_pred_f, squared=False))

    # Return only after every fold has been scored. Returning from inside the
    # loop would rate each trial on the first fold alone.
    return float(np.mean(fold_rmse))
    
    
# Optimizing: run optuna_trials trials, minimising the value returned by objective

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the lowest objective value found and the parameters that produced it
print(f'Best score: {study.best_value:.5f}')
print(f'Best Params: {study.best_params}')

In [22]:
# Final Model: refit with study.best_params on KF_split folds, store the
# out-of-fold predictions for blending and average the per-fold test predictions.
y_test_pred = []  # one array of test predictions per fold

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train)):
    
    # Generating X and y for the training and validation folds (KFold yields row positions)
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]
    
    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]
    
    # Fresh copy on every fold because the encoder and scaler below overwrite its columns
    X_test = df_test.copy()
    
    # Encoding Categorical variables (encoder is fitted on the training fold only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col]) #cat_col
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])
    
    # Scaling Features (scaler is fitted on the training fold only)
    scaler = preprocessing.StandardScaler()
    X_train_f[num_col] = scaler.fit_transform(X_train_f[num_col])
    X_valid_f[num_col] = scaler.transform(X_valid_f[num_col])
    X_test[num_col] = scaler.transform(X_test[num_col])
    
    # Modeling: the fold number doubles as the model seed
    model = XGBRegressor(**study.best_params,
                         random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    # The validation fold also serves as the early-stopping evaluation set
    model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f,y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)
    
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    
    # Updating Blending DataFrame: valid_idx holds row positions, used here as row
    # labels (y_valid_pred_blnd was built with reset_index, so labels are 0..n-1)
    y_valid_pred_blnd.loc[valid_idx,'model_1'] = y_pred_f
    
# Average the per-fold test predictions into one column
y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
y_test_pred_blnd.loc[:,'model_1'] = y_test_model

## Model #2

In [23]:
# Model 2 inputs, loaded fresh from the CSV files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

In [24]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for Model #2: mean cross-validated RMSE of an XGBRegressor.

    Categorical columns are ordinal-encoded and numeric columns standardised, with
    both transformers fitted on the training fold only. Reads the notebook globals
    ``X_train``, ``y_train``, ``cat_col``, ``num_col``, ``KF_split_optuna`` and
    ``random_state``.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` and ``max_depth``.

    Returns:
        float: Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    # params = {
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
    #     'learning_rate': trial.suggest_loguniform('learning_rate',0.005,0.5),
    #     'max_depth': trial.suggest_int('max_depth', 1, 7),
    #     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
    #     'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    #     'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
    #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    #     'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
    # }
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
        'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
    }

    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)

    fold_rmse = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

        # Training and validation folds (the test set is not needed to score a trial)
        X_train_f = X_train.iloc[train_idx].copy()
        y_train_f = y_train.iloc[train_idx]

        X_valid_f = X_train.iloc[valid_idx].copy()
        y_valid_f = y_train.iloc[valid_idx]

        # Encoding Categorical variables
        encoder = preprocessing.OrdinalEncoder()
        X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
        X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])

        # Scaling Features
        scaler = preprocessing.StandardScaler()
        X_train_f[num_col] = scaler.fit_transform(X_train_f[num_col])
        X_valid_f[num_col] = scaler.transform(X_valid_f[num_col])

        # Modeling: the fold number doubles as the model seed
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        fold_rmse.append(metrics.mean_squared_error(y_valid_f, y_pred_f, squared=False))

    # Return only after every fold has been scored. Returning from inside the
    # loop would rate each trial on the first fold alone.
    return float(np.mean(fold_rmse))
    
    
# Optimizing: run optuna_trials trials, minimising the value returned by objective
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the lowest objective value found and the parameters that produced it
print(f'Best score: {study.best_value:.5f}')
print(f'Best Params: {study.best_params}')

In [25]:
# Final Model: refit with study.best_params on KF_split folds, store the
# out-of-fold predictions for blending and average the per-fold test predictions.
y_test_pred = []  # one array of test predictions per fold

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train)):
    
    # Generating X and y for the training and validation folds (KFold yields row positions)
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]
    
    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]
    
    # Fresh copy on every fold because the encoder and scaler below overwrite its columns
    X_test = df_test.copy()
    
    # Encoding Categorical variables (encoder is fitted on the training fold only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col]) #cat_col
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])
    
    # Scaling Features (scaler is fitted on the training fold only)
    scaler = preprocessing.StandardScaler()
    X_train_f[num_col] = scaler.fit_transform(X_train_f[num_col])
    X_valid_f[num_col] = scaler.transform(X_valid_f[num_col])
    X_test[num_col] = scaler.transform(X_test[num_col])
    
    # Modeling: the fold number doubles as the model seed
    model = XGBRegressor(**study.best_params,
                         random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    # The validation fold also serves as the early-stopping evaluation set
    model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f,y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)
    
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    
    # Updating Blending DataFrame: valid_idx holds row positions, used here as row
    # labels (y_valid_pred_blnd was built with reset_index, so labels are 0..n-1)
    y_valid_pred_blnd.loc[valid_idx,'model_2'] = y_pred_f
    
# Average the per-fold test predictions into one column
y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
y_test_pred_blnd.loc[:,'model_2'] = y_test_model

## Model #3

In [26]:
# Model 3 inputs, loaded fresh from the CSV files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Column groups are picked by name: 'cont' marks numeric, 'cat' marks categorical.
num_col = [name for name in df_train.columns if 'cont' in name]
cat_col = [name for name in df_train.columns if 'cat' in name]

y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
# X_test is the same object as df_test, so the transform below also changes df_test,
# which is the frame the later cells copy their test features from.
X_test = df_test

# Log transformation of all numeric columns at once: log(1 + x)
X_train[num_col] = np.log1p(X_train[num_col])
X_test[num_col] = np.log1p(X_test[num_col])

In [27]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for Model #3: mean cross-validated RMSE of an XGBRegressor.

    Categorical columns are ordinal-encoded with an encoder fitted on the training
    fold only; numeric columns are used as they are in ``X_train``. Reads the
    notebook globals ``X_train``, ``y_train``, ``cat_col``, ``KF_split_optuna``
    and ``random_state``.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` and ``max_depth``.

    Returns:
        float: Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    # params = {
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
    #     'learning_rate': trial.suggest_loguniform('learning_rate',0.005,0.5),
    #     'max_depth': trial.suggest_int('max_depth', 1, 7),
    #     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
    #     'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    #     'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
    #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    #     'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
    # }
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
        'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
    }

    # Tuning uses KF_split_optuna folds, like the objectives of Models #1 and #2;
    # KF_split is the fold count of the final fit.
    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)

    fold_rmse = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

        # Training and validation folds (the test set is not needed to score a trial)
        X_train_f = X_train.iloc[train_idx].copy()
        y_train_f = y_train.iloc[train_idx]

        X_valid_f = X_train.iloc[valid_idx].copy()
        y_valid_f = y_train.iloc[valid_idx]

        # Encoding Categorical variables
        encoder = preprocessing.OrdinalEncoder()
        X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
        X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])

        # Modeling: the fold number doubles as the model seed
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        fold_rmse.append(metrics.mean_squared_error(y_valid_f, y_pred_f, squared=False))

    # Return only after every fold has been scored. Returning from inside the
    # loop would rate each trial on the first fold alone.
    return float(np.mean(fold_rmse))
    
    
# Optimizing: run optuna_trials trials, minimising the value returned by objective
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the lowest objective value found and the parameters that produced it
print(f'Best score: {study.best_value:.5f}')
print(f'Best Params: {study.best_params}')

In [28]:
# Final Model: refit with study.best_params on KF_split folds, store the
# out-of-fold predictions for blending and average the per-fold test predictions.
y_test_pred = []  # one array of test predictions per fold

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train)):
    
    # Generating X and y for the training and validation folds (KFold yields row positions)
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]
    
    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]
    
    # Fresh copy on every fold because the encoder below overwrites its columns
    X_test = df_test.copy()
    
    # Encoding Categorical variables (encoder is fitted on the training fold only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col]) #cat_col
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])
    
    # Modeling: the fold number doubles as the model seed
    model = XGBRegressor(**study.best_params,
                         random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    # The validation fold also serves as the early-stopping evaluation set
    model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f,y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)
    
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    
    # Updating Blending DataFrame: valid_idx holds row positions, used here as row
    # labels (y_valid_pred_blnd was built with reset_index, so labels are 0..n-1)
    y_valid_pred_blnd.loc[valid_idx,'model_3'] = y_pred_f
    
# Average the per-fold test predictions into one column
y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
y_test_pred_blnd.loc[:,'model_3'] = y_test_model

## Model #4

In [29]:
# Model 4 inputs, loaded fresh from the CSV files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

X_train = df_train.drop('target', axis=1).copy()
y_train = df_train.target
X_test = df_test  # keeps pointing at the raw test frame after df_test is rebuilt below

num_col = [col for col in X_train.columns if 'cat' not in col]
cat_col = [col for col in X_train.columns if 'cat' in col]

# Polynomials: the original numeric columns plus their pairwise products
# (degree 2, interactions only, no bias column).
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

train_poly = poly.fit_transform(X_train[num_col])
X_train_poly = pd.DataFrame(train_poly,
                            columns=[f'poly_{i}' for i in range(train_poly.shape[1])],
                            index=X_train.index)  # keep the index so the concatenation aligns rows
# Only cat_col is kept from the original frame: the numeric columns are already
# part of the polynomial output.
X_train = pd.concat([X_train[cat_col], X_train_poly], axis=1)

# Reuse the transformer fitted on the training data; the test set is only transformed.
test_poly = poly.transform(df_test[num_col])
df_test_poly = pd.DataFrame(test_poly,
                            columns=[f'poly_{i}' for i in range(test_poly.shape[1])],
                            index=X_test.index)
# df_test is rebound because the following cells copy their test features from it.
df_test = pd.concat([X_test[cat_col], df_test_poly], axis=1)

In [30]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for Model #4: mean cross-validated RMSE of an XGBRegressor.

    Categorical columns are ordinal-encoded with an encoder fitted on the training
    fold only; all other columns are used as they are in ``X_train``. Reads the
    notebook globals ``X_train``, ``y_train``, ``cat_col``, ``KF_split_optuna``
    and ``random_state``.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` and ``max_depth``.

    Returns:
        float: Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    # params = {
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
    #     'learning_rate': trial.suggest_loguniform('learning_rate',0.005,0.5),
    #     'max_depth': trial.suggest_int('max_depth', 1, 7),
    #     'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 100.0),
    #     'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 100.0),
    #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    #     'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
    #     'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    #     'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
    # }
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
        'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
    }

    # Tuning uses KF_split_optuna folds, like the objectives of Models #1 and #2;
    # KF_split is the fold count of the final fit.
    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)

    fold_rmse = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

        # Training and validation folds (the test set is not needed to score a trial)
        X_train_f = X_train.iloc[train_idx].copy()
        y_train_f = y_train.iloc[train_idx]

        X_valid_f = X_train.iloc[valid_idx].copy()
        y_valid_f = y_train.iloc[valid_idx]

        # Encoding Categorical variables
        encoder = preprocessing.OrdinalEncoder()
        X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
        X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])

        # Modeling: the fold number doubles as the model seed
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        fold_rmse.append(metrics.mean_squared_error(y_valid_f, y_pred_f, squared=False))

    # Return only after every fold has been scored. Returning from inside the
    # loop would rate each trial on the first fold alone.
    return float(np.mean(fold_rmse))
    
    
# Optimizing: run optuna_trials trials, minimising the value returned by objective
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the lowest objective value found and the parameters that produced it
print(f'Best score: {study.best_value:.5f}')
print(f'Best Params: {study.best_params}')

In [31]:
# Final Model: refit with study.best_params on KF_split folds, store the
# out-of-fold predictions for blending and average the per-fold test predictions.
y_test_pred = []  # one array of test predictions per fold

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train)):
    
    # Generating X and y for the training and validation folds (KFold yields row positions)
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]
    
    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]
    
    # Fresh copy on every fold because the encoder below overwrites its columns
    X_test = df_test.copy()
    
    # Encoding Categorical variables (encoder is fitted on the training fold only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col]) #cat_col
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])
    
    # Modeling: the fold number doubles as the model seed
    model = XGBRegressor(**study.best_params,
                         random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    # The validation fold also serves as the early-stopping evaluation set
    model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f,y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)
    
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    
    # Updating Blending DataFrame: valid_idx holds row positions, used here as row
    # labels (y_valid_pred_blnd was built with reset_index, so labels are 0..n-1)
    y_valid_pred_blnd.loc[valid_idx,'model_4'] = y_pred_f
    
# Average the per-fold test predictions into one column
y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
y_test_pred_blnd.loc[:,'model_4'] = y_test_model

## Model #5

In [32]:
# Model 5 inputs, loaded fresh from the CSV files.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Split features from the target; X_test is another name for df_test, not a copy.
y_train = df_train['target']
X_train = df_train.drop(columns=['target'])
X_test = df_test

# Columns with 'cat' in their name are categorical, all others numeric.
cat_col = [name for name in X_train.columns if 'cat' in name]
num_col = [name for name in X_train.columns if name not in cat_col]

In [33]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for Model #5: XGBoost on target-encoded categoricals.

    Each categorical column is replaced by the mean target of its category,
    computed on the training fold only, before an XGBRegressor is fitted with
    the hyper-parameters sampled from ``trial``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` and ``max_depth``.

    Returns
    -------
    float
        Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 7000, 10000, step=1000),
        'max_depth': trial.suggest_int('max_depth', 2, 8, step=3),
    }

    # Tuning uses the (smaller) optuna-specific fold count, like the other objectives
    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)
    fold_rmse = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

        # Generating X and y for train and validation sets
        X_train_f = X_train.iloc[train_idx].copy()
        y_train_f = y_train.iloc[train_idx]

        X_valid_f = X_train.iloc[valid_idx].copy()
        y_valid_f = y_train.iloc[valid_idx]

        # Untouched copy of the training fold: the category means must be
        # computed from the original labels, not from already-encoded columns
        df_train_f = pd.concat([X_train_f, y_train_f], axis=1)

        for col in cat_col:
            # Average only the target column; averaging the whole frame fails
            # on the remaining string columns with recent pandas versions
            map_dict = df_train_f.groupby(col)['target'].mean().to_dict()
            X_train_f[col] = X_train_f[col].map(map_dict)
            # Categories unseen in the training fold become NaN here
            X_valid_f[col] = X_valid_f[col].map(map_dict)

        # Modeling
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        # sqrt(MSE) instead of the deprecated ``squared=False`` flag
        fold_rmse.append(np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f)))

    # Return only after every fold has been evaluated
    return float(np.mean(fold_rmse))
    
    
# Optimizing: run the hyper-parameter search (lower RMSE is better)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the best trial
print(f'Best score: {study.best_value:.5f}',
      f'Best Params: {study.best_params}',
      sep='\n')

In [34]:
# Final Model (model 5): XGBoost on target-encoded categorical features,
# refitted per fold with the best hyper-parameters found by the study
y_test_pred = []

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train)):

    # Generating X and y for train and validation sets
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]
    
    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]
    
    # Fresh copy per fold: the test set is re-encoded with this fold's category means
    X_test = df_test.copy()
    
    # Untouched copy of the training fold, used to compute the category means
    df_train_f = pd.concat([X_train_f,y_train_f], axis=1)
    
    # Target encoding: replace each category by its mean target in the training fold
    for col in cat_col:
        # Average only the target column; averaging the whole frame fails on the
        # remaining string columns with recent pandas versions
        map_dict = df_train_f.groupby(col)['target'].mean().to_dict()
        X_train_f[col] = X_train_f[col].map(map_dict)
        # Categories unseen in the training fold become NaN in valid / test
        X_valid_f[col] = X_valid_f[col].map(map_dict)
        X_test[col] = X_test[col].map(map_dict)
    
    # Modeling
    model = XGBRegressor(**study.best_params,
                         random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f,y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)
    
    y_pred_f = model.predict(X_valid_f)
    # sqrt(MSE) instead of the deprecated ``squared=False`` flag
    rmse = np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f))
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    
    # Updating Blending DataFrame with the out-of-fold predictions
    y_valid_pred_blnd.loc[valid_idx,'model_5'] = y_pred_f
    
# Average the per-fold test predictions row-wise and store them for blending
y_test_model = np.mean(np.column_stack(y_test_pred), axis=1)
y_test_pred_blnd.loc[:,'model_5'] = y_test_model

# Stacking

## Blending #1

In [None]:
# Meta-target: the true target; meta-features: every remaining column of the
# blending frame once 'id' and 'target' are removed
y_train_blnd = y_valid_pred_blnd['target']
X_train_blnd = y_valid_pred_blnd.drop(columns=['id', 'target'])

# Test-side blending frame, used as-is (same object, not a copy)
X_test_blnd = y_test_pred_blnd

In [None]:
# Optimizing Using Optuna

def objective(trial):
    """Optuna objective for the blending (stacking) XGBoost model.

    Fits an XGBRegressor on the blending frame ``X_train_blnd`` /
    ``y_train_blnd`` with hyper-parameters sampled from ``trial`` and scores
    it with K-fold cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyper-parameters.

    Returns
    -------
    float
        Validation RMSE averaged over all ``KF_split_optuna`` folds.
    """
    # suggest_float(log=True / step=...) replaces the deprecated
    # suggest_loguniform / suggest_discrete_uniform with the same search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 7000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 7),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 0.1, 1.0, step=0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, step=0.01),
    }

    kf = model_selection.KFold(n_splits=KF_split_optuna, shuffle=True, random_state=random_state)
    fold_rmse = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_blnd)):

        X_train_f = X_train_blnd.iloc[train_idx].copy()
        y_train_f = y_train_blnd.iloc[train_idx]

        X_valid_f = X_train_blnd.iloc[valid_idx].copy()
        y_valid_f = y_train_blnd.iloc[valid_idx]

        # Modeling
        model = XGBRegressor(**params,
                             random_state=fold,
                             tree_method='gpu_hist',
                             gpu_id=0,
                             predictor='gpu_predictor')

        model.fit(X_train_f, y_train_f,
                  eval_set=[(X_valid_f, y_valid_f)],
                  early_stopping_rounds=300,
                  verbose=False)

        y_pred_f = model.predict(X_valid_f)
        # sqrt(MSE) instead of the deprecated ``squared=False`` flag
        fold_rmse.append(np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f)))

    # Return only after every fold has been evaluated
    return float(np.mean(fold_rmse))
    
    
# Optimizing: search the blender's hyper-parameters (lower RMSE is better)
# (optuna verbosity is left untouched here; the set_verbosity call stays disabled)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=optuna_trials)

# Report the best trial
print(f'Best score: {study.best_value:.5f}',
      f'Best Params: {study.best_params}',
      sep='\n')

In [None]:
# Final Model (blender): refit the tuned XGBoost on each fold of the blending frame
kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)
y_test_pred = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_blnd)):

    # Fold-specific train / validation data plus a copy of the test blending frame
    X_train_f, X_valid_f = X_train_blnd.iloc[train_idx].copy(), X_train_blnd.iloc[valid_idx].copy()
    y_train_f, y_valid_f = y_train_blnd.iloc[train_idx], y_train_blnd.iloc[valid_idx]
    X_test = X_test_blnd.copy()

    # Modeling: the fold number doubles as the seed; the validation fold drives early stopping
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor',
                         **study.best_params)
    model.fit(
        X_train_f,
        y_train_f,
        eval_set=[(X_valid_f, y_valid_f)],
        early_stopping_rounds=300,
        verbose=0,
    )

    # Out-of-fold score
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')

    # Test-set prediction of this fold's model
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)

# NOTE: the per-fold test predictions are only collected in y_test_pred here;
# this cell does not average them or write anything back to the blending frames.

In [None]:
# (stray keystrokes removed - this cell previously raised a SyntaxError and stopped "Run All")

In [None]:
# Blending with an XGBRegressor that receives no tuned hyper-parameters
# (only the seed and the GPU settings are passed to the constructor).
# Per-fold test predictions are collected in y_test_pred.
y_test_pred = []

kf = model_selection.KFold(n_splits=KF_split, shuffle=True, random_state=random_state)

for fold, (train_idx,valid_idx) in enumerate(kf.split(X_train_blnd)):
    
    # Fold-specific train / validation slices of the blending frame
    X_train_f = X_train_blnd.iloc[train_idx].copy()
    y_train_f = y_train_blnd.iloc[train_idx]
    
    X_valid_f = X_train_blnd.iloc[valid_idx].copy()
    y_valid_f = y_train_blnd.iloc[valid_idx]
    
    X_test = X_test_blnd.copy()
    
    # Modeling
    # The fold number is used as the random seed, so each fold's model is seeded differently
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    
    # The validation fold is passed as eval_set together with early_stopping_rounds=300
    model.fit(X_train_f, y_train_f,
              eval_set=[(X_valid_f, y_valid_f)],
              early_stopping_rounds=300,
              verbose=0)
    
    # RMSE of this fold's model on its validation slice
    y_pred_f = model.predict(X_valid_f)
    rmse = metrics.mean_squared_error(y_pred_f, y_valid_f, squared=False)
    print(f'fold-{fold} rmse : {rmse:.5f}')
    
    # Keep this fold's test-set prediction
    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)