In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

from xgboost import XGBRegressor

In [2]:
# Load the competition train/test tables; the first CSV column becomes the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../input/30-days-of-ml/train.csv',
                     '../input/30-days-of-ml/test.csv')
)

In [3]:
# Preview the first five training rows to check the column layout.
df_train.head(n=5)

In [4]:
# Baseline experiment: ordinal-encoded categoricals + standardized numerics,
# one XGBoost regressor trained per fold of a 5-fold cross-validation.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

X_train = df_train.drop('target', axis=1)
y_train = df_train.target
X_test = df_test

# Columns are split by name: anything containing 'cat' is treated as categorical.
num_col = [col for col in X_train.columns if 'cat' not in col]
cat_col = [col for col in X_train.columns if 'cat' in col]

# One array of test-set predictions is appended per fold.
y_test_pred = []

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

    # Generating X and y for the training and validation parts of this fold
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]

    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]

    # Fresh copy each fold: the encoder/scaler below are refit per fold,
    # so the test set must be re-transformed from the untouched frame.
    X_test = df_test.copy()

    # Encoding categorical variables (fit on the fold's training rows only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])

    # Scaling numeric features (fit on the fold's training rows only)
    scaler = preprocessing.StandardScaler()
    X_train_f[num_col] = scaler.fit_transform(X_train_f[num_col])
    X_valid_f[num_col] = scaler.transform(X_valid_f[num_col])
    X_test[num_col] = scaler.transform(X_test[num_col])

    # Modeling (the GPU settings presumably require a CUDA-enabled XGBoost build)
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    model.fit(X_train_f, y_train_f)

    y_pred_f = model.predict(X_valid_f)
    # RMSE as sqrt(MSE) with (y_true, y_pred) argument order; this avoids the
    # `squared=False` keyword, which is deprecated in recent scikit-learn.
    rmse = np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f))
    print(f'fold-{fold} rmse : {rmse:.5f}')

    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    

In [5]:
# Convert the list of per-fold prediction arrays into a 2-D array with column_stack, then average across folds
#y_test_pred = np.mean(np.column_stack(y_test_pred), axis=1)

In [6]:
# Experiment: log1p-transformed numeric features + ordinal-encoded categoricals,
# one XGBoost regressor trained per fold of a 5-fold cross-validation.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

# Columns are split by name: anything containing 'cat' is treated as categorical.
feature_cols = df_train.columns.drop('target')
num_col = [col for col in feature_cols if 'cat' not in col]
cat_col = [col for col in feature_cols if 'cat' in col]

# Log transformation.
# This has to happen BEFORE X_train is derived: `drop` returns a separate
# frame, so transforming df_train afterwards would leave the training features
# untransformed while the test features are transformed.
for col in num_col:
    df_train[col] = np.log1p(df_train[col])
    df_test[col] = np.log1p(df_test[col])

X_train = df_train.drop('target', axis=1)
y_train = df_train.target
X_test = df_test

# Start from an empty list so only this experiment's fold predictions are
# collected (and the cell also runs on a fresh kernel).
y_test_pred = []

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

    # Generating X and y for the training and validation parts of this fold
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]

    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]

    # Fresh copy each fold: the encoder below is refit per fold,
    # so the test set must be re-encoded from the un-encoded frame.
    X_test = df_test.copy()

    # Encoding categorical variables (fit on the fold's training rows only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])

    # Modeling (the GPU settings presumably require a CUDA-enabled XGBoost build)
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    model.fit(X_train_f, y_train_f)

    y_pred_f = model.predict(X_valid_f)
    # RMSE as sqrt(MSE) with (y_true, y_pred) argument order; this avoids the
    # `squared=False` keyword, which is deprecated in recent scikit-learn.
    rmse = np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f))
    print(f'fold-{fold} rmse : {rmse:.5f}')

    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    

In [9]:
# Experiment: pairwise interaction features of the numeric columns +
# ordinal-encoded categoricals, one XGBoost regressor per fold of a 5-fold CV.
df_train = pd.read_csv('../input/30-days-of-ml/train.csv', index_col=0)
df_test = pd.read_csv('../input/30-days-of-ml/test.csv', index_col=0)

X_train = df_train.drop('target', axis=1).copy()
y_train = df_train.target
X_test = df_test

# Columns are split by name: anything containing 'cat' is treated as categorical.
num_col = [col for col in X_train.columns if 'cat' not in col]
cat_col = [col for col in X_train.columns if 'cat' in col]

# Start from an empty list so only this experiment's fold predictions are
# collected (and the cell also runs on a fresh kernel).
y_test_pred = []

# Polynomials: degree-2 interaction terms only, no bias column
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

train_poly = poly.fit_transform(X_train[num_col])
poly_names = [f'poly_{i}' for i in range(train_poly.shape[1])]
X_train_poly = pd.DataFrame(train_poly,
                            columns=poly_names,
                            index=X_train.index)  # keep the index so the concatenation aligns rows
# To avoid duplicating the numeric columns, only cat_col is concatenated with the poly frame
X_train = pd.concat([X_train[cat_col], X_train_poly], axis=1)

# Reuse the transformer fitted on the training data instead of refitting on test
test_poly = poly.transform(df_test[num_col])
df_test_poly = pd.DataFrame(test_poly,
                            columns=poly_names,
                            index=X_test.index)
df_test = pd.concat([X_test[cat_col], df_test_poly], axis=1)


# KFold
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):

    # Generating X and y for the training and validation parts of this fold
    X_train_f = X_train.iloc[train_idx].copy()
    y_train_f = y_train.iloc[train_idx]

    X_valid_f = X_train.iloc[valid_idx].copy()
    y_valid_f = y_train.iloc[valid_idx]

    # Fresh copy each fold: the encoder below is refit per fold,
    # so the test set must be re-encoded from the un-encoded frame.
    X_test = df_test.copy()

    # Encoding categorical variables (fit on the fold's training rows only)
    encoder = preprocessing.OrdinalEncoder()
    X_train_f[cat_col] = encoder.fit_transform(X_train_f[cat_col])
    X_valid_f[cat_col] = encoder.transform(X_valid_f[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])

    # Modeling (the GPU settings presumably require a CUDA-enabled XGBoost build)
    model = XGBRegressor(random_state=fold,
                         tree_method='gpu_hist',
                         gpu_id=0,
                         predictor='gpu_predictor')
    model.fit(X_train_f, y_train_f)

    y_pred_f = model.predict(X_valid_f)
    # RMSE as sqrt(MSE) with (y_true, y_pred) argument order; this avoids the
    # `squared=False` keyword, which is deprecated in recent scikit-learn.
    rmse = np.sqrt(metrics.mean_squared_error(y_valid_f, y_pred_f))
    print(f'fold-{fold} rmse : {rmse:.5f}')

    y_test_f = model.predict(X_test)
    y_test_pred.append(y_test_f)
    