In [None]:
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

from IPython.display import display  # Allows the use of display() for DataFrames

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

## Importing training and test datasets


In [None]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

## Checking missing values

In [None]:
#### Checking for NULL values in training data
# Count missing values per column once, then keep only the columns that have any.
null_counts = train_df.isnull().sum()
null_counts = null_counts[null_counts != 0]
print("Total Train Features with NaN Values = " + str(null_counts.size))
if null_counts.size:
    print("Features with NaN => {}".format(list(null_counts.index)))
    # An expression nested inside an `if` body is not auto-rendered by Jupyter,
    # so display() is required for the per-column counts to actually show up.
    display(null_counts.sort_values(ascending=False))

In [None]:
# Find constant feature columns: a standard deviation of exactly zero marks a
# column that carries a single value. 'ID' and 'target' are never candidates.
colsToRemove = [
    name
    for name in train_df.columns
    if name not in ('ID', 'target') and train_df[name].std() == 0
]

# Drop the constant columns from the training set ...
train_df.drop(colsToRemove, axis=1, inplace=True)

# ... and the same columns from the test set, so both keep the same features.
test_df.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

In [None]:
def duplicate_columns(frame):
    """Return the names of columns whose values are repeated by a later column.

    Columns are only compared with other columns of the same dtype. Within each
    dtype group, every column is checked against the columns that follow it; if
    any of them holds exactly the same values, the earlier column's name is
    reported. The last column of a set of identical columns is never reported.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are checked.

    Returns
    -------
    list
        Names of columns that have an identical column later in their group.
    """
    names_by_dtype = frame.columns.to_series().groupby(frame.dtypes).groups
    repeated = []

    for same_dtype_names in names_by_dtype.values():
        block = frame[same_dtype_names]
        labels = block.columns
        width = len(labels)

        for left in range(width):
            left_values = block.iloc[:, left].values
            # A single identical column further along is enough to flag this one.
            has_twin = any(
                np.array_equal(left_values, block.iloc[:, right].values)
                for right in range(left + 1, width)
            )
            if has_twin:
                repeated.append(labels[left])

    return repeated


# Collect the duplicated column names found in the training data and show them.
colsToRemove = duplicate_columns(train_df)
print(colsToRemove)

In [None]:
# Remove the duplicate columns from the training set
train_df.drop(colsToRemove, axis=1, inplace=True)

# Remove the same columns from the test set
test_df.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(colsToRemove)))
print(colsToRemove)

## Delete sparse data: check each feature and drop it if it has fewer than 2 distinct values in the training set.

In [None]:
def drop_sparse(train, test):
    """Drop features that have fewer than two distinct values in ``train``.

    The columns 'ID' and 'target' are never considered. Both frames are
    modified in place, and the same column names are removed from each.

    Parameters
    ----------
    train : pandas.DataFrame
        Training table; decides which columns count as sparse.
    test : pandas.DataFrame
        Test table; must contain every column that gets dropped from ``train``.

    Returns
    -------
    tuple
        The same ``(train, test)`` objects that were passed in.

    Raises
    ------
    KeyError
        If a sparse training column is missing from ``test``.
    """
    sparse_cols = [
        col
        for col in train.columns
        if col not in ('ID', 'target') and len(np.unique(train[col])) < 2
    ]
    # Drop everything in a single call per frame instead of one in-place drop
    # per column inside the loop.
    if sparse_cols:
        train.drop(sparse_cols, axis=1, inplace=True)
        test.drop(sparse_cols, axis=1, inplace=True)
    return train, test

In [None]:
# Remove features with fewer than two distinct training values from both frames.
train_df, test_df = drop_sparse(train_df, test_df)

In [None]:
# Run a garbage-collection pass after the column drops above.
gc.collect()

In [None]:
# Report the (rows, columns) shape of both frames after the column clean-up.
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))

In [None]:
# Model inputs: every training column except the identifier and the label.
X = train_df.drop(columns=["ID", "target"])
# The label is modelled on a log(1 + x) scale.
y = np.log1p(train_df["target"].values)

# Features of the rows to predict, with the identifier column removed.
X_test_predict = test_df.drop(columns=["ID"])

In [None]:
# Hold out 33% of the training rows as a validation split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Fit the scaler on the training split only, then apply the same transform to
# the held-out split. X_test_scaler is paired with y_test in the eval_set and
# RMSE computations further down, so it must come from X_test (the validation
# rows); building it from X_test_predict yields a row count that does not match
# y_test.
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [None]:
# Display the shapes of the training and validation splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# LGBM

In [None]:
# LightGBM regressor. Row subsampling (bagging_fraction) only takes effect when
# bagging_freq is non-zero; the parameter is spelled `bagging_freq`, whereas
# `bagging_frequency` is not a recognised LightGBM name. `seed` is an alias of
# `random_state`, and the two were given conflicting values (17 and 42), so a
# single random_state is passed instead.
# NOTE(review): 42 is kept to match bagging_seed — confirm this is the intended seed.
lgb = LGBMRegressor(objective='regression', num_leaves=100, learning_rate=0.001, bagging_fraction=0.6,
                    feature_fraction=0.6, bagging_freq=6, bagging_seed=42, verbosity=-1, random_state=42,
                    n_jobs=-1, metric='rmse', n_estimators=5000)

In [None]:
# Train on the scaled training split and score RMSE on (X_test_scaler, y_test);
# early stopping uses a patience of 100 rounds and progress is logged every 150.
lgb.fit(
    X_train_scaler,
    y_train,
    eval_set=[(X_test_scaler, y_test)],
    eval_metric='rmse',
    early_stopping_rounds=100,
    verbose=150,
)

In [None]:
# LightGBM predictions for the scaled evaluation features (log1p scale of the target).
preds_lgb_model = lgb.predict(X_test_scaler)

In [None]:
# Root-mean-squared error of the LightGBM predictions against y_test (log1p scale).
rmse_lgb = np.sqrt(mean_squared_error(y_test, preds_lgb_model))
print(" RMSE: %f" % (rmse_lgb))

In [None]:
# Importance score of each feature, labelled with the columns the model was
# trained on. X.columns is used for the labels: test_df.columns[:-1] merely had
# the right length — it drops the last column rather than 'ID', which shifts
# the labels whenever 'ID' is not the last column.
# The single column keeps its original name 'features' although it holds the scores.
lgb_importance = pd.DataFrame(lgb.feature_importances_, index=X.columns, columns=['features'])
lgb_importance.sort_values(by='features', ascending=False)

# XGBRegressor

In [None]:
# XGBoost regressor: RMSE metric, depth-10 trees, 60% row and column
# subsampling, up to 5000 boosting rounds, fixed seed.
# NOTE(review): 'reg:linear' and `silent` are deprecated spellings in newer
# XGBoost releases ('reg:squarederror' / `verbosity`) — confirm against the
# installed version. `random_state` and `seed` are both set to the same value.
xgb = XGBRegressor(objective='reg:linear',
                   eval_metric='rmse',
                   eta=0.001,
                   max_depth=10,
                   subsample=0.6,
                   colsample_bytree=0.6,
                   alpha=0.001,
                   random_state=17,
                   silent=True,
                   n_estimators=5000,
                   n_jobs=-1,
                   seed=17)

In [None]:
# Train on the scaled training split, evaluating on (X_test_scaler, y_test);
# early stopping uses a patience of 100 rounds and progress is logged every 50.
xgb.fit(
    X_train_scaler,
    y_train,
    eval_set=[(X_test_scaler, y_test)],
    early_stopping_rounds=100,
    verbose=50,
)

In [None]:
# Score the XGBoost model against y_test. Dedicated names are used so the
# LightGBM predictions and RMSE computed earlier are not overwritten.
preds_xgb_model = xgb.predict(X_test_scaler)
rmse_xgb = np.sqrt(mean_squared_error(y_test, preds_xgb_model))
print(" RMSE: %f" % (rmse_xgb))

## CatBoostRegressor

In [None]:
# CatBoost regressor: up to 600 iterations of depth-10 trees at learning rate
# 0.05, RMSE as the evaluation metric, fixed random seed.
# od_type / od_wait configure CatBoost's overfitting detector.
# NOTE(review): the fit call below also passes early_stopping_rounds=10 —
# confirm which patience value is intended.
cb_model = CatBoostRegressor(iterations=600,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 17,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [None]:
# Train on the scaled training split, evaluating on (X_test_scaler, y_test);
# the best iteration on the evaluation set is kept and the training plot is shown.
cb_model.fit(
    X_train_scaler,
    y_train,
    eval_set=(X_test_scaler, y_test),
    early_stopping_rounds=10,
    use_best_model=True,
    verbose=50,
    plot=True,
)

In [None]:
# All three models were fitted on standardised features, so the rows to predict
# must pass through the same fitted scaler first. Predictions are on the log1p
# scale of the target and are mapped back with expm1.
X_test_predict_scaler = scaler.transform(X_test_predict)
pred_test_LGBM = np.expm1(lgb.predict(X_test_predict_scaler))
pred_test_XGB = np.expm1(xgb.predict(X_test_predict_scaler))
pred_test_cat = np.expm1(cb_model.predict(X_test_predict_scaler))

In [None]:
# Load the sample submission file; its 'target' column is overwritten further down.
sub = pd.read_csv('sample_submission.csv')

In [None]:
# One single-column frame per model, each holding that model's predictions.
sub_lgb = pd.DataFrame({"target": pred_test_LGBM})
sub_xgb = pd.DataFrame({"target": pred_test_XGB})
sub_cat = pd.DataFrame({"target": pred_test_cat})

In [None]:
# Weighted blend of the three models: 50% LightGBM, 30% XGBoost, 20% CatBoost.
sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)

In [None]:
# Preview the blended submission and write it to CSV without the index column.
print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)