# Attempt 1

* NaNs replaced by zeros
* Using QuantileTransformer on target
* Using random 80/20 train/test split
* XGBoost on RF using parameters from GridSearch

In [7]:
import numpy as np
import pandas as pd
import matplotlib
import math
from sklearn import model_selection, ensemble, metrics, linear_model, preprocessing
from matplotlib import pyplot as plt
%matplotlib inline

In [8]:
# Load the training set from dataset/train.csv into a DataFrame.
data = pd.read_csv('dataset/train.csv')

In [9]:
# Load the test set from dataset/test.csv into a DataFrame.
test = pd.read_csv('dataset/test.csv')

In [10]:
# The test set only covers the ten most recent galactic years, one of which
# (1016064) never appears in the training data.
unique_years = data['galactic year'].unique()
print(unique_years, unique_years.shape, sep='\n')

print(np.sort(test['galactic year'].unique()),
      test['galactic year'].unique().shape, sep='\n')

# Sorted galaxy names found in the training data.
unique_names = np.sort(data['galaxy'].unique())
print(unique_names[:5], unique_names.shape, sep='\n')

[ 990025  991020  992016  993012  994009  995006  996004  997002  998001
  999000 1000000 1001000 1002001 1003002 1004004 1005006 1006009 1007012
 1008016 1009020 1010025 1011030 1012036 1013042 1014049 1015056]
(26,)
[1007012 1008016 1009020 1010025 1011030 1012036 1013042 1014049 1015056
 1016064]
(10,)
['Andromeda Galaxy (M31)' 'Andromeda I' 'Andromeda II' 'Andromeda III'
 'Andromeda IX']
(181,)


In [11]:
# Re-index the galactic years as consecutive integers (1, 2, 3, ...) and give
# every galaxy its own integer key.

# Lookup table: galactic year -> consecutive index. Year 1016064 is appended
# by hand because it does not occur in the training data.
di = dict(zip(np.append(unique_years, 1016064),
              range(1, len(unique_years) + 2)))
# Lookup table: galaxy name -> integer key (1-based position in unique_names).
di_names = dict(zip(unique_names, range(1, len(unique_names) + 1)))

# Apply both lookups in place to the training and the test frame.
data.replace(to_replace={'galactic year': di}, inplace=True)
test.replace(to_replace={'galactic year': di}, inplace=True)

data.replace(to_replace={'galaxy': di_names}, inplace=True)
test.replace(to_replace={'galaxy': di_names}, inplace=True)

In [74]:
from sklearn.model_selection import cross_val_score as CVS
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as MSE
from sklearn.compose import TransformedTargetRegressor as TransTargReg
from sklearn.impute import SimpleImputer

# Feature matrix and target vector: X is every column from the third one up
# to (but excluding) the last; y is the last column.
# Missing feature values are replaced by zeros, as stated in the notebook
# header -- no other visible cell performs that step.
# NOTE(review): confirm that zero-filling is meant to happen before scaling.
X = data.iloc[:, 2:-1].fillna(0).values
y = data.iloc[:, -1].values

# Target transformer mapping y onto a normal distribution. It is reached
# through the already imported `preprocessing` module: the bare name
# `QuantileTransformer` is never imported, so using it raises NameError on a
# fresh kernel.
qt = preprocessing.QuantileTransformer(n_quantiles=1000,
                                       output_distribution='normal')

# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

# Random 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=425)

In [104]:
# Baseline model: random forest regressor.
from sklearn.ensemble import RandomForestRegressor

# Only the non-default hyperparameters are spelled out. Arguments that merely
# restated library defaults are omitted; among them were criterion='mse' and
# min_impurity_split, which current scikit-learn releases no longer accept.
# random_state is fixed so that the scores printed below are reproducible.
forest = RandomForestRegressor(n_estimators=2000,
                               min_samples_leaf=2,
                               max_features=26,
                               n_jobs=3,
                               random_state=42)
forest.fit(X_train, y_train)

# Root-mean-squared error on the training split, then on the held-out split.
print(MSE(y_train, forest.predict(X_train))**0.5)
print(MSE(y_test, forest.predict(X_test))**0.5)

0.012996969418034781
0.027789754076831437


In [76]:
#print(forest.feature_importances_)

In [99]:
import xgboost as xgb

# XGBoost regressor trained on a quantile-transformed target: the wrapper
# applies `qt` to y before fitting and inverts it on predictions.
# `transformer` is passed by keyword because recent scikit-learn releases make
# it keyword-only. The deprecated XGBoost aliases silent / nthread / seed and
# missing=None (all previously set to None) are omitted; verbosity, n_jobs and
# random_state carry those settings.
XGB = TransTargReg(
    regressor=xgb.XGBRegressor(
        max_depth=6,
        learning_rate=0.01,
        n_estimators=2000,
        verbosity=1,
        objective='reg:squarederror',
        booster='gbtree',
        n_jobs=3,
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=0.5,
        colsample_bytree=1,
        colsample_bylevel=1,
        colsample_bynode=0.3,
        reg_alpha=1,
        reg_lambda=0,
        scale_pos_weight=1,
        base_score=0.5,
        random_state=42,
        importance_type='gain'),
    transformer=qt)
XGB.fit(X_train, y_train)

# Root-mean-squared error on the training split, then on the held-out split.
print(MSE(y_train, XGB.predict(X_train))**0.5)
print(MSE(y_test, XGB.predict(X_test))**0.5)

0.012228787669184718
0.027486190084097663


In [91]:
# Preview five evenly spaced values from 0.2 to 1; the same expression
# supplies the 'regressor__subsample' candidates in the grid search.
np.linspace(0.2, 1, 5)

array([0.2, 0.4, 0.6, 0.8, 1. ])

In [92]:
# Exhaustive grid search over XGBoost hyperparameters. The 'regressor__'
# prefix addresses the XGBRegressor nested inside the TransformedTargetRegressor.
# Grids for n_estimators ([200, 400, 800, 1200]) and colsample_bynode
# ([0.2, 0.3, 0.4]) are currently left out of the search.
param_grid = dict(
    regressor__max_depth=list(range(4, 10)),
    regressor__learning_rate=[0.1, 0.01, 0.001],
    regressor__subsample=[*np.linspace(0.2, 1, 5)],
    regressor__reg_alpha=[0, 1],
    regressor__reg_lambda=[0, 1],
)

# 10-fold cross-validation scored by negative RMSE.
XGB_GSCV = model_selection.GridSearchCV(
    estimator=XGB,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=None,
)
XGB_GSCV.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=TransformedTargetRegressor(check_inverse=True, func=None,
                                                  inverse_func=None,
                                                  regressor=XGBRegressor(base_score=0.5,
                                                                         booster='gbtree',
                                                                         colsample_bylevel=1,
                                                                         colsample_bynode=0.3,
                                                                         colsample_bytree=1,
                                                                         gamma=0,
                                                                         importance_type='gain',
                                                                         learning_rate=0.01,
                                                                         max_de

In [97]:
# Show the hyperparameter combination selected by the grid search.
XGB_GSCV.best_params_

{'regressor__learning_rate': 0.01,
 'regressor__max_depth': 6,
 'regressor__reg_alpha': 0,
 'regressor__reg_lambda': 0,
 'regressor__subsample': 1.0}

In [96]:
# Root-mean-squared error of the grid search's best estimator on the
# held-out split (X_test, y_test).
MSE(y_test, XGB_GSCV.best_estimator_.predict(X_test))**0.5

0.027827222929303935