In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [3]:
# Numeric ids of all classes, in the column order used throughout the notebook.
_class_ids = (6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99)

# Categorical dtype for the target column: labels are 'class_' + zero-padded id.
target_type = CategoricalDtype(['class_%02d' % class_id for class_id in _class_ids])

# Metric weight per class, aligned with target_type.categories:
# classes 15, 64 and 99 weigh 2, every other class weighs 1.
class_weights = np.array([2 if class_id in (15, 64, 99) else 1 for class_id in _class_ids])

In [4]:
# Training metadata, one row per object, indexed by object_id.
# `target` is read as raw text so it can be turned into a zero-padded label
# ('6' -> 'class_06'). The builtin `object` is used because the `np.object`
# alias no longer exists in current NumPy releases.
df_train_meta = pd.read_csv('training_set_metadata.csv', sep=',', header=0, index_col='object_id',
                            dtype={'target': object})
df_train_meta['target'] = ('class_' + df_train_meta['target'].str.zfill(2)).astype(target_type)

# distmod: dropped, the column carries no additional information.
# hostgal_specz: dropped for this simplest model, the column is not filled in for the test set.
df_train_meta = df_train_meta.drop(columns=['distmod', 'hostgal_specz'])
print(df_train_meta.shape)
df_train_meta.head()

(7848, 9)


Unnamed: 0_level_0,ra,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,mwebv,target
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.017,class_92
713,53.085938,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,0.007,class_88
730,33.574219,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,0.021,class_42
745,0.189873,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,0.007,class_90
1124,352.711273,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,0.024,class_90


In [5]:
# Test metadata, indexed by object_id. The same two columns are removed as for the
# training metadata:
#   distmod       - carries no additional information
#   hostgal_specz - not filled in for the test set, so unused in this simplest model
df_test_meta = (
    pd.read_csv('test_set_metadata.csv', sep=',', header=0, index_col='object_id')
    .drop(['distmod', 'hostgal_specz'], axis=1)
)
print(df_test_meta.shape)
df_test_meta.head()

  mask |= (ar1 == a)


(3492890, 8)


Unnamed: 0_level_0,ra,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,mwebv
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13,34.453125,-5.229529,169.987075,-59.956185,1,0.3193,0.0542,0.019
14,33.398438,-4.331149,167.226341,-59.936551,1,0.6323,0.0179,0.018
17,348.529419,-61.75544,321.29398,-51.763351,1,0.8297,0.0605,0.016
23,34.804688,-5.829153,171.307861,-60.174401,1,0.6533,0.1479,0.023
34,351.321442,-64.198746,317.458993,-50.429931,1,0.4617,0.0122,0.023


In [6]:
# Split features from labels by column name rather than by position, so the split
# stays correct even if `target` is not the last column of the metadata.
X = df_train_meta.drop(columns=['target'])
y = df_train_meta['target']
print(X.shape, y.shape)

(7848, 8) (7848,)


In [7]:
def wmcll_error(y_true, pred_matrix, weights=None):
    """Weighted multi-class logarithmic loss.

    For each class, the log-probabilities assigned to the true class are summed over
    the objects of that class and divided by the class size. The per-class values are
    multiplied by `weights`, summed, negated and divided by the sum of all weights.
    A class with no objects in `y_true` yields a 0/0 term that is skipped in the
    numerator, while its weight still counts in the denominator.

    Parameters
    ----------
    y_true : pandas.Series
        True labels. Its one-hot encoding must have one column per weight, e.g. a
        categorical Series whose dtype lists every class.
    pred_matrix : numpy.ndarray or pandas.DataFrame
        Predicted probabilities, one row per object and one column per class. If it
        has exactly one column fewer than there are weights, a zero column is
        appended for the last class (class_99 with the default weights).
    weights : array-like, optional
        One weight per class. Defaults to the module-level `class_weights`.

    Returns
    -------
    float
        The loss value; lower is better.
    """
    if weights is None:
        weights = class_weights
    weights = np.asarray(weights)

    # A model that never saw the last class returns one column fewer: pad it with zeros.
    if pred_matrix.shape[1] == len(weights) - 1:
        pred_matrix = np.hstack((pred_matrix, np.zeros((len(y_true), 1))))

    # Clip probabilities away from 0 and 1 so that the logarithm stays finite.
    eps = 1e-15
    pred_matrix = np.maximum(np.minimum(pred_matrix, 1 - eps), eps)

    one_hot = pd.get_dummies(y_true)
    class_sizes = one_hot.sum(axis=0)  # number of objects per class (0 for absent classes)
    log_prob_sums = (one_hot * np.log(pred_matrix)).sum(axis=0)

    # nansum drops the 0/0 terms produced by classes that do not occur in y_true.
    return -np.nansum(weights * log_prob_sums / class_sizes) / weights.sum()

# Scorer for model selection: it is fed class probabilities (needs_proba=True) and is
# declared as a loss (greater_is_better=False), so the scores it reports are negated.
wmcll_score = make_scorer(
    wmcll_error,
    needs_proba=True,
    greater_is_better=False,
)

In [8]:
# Candidate estimators; pass a different one as `estimator` below to evaluate it.
xgb = XGBClassifier()
rf = RandomForestClassifier()
lgbm = LGBMClassifier()
lr = LogisticRegression(solver='liblinear', multi_class='ovr')

# Grid for the tree ensembles. For `lr` use {'C': [0.1, 1, 10, 100, 1000, 10000]} instead.
param_grid = {'n_estimators': [10, 20, 30, 50, 100, 200]}

# 10-fold stratified cross-validation, shuffled with a fixed seed.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

clf = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=wmcll_score,
    cv=folds,
    n_jobs=-1,
)
clf.fit(X, y)
clf.cv_results_['mean_test_score']

array([-2.02379547, -1.95769943, -1.94093082, -1.94497694, -1.97168625,
       -2.0395069 ])

In [None]:
# Recorded grid-search scores (negated loss, so closer to 0 is better), one list per
# estimator; each value corresponds to the grid value at the same position.
# The xgb list matches the output of the grid-search cell above; the other lists were
# presumably obtained by re-running that cell with another estimator - TODO confirm.

# xgb, n_estimators in [10, 20, 30, 50, 100, 200]
[-2.02379547, -1.95769943, -1.94093082, -1.94497694, -1.97168625, -2.0395069]

# lgbm, n_estimators in [10, 20, 30, 50, 100, 200]
[-2.33101753, -2.23863662, -2.23023125, -2.28075719, -2.5024662 , -3.00824671]

# rf, n_estimators in [10, 20, 30, 50, 100, 200]
[-11.96981828,  -8.9723555 ,  -7.39487755,  -6.09076338, -4.88385825,  -4.04691795]

# lr, C in [0.1, 1, 10, 100, 1000, 10000]
[-2.57042062, -2.40939692, -2.3454054 , -2.32283753, -2.31886785, -2.31898457]

In [9]:
# Final model, fitted on the whole training set: XGBoost with n_estimators=30, the
# value with the best recorded grid-search score for xgb.
# Alternative tried earlier: RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)
clf = XGBClassifier(n_estimators=30).fit(X, y)
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=30,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [10]:
# Feature importances of the fitted model, indexed by feature name.
feature_imp = (
    pd.DataFrame(list(zip(X.columns, clf.feature_importances_)),
                 columns=['col_name', 'importance'])
    .set_index('col_name')
)
# Most important features first.
feature_imp.sort_values('importance', ascending=False)

Unnamed: 0_level_0,importance
col_name,Unnamed: 1_level_1
hostgal_photoz,0.435976
hostgal_photoz_err,0.158537
mwebv,0.110772
gal_b,0.079268
gal_l,0.065549
ddf,0.053354
decl,0.04878
ra,0.047764


In [14]:
%%time
X_test = df_test_meta
prediction_proba = clf.predict_proba(X_test)
prediction = clf.predict(X_test)

CPU times: user 1min 29s, sys: 808 ms, total: 1min 29s
Wall time: 1min 29s


  if diff:


In [38]:
# Submission table: one row per test object, one probability column per class the
# model predicts (14 columns), stored as float16.
submission = pd.DataFrame(
    prediction_proba,
    index=df_test_meta.index,
    columns=['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95'],
    dtype=np.float16,
)
# The model has no output for class_99, so that column is filled with zeros.
submission['class_99'] = 0.0
submission.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.012054,0.014725,0.012054,0.102112,0.044495,0.012054,0.067383,0.0121,0.012054,0.049957,0.035339,0.601074,0.012054,0.012543,0.0
14,0.012314,0.015038,0.012314,0.104248,0.017456,0.012314,0.031128,0.012352,0.012314,0.017822,0.127563,0.564453,0.012314,0.048187,0.0
17,0.010277,0.01255,0.010277,0.085938,0.014038,0.010277,0.025528,0.010315,0.010277,0.022278,0.252441,0.438721,0.010277,0.086914,0.0
23,0.012199,0.014893,0.012199,0.110107,0.017609,0.012199,0.040314,0.012238,0.012199,0.033722,0.12323,0.55957,0.012199,0.027573,0.0
34,0.012772,0.015602,0.012772,0.102051,0.027283,0.012772,0.031738,0.012817,0.012772,0.01915,0.049927,0.662109,0.012772,0.015656,0.0


In [274]:
# Write the submission; object_id (the index) becomes the first CSV column, which is
# the to_csv default.
# NOTE(review): the file name says RF1000, while the model fitted in this notebook is
# XGBClassifier(n_estimators=30) - confirm the name matches the model behind `submission`.
submission.to_csv('submit_RF1000_meta.csv')

In [74]:
# Variant of the submission that reserves probability 1/9 for class_99: all model
# probabilities are scaled by 8/9, so a row that summed to 1 still sums to 1.
submission_adj = submission * 8 / 9
# Float literal, as in the other cells, so the value is never truncated by integer division.
submission_adj['class_99'] = 1.0 / 9
submission_adj.to_csv('submit_XGB30_meta_class99_prop.csv', index=True, float_format='%.5f')
# Preview goes last: only the final expression of a cell is displayed.
submission_adj.head()

## Test submissions

### Equal probabilities

In [290]:
# Baseline: the same probability, 1/15, for each of the 15 classes of every test object.
eq_matrix = np.full((len(df_test_meta), 15), 1.0 / 15)
submission_equal = pd.DataFrame(
    eq_matrix,
    index=df_test_meta.index,
    columns=['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95',
             'class_99'],
    dtype=np.float16,
)
submission_equal.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665
14,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665
17,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665
23,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665
34,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665,0.06665


In [198]:
# Write the equal-probability baseline; the object_id index is written by default.
submission_equal.to_csv('submit_eq.csv')

In [27]:
# Loss of the equal-probability baseline on the training set.
eq_matrix_train = np.full((len(df_train_meta), 15), 1.0 / 15)
wmcll_error(y, eq_matrix_train)

2.4071557343130663

### All observations go to class 15

In [194]:
# Baseline: every test object gets probability 1 for class_15 and 0 for all other classes.
class15_matrix = np.zeros((len(df_test_meta), 15))
class15_matrix[:, 1] = 1.0  # column 1 is class_15
submission_class15 = pd.DataFrame(
    class15_matrix,
    index=df_test_meta.index,
    columns=['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95',
             'class_99'],
    dtype=np.float16,
)
submission_class15.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [195]:
# Write the all-class_15 baseline; the object_id index is written by default.
submission_class15.to_csv('submit_class15.csv')

In [28]:
# Loss of the all-class_15 baseline on the training set.
class15_matrix_train = np.zeros((len(df_train_meta), 15))
class15_matrix_train[:, 1] = 1.0  # column 1 is class_15
wmcll_error(y, class15_matrix_train)

26.86349275159735

### Proportional to class weights

In [65]:
# Baseline: class probabilities proportional to the metric weights, identical for every
# test object.
weights_norm = class_weights / float(class_weights.sum())
prop_matrix = np.tile(weights_norm, (len(df_test_meta), 1))  # one copy of the row per object
submission_prop = pd.DataFrame(
    prop_matrix,
    index=df_test_meta.index,
    columns=['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95',
             'class_99'],
    dtype=np.float16,
)
submission_prop.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084
14,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084
17,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084
23,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084
34,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084,0.055542,0.055542,0.055542,0.055542,0.055542,0.055542,0.111084


In [292]:
# Write the weight-proportional baseline; the object_id index is written by default.
submission_prop.to_csv('submit_prop.csv')

In [72]:
# Loss of the weight-proportional baseline on the training set.
prop_matrix_train = np.tile(weights_norm, (len(df_train_meta), 1))
wmcll_error(y, prop_matrix_train)

2.415186633561041

### Naive Benchmark (Galactic vs Extragalactic)

In [14]:
# Classes observed in the training set among objects with hostgal_photoz == 0
# ("galactic") and among objects with hostgal_photoz != 0 ("extragalactic").
gal_classes = df_train_meta.loc[df_train_meta['hostgal_photoz'] == 0, 'target'].unique().tolist()
extragal_classes = df_train_meta.loc[df_train_meta['hostgal_photoz'] != 0, 'target'].unique().tolist()
# class_99 is treated as possible in both groups.
gal_classes_ext = gal_classes + ['class_99']
extragal_classes_ext = extragal_classes + ['class_99']

submission_naive = pd.DataFrame(
    np.zeros((len(df_test_meta), 15)),
    index=df_test_meta.index,
    columns=['class_06', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95',
             'class_99'],
    dtype=np.float32,
)

# Each test object spreads its probability evenly over the classes of its own group;
# classes outside the group keep probability 0.
test_galactic = df_test_meta['hostgal_photoz'] == 0
for group_mask, group_classes in ((test_galactic, gal_classes_ext),
                                  (~test_galactic, extragal_classes_ext)):
    for col in group_classes:
        submission_naive.loc[group_mask, col] = 1.0/len(group_classes)

# This file uses 'class_6' as the header of the first class column, like the other submissions.
submission_naive = submission_naive.rename(columns={'class_06': 'class_6'})
submission_naive.to_csv('submit_naive_benchmark.csv', index=True, float_format='%.5f')
submission_naive.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.0,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.1,0.0,0.1,0.1
14,0.0,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.1,0.0,0.1,0.1
17,0.0,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.1,0.0,0.1,0.1
23,0.0,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.1,0.0,0.1,0.1
34,0.0,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.1,0.1,0.1,0.0,0.1,0.1


In [25]:
# Loss of the naive galactic / extragalactic benchmark on the training set.
train_naive = pd.DataFrame(
    np.zeros((len(df_train_meta), 15)),
    index=df_train_meta.index,
    columns=['class_06', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53', 'class_62',
             'class_64', 'class_65', 'class_67', 'class_88', 'class_90', 'class_92', 'class_95',
             'class_99'],
    dtype=np.float32,
)

# Same rule as for the test set: an even split over the classes of the object's group.
train_galactic = df_train_meta['hostgal_photoz'] == 0
for group_mask, group_classes in ((train_galactic, gal_classes_ext),
                                  (~train_galactic, extragal_classes_ext)):
    for col in group_classes:
        train_naive.loc[group_mask, col] = 1.0/len(group_classes)

wmcll_error(y, train_naive)

1.904847085655666