In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble

In [3]:
path = '/Users/xuweikang/Desktop/data/kaggle/Santander Value Prediction Challenge/'

In [7]:
train = pd.read_csv(path + 'train.csv')
test  = pd.read_csv(path + 'test.csv')

In [4]:
subm = pd.read_csv(path + 'sample_submission.csv')

In [5]:
train.shapepe

(4459, 4993)

In [6]:
test.shape

(49342, 4992)

In [29]:
col = [c for c in train.columns if c not in ['ID','target']]

In [12]:
sc1 = preprocessing.StandardScaler()

In [13]:
def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(pred), 2)))

In [15]:
x1, x2, y1, y2 = model_selection.train_test_split(train[col], train.target.values, test_size=0.10, random_state=5)


In [18]:
model = ensemble.RandomForestRegressor(n_jobs = -1, random_state = 7)

In [19]:
model.fit(sc1.fit_transform(x1), y1)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=7, verbose=0, warm_start=False)

In [20]:
print(rmsle(y2, model.predict(sc1.transform(x2))))

1.769351134919298


In [25]:
model.feature_importances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       3.47945373e-08, 3.06281278e-08, 6.12971842e-05])

In [36]:
col_ = pd.DataFrame({'importance':model.feature_importances_ , 'feature':col})
col_ = col_.sort_values(by=['importance'], ascending=[False])
col_

Unnamed: 0,feature,importance
1530,b43a7cfd5,0.043665
4358,f190486d6,0.042918
3660,eeb9cd3aa,0.031903
3513,cbbc9c431,0.023110
2500,58e2e02e6,0.020967
118,963a49cdc,0.014024
562,26fc93eb7,0.011565
4021,d7d314edc,0.011202
4581,1702b5bf0,0.008940
1057,1c71183bb,0.008323


In [37]:
col_600 = col_[:600]['feature'].values

In [38]:
col_600

array(['b43a7cfd5', 'f190486d6', 'eeb9cd3aa', 'cbbc9c431', '58e2e02e6',
       '963a49cdc', '26fc93eb7', 'd7d314edc', '1702b5bf0', '1c71183bb',
       '00f844fea', '9fd594eec', '15ace8c9f', '58232a6fb', '823ac378c',
       'bf6e38e39', '4edc3388d', '66ace2992', '20aa07010', '1931ccfdd',
       'bb1113dbb', '2599a7eb7', 'fe919be32', '1af4d24fa', '884ec1cca',
       '06f6a7287', '9a5cd5171', '491b9ee45', '91f701ba2', '024c577b9',
       'c5a231d81', 'b7c931383', '277ef93fc', '0f2b86f4a', '13bdd610a',
       '4ecc3f505', '36a131c2c', '5a1589f1a', 'd6bb78916', '041c5d0c9',
       '6eef030c1', '8e4d0fe45', '0824edecb', '2d6bd8275', '205b0cfef',
       'e6c050854', 'd4c1de0e2', '5f6ea2fa9', '3e37bffde', 'd3022e2f1',
       'c1ad8b95a', '16b532cdc', 'c10f31664', 'f74e8f13d', 'c2dae3a5a',
       '2ec5b290f', '1db387535', '62e59a501', 'ba4ceabc5', '58e056e12',
       'ac30af84a', '6d0d72180', 'e13b0c0aa', '190db8488', '70feb1494',
       'bee629024', 'cc0045289', 'a029667de', '453128993', 'c976

In [44]:
train = train[['ID', 'target']+list(col_600)]
test = test[['ID']+list(col_600)]

In [45]:
train.shape

(4459, 602)

In [46]:
PERC_TRESHOLD = 0.98   ### Percentage of zeros in each feature ###
N_COMP = 20            ### Number of decomposition components ###

In [47]:
target = np.log1p(train['target']).values

In [48]:
cols_to_drop = [col for col in train.columns[2:]
                    if [i[1] for i in list(train[col].value_counts().items()) 
                    if i[0] == 0][0] >= train.shape[0] * PERC_TRESHOLD]

In [49]:
exclude_other = ['ID', 'target']
train_features = []
for c in train.columns:
    if c not in cols_to_drop \
    and c not in exclude_other:
        train_features.append(c)

In [51]:
train, test = train[train_features], test[train_features]

In [52]:
print("PCA")
pca = PCA(n_components=N_COMP, random_state=17)
pca_results_train = pca.fit_transform(train)
pca_results_test = pca.transform(test)

print("tSVD")
tsvd = TruncatedSVD(n_components=N_COMP, random_state=17)
tsvd_results_train = tsvd.fit_transform(train)
tsvd_results_test = tsvd.transform(test)

print("ICA")
ica = FastICA(n_components=N_COMP, random_state=17)
ica_results_train = ica.fit_transform(train)
ica_results_test = ica.transform(test)

print("GRP")
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=17)
grp_results_train = grp.fit_transform(train)
grp_results_test = grp.transform(test)

print("SRP")
srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=17)
srp_results_train = srp.fit_transform(train)
srp_results_test = srp.transform(test)

PCA
tSVD
ICA




GRP
SRP


In [53]:
for i in range(1, N_COMP + 1):
    train['pca_' + str(i)] = pca_results_train[:, i - 1]
    test['pca_' + str(i)] = pca_results_test[:, i - 1]

    train['ica_' + str(i)] = ica_results_train[:, i - 1]
    test['ica_' + str(i)] = ica_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

In [54]:
train.shape

(4459, 599)

In [55]:
train.columns

Index(['b43a7cfd5', 'f190486d6', 'eeb9cd3aa', 'cbbc9c431', '58e2e02e6',
       '963a49cdc', '26fc93eb7', 'd7d314edc', '1702b5bf0', '1c71183bb',
       ...
       'pca_19', 'ica_19', 'tsvd_19', 'grp_19', 'srp_19', 'pca_20', 'ica_20',
       'tsvd_20', 'grp_20', 'srp_20'],
      dtype='object', length=599)

# lightgbm

In [12]:
import gc
import lightgbm as lgb

In [4]:
id_col = 'ID'
target_var = 'target'

In [5]:
results = pd.DataFrame(columns = ["Rounds","Score","STDV", "LB", "Parameters"])

In [6]:
training = pd.read_csv(path + 'train.csv', index_col=id_col)
test = pd.read_csv(path + 'test.csv', index_col=id_col)

In [7]:
trainindex = training.index
testindex = test.index

In [8]:
y = np.log1p(training[target_var])

In [9]:
training.drop(target_var, axis=1, inplace=True)

In [10]:
df = pd.concat([training, test], axis=0)

In [65]:
del training,test

In [16]:
print('all data shape', df.shape)

all data shape (53801, 4991)


In [13]:
lgtrain = lgb.Dataset(training, y, feature_name='auto')

In [14]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 200,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    #"min_split_gain":0.2,
    "min_child_weight":10,
    'zero_as_missing':True
}

In [15]:
lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=2500,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=75)

[50]	cv_agg's rmse: 1.58778 + 0.0291119
[100]	cv_agg's rmse: 1.50476 + 0.0302859
[150]	cv_agg's rmse: 1.46314 + 0.0310064
[200]	cv_agg's rmse: 1.44235 + 0.0327227
[250]	cv_agg's rmse: 1.4327 + 0.0340882
[300]	cv_agg's rmse: 1.4287 + 0.0337906
[350]	cv_agg's rmse: 1.42752 + 0.0340438
[400]	cv_agg's rmse: 1.42737 + 0.034524


In [16]:
best_cv_score = min(lgb_cv['rmse-mean'])
best_cv_score

1.4270237364585143

In [17]:
optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
optimal_rounds

364

In [25]:
lgb_cv['rmse-mean'][364]

1.4270237364585143

In [18]:
results = results.append({"Rounds": optimal_rounds,
                          "Score": best_cv_score,
                          "STDV": lgb_cv['rmse-stdv'][optimal_rounds],
                          "LB": None,
                          "Parameters": lgbm_params}, ignore_index=True)

In [19]:
results

Unnamed: 0,Rounds,Score,STDV,LB,Parameters
0,364,1.427024,0.034136,,"{'task': 'train', 'boosting_type': 'gbdt', 'ob..."


In [20]:
learning_rates = [0.012, 0.008, 0.016]
for learning_rate in learning_rates:
    lgbm_params['learning_rate'] = learning_rate
    lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=10000,
    stratified=False,
    nfold = 5,
    verbose_eval=200,
    seed = 23,
    early_stopping_rounds=75)
    
    optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
    best_cv_score = min(lgb_cv['rmse-mean'])
    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
    optimal_rounds,best_cv_score,lgb_cv['rmse-stdv'][optimal_rounds]))

    results = results.append({"Rounds": optimal_rounds,
                              "Score": best_cv_score,
                              "STDV": lgb_cv['rmse-stdv'][optimal_rounds],
                              "LB": None,
                              "Parameters": lgbm_params}, ignore_index=True)

[200]	cv_agg's rmse: 1.4334 + 0.0332999
Optimal Round: 284
Optimal Score: 1.4270685508840888 + 0.03345801084845614
[200]	cv_agg's rmse: 1.45709 + 0.0317034
[400]	cv_agg's rmse: 1.42695 + 0.0341139
Optimal Round: 444
Optimal Score: 1.4262893531048793 + 0.03482910483946859
[200]	cv_agg's rmse: 1.42833 + 0.0348164
Optimal Round: 224
Optimal Score: 1.42726220505991 + 0.03560193963576617


In [21]:
final_model_params = results.iloc[results['Score'].idxmin(),:]['Parameters']
optimal_rounds = results.iloc[results['Score'].idxmin(),:]['Rounds']

In [22]:
muti_seed_pred = dict()
all_feature_importance_df = pd.DataFrame()

In [23]:
all_seed = [27,22,300,401,7]
for seed in all_seed:
    final_model_params['seed'] = seed
    lgb_reg = lgb.train(
    final_model_params,
    lgtrain,
    num_boost_round = optimal_rounds + 1,
    verbose_eval=200)
        
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = training.columns
    fold_importance_df['importance'] = lgb_reg.feature_importance()
    all_feature_importance_df = pd.concat([all_feature_importance_df,fold_importance_df], axis=0)
    
    muti_seed_pred[seed] = list(lgb_reg.predict(test))
    

In [26]:
muti_seed_pred.keys()

dict_keys([27, 22, 300, 401, 7])

In [27]:
sub_preds = pd.DataFrame.from_dict(muti_seed_pred).replace(0,0.000001)

In [28]:
sub_preds.head(2)

Unnamed: 0,7,22,27,300,401
0,14.142321,14.366203,14.412266,14.319866,14.367344
1,14.51223,14.566528,14.497126,14.546333,14.555865


In [29]:
lgb_ans = np.expm1(sub_preds.mean(axis=1))

In [30]:
mean_sub = np.expm1(sub_preds.mean(axis=1).rename(target_var))

### CatBoost  + decomposition features

In [4]:
from catboost import CatBoostRegressor

In [5]:
train = pd.read_csv(path + 'train.csv')
test  = pd.read_csv(path + 'test.csv')

In [13]:
col = [c for c in train.columns if c not in ['ID','target']]

In [7]:
scl = preprocessing.StandardScaler()

In [14]:
x1,x2,y1,y2 = model_selection.train_test_split(train[col], train['target'], test_size=0.1, random_state=5)

In [15]:
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)

In [17]:
model.fit(scl.fit_transform(x1), y1)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=7, verbose=0, warm_start=False)

In [19]:
rnd_feature_importance = pd.DataFrame({'importance':model.feature_importances_, 'feature':col})

In [21]:
rnd_feature_importance = rnd_feature_importance.sort_values(by=['importance'], ascending=[False])

In [23]:
rnd_feature_importance.head(3)

Unnamed: 0,feature,importance
1530,b43a7cfd5,0.043665
4358,f190486d6,0.042918
3660,eeb9cd3aa,0.031903


In [24]:
col_imporatnce = rnd_feature_importance[:600]['feature'].values

In [41]:
target = np.log1p(train['target']).values

In [28]:
train = train[['ID','target']+ list(col_imporatnce)]
test = test[['ID']+ list(col_imporatnce)]

In [31]:
cols_to_drop = [col for col in train.columns[2:]
                    if [i[1] for i in list(train[col].value_counts().items()) 
                    if i[0] == 0][0] >= train.shape[0] * PERC_TRESHOLD]

print("Define training features...")
exclude_other = ['ID', 'target']
train_features = []
for c in train.columns:
    if c not in cols_to_drop \
    and c not in exclude_other:
        train_features.append(c)

Define training features...


In [47]:
train, test = train[train_features], test[train_features]

In [48]:
PERC_TRESHOLD = 0.98   ### Percentage of zeros in each feature ###
N_COMP = 20            ### Number of decomposition components ###

print("PCA")
pca = PCA(n_components=N_COMP, random_state=17)
pca_results_train = pca.fit_transform(train)
pca_results_test = pca.transform(test)

print("tSVD")
tsvd = TruncatedSVD(n_components=N_COMP, random_state=17)
tsvd_results_train = tsvd.fit_transform(train)
tsvd_results_test = tsvd.transform(test)

print("ICA")
ica = FastICA(n_components=N_COMP, random_state=17)
ica_results_train = ica.fit_transform(train)
ica_results_test = ica.transform(test)

print("GRP")
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=17)
grp_results_train = grp.fit_transform(train)
grp_results_test = grp.transform(test)

print("SRP")
srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=17)
srp_results_train = srp.fit_transform(train)
srp_results_test = srp.transform(test)

PCA
tSVD
ICA
GRP
SRP


In [49]:
for i in range(1, N_COMP+1):
    train['pac_'+ str(i)] = pca_results_train[:,i-1]
    test['pca_' + str(i)] = pca_results_test[:, i - 1]

    train['ica_' + str(i)] = ica_results_train[:, i - 1]
    test['ica_' + str(i)] = ica_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

In [37]:
print('Train shape: {} \n Test shape {}'.format(train.shape, test.shape))

Train shape: (4459, 599) 
 Test shape (49342, 599)


In [38]:
folds = KFold(n_splits=5, shuffle=True, random_state=546789)
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

In [50]:
for n_fold ,(trn_idx,val_idx) in enumerate(folds.split(train)):
    trn_x,trn_y = train.ix[trn_idx],target[trn_idx]
    val_x,val_y = train.ix[val_idx], target[val_idx]
    cb_model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=4, l2_leaf_reg=20, bootstrap_type='Bernoulli', subsample=0.6, eval_metric='RMSE', metric_period=50, od_type='Iter', od_wait=45, random_seed=17, allow_writing_files=False)
    cb_model.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[], use_best_model=True, verbose=True)
    
    oof_preds[val_idx] = cb_model.predict(val_x)
    sub_preds += cb_model.predict(test) / folds.n_splits

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


0:	learn: 13.1850612	test: 13.2297734	best: 13.2297734 (0)	total: 88.2ms	remaining: 1m 28s
50:	learn: 1.5889104	test: 1.6843622	best: 1.6843622 (50)	total: 1.4s	remaining: 26s
100:	learn: 1.4176635	test: 1.5735296	best: 1.5735296 (100)	total: 2.72s	remaining: 24.2s
150:	learn: 1.3325505	test: 1.5379366	best: 1.5379366 (150)	total: 4.08s	remaining: 23s
200:	learn: 1.2830573	test: 1.5274097	best: 1.5272569 (199)	total: 5.49s	remaining: 21.8s
250:	learn: 1.2419866	test: 1.5281683	best: 1.5257328 (237)	total: 6.87s	remaining: 20.5s
Stopped by overfitting detector  (45 iterations wait)

bestTest = 1.525732813
bestIteration = 237

Shrink model to first 238 iterations.
0:	learn: 13.2384607	test: 13.1221075	best: 13.1221075 (0)	total: 28.9ms	remaining: 28.9s
50:	learn: 1.6216366	test: 1.6337336	best: 1.6337336 (50)	total: 1.49s	remaining: 27.7s
100:	learn: 1.4568420	test: 1.4794760	best: 1.4794760 (100)	total: 2.89s	remaining: 25.7s
150:	learn: 1.3773068	test: 1.4260025	best: 1.4260025 (150)	t

In [51]:
cb_ans = np.expm1(sub_preds)