In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

# read datasets
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

train_idx = train.ID.tolist()
test_idx = test.ID.tolist()

y_train = train["y"]
y_mean = np.mean(y_train)

full_df = pd.concat([train.drop('y', axis=1), test])
full_df = full_df.set_index('ID')

# process columns, apply LabelEncoder to categorical features
for c in full_df.columns:
    if full_df[c].dtype == 'object':
        full_df = pd.concat([full_df, pd.get_dummies(full_df[c], prefix=c)], axis=1)
        full_df.drop(c, axis=1, inplace=True)

# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

##Add decomposed components: PCA / ICA etc.
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
tsvd_results_full = tsvd.fit_transform(full_df)

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_full = pca.fit_transform(full_df)

# ICA
ica = FastICA(n_components=n_comp, random_state=42, max_iter=1000, tol=0.01)
ica2_results_full = ica.fit_transform(full_df)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    full_df['pca_' + str(i)] = pca2_results_full[:, i-1]
    full_df['ica_' + str(i)] = ica2_results_full[:, i-1]


Shape train: (4209, 378)
Shape test: (4209, 377)


In [75]:
from sklearn import decomposition
import pylab as pl
from sklearn.neighbors import DistanceMetric
from sklearn.cluster import DBSCAN
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler

#cluster3d(full_df, {'n_components': 5, 'eps': 0.500000, 'min_samples': 22, 'n_jobs': -1}, 'euclidean')
best_params = {'n_components': 11, 'eps': 0.750000, 'min_samples': 2, 'n_jobs': -1}

pca = decomposition.PCA(n_components=best_params['n_components'])
full_df_reduced = pca.fit_transform(full_df)
full_df_reduced = StandardScaler().fit_transform(full_df_reduced)
full_df_reduced = pd.DataFrame(full_df_reduced, index = full_df.index.values)

clusters = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'], n_jobs=best_params['n_jobs']).fit_predict(full_df_reduced)
clusters_labels = pd.unique(pd.Series(clusters))
clusters_df = pd.DataFrame(clusters, index = full_df_reduced.index, columns=['cluster'])
clusters_labels_all = clusters_df
count = pd.Series(clusters).value_counts()
score = metrics.silhouette_score(full_df_reduced, clusters)
print("Silhouette Coefficient: %0.3f" % score)

Silhouette Coefficient: 0.382


In [76]:
full_df = pd.concat([full_df, clusters_df], axis=1)

full_df = pd.concat([full_df, pd.get_dummies(full_df.cluster, prefix='cluster')], axis=1)
full_df.drop('cluster', axis=1, inplace=True)

X_train = full_df.loc[train_idx]
X_test = full_df.loc[test_idx]

In [102]:
##### Regressor
import xgboost as xgb
from sklearn.metrics import r2_score

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 1500,
    'eta': 0.01,
    'max_depth': 3,
    'subsample': 0.65,
    'objective': 'reg:linear',
    #'objective': 'reg:logistic',
    'eval_metric': 'rmse',
    'base_score': y_mean,
    'silent': 0,
    'n_jobs': -1,
    #'tree_method': 'hist',
    #'max_bin': 255,
    'booster': 'gbtree',
    'rate_drop': 0.10,
    #'alpha': 0.05
}
"""
    rate_drop [default=0.0]
        dropout rate (a fraction of previous trees to drop during the dropout).
        range: [0.0, 1.0]
    booster [default=gbtree]
        which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function.
    lambda [default=1, alias: reg_lambda]
        L2 regularization term on weights, increase this value will make model more conservative.
    alpha [default=0, alias: reg_alpha]
        L1 regularization term on weights, increase this value will make model more conservative.
"""

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

num_boost_rounds = 2500
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

# check r2-score (to get higher score - increase num_boost_round in previous cell)
print(r2_score(model.predict(dtrain), dtrain.get_label()))

0.565246623513


In [None]:
#dropout + L1 = 0.565941022665 valid 0.38 test
#only dropout = 

In [None]:
# 0.472392002894

xgb.train(data = xgb_data,
                        nthread = i,
                        nrounds = 200,
                        max_leaves = 255,
                        max_depth = 12,
                        eta = 0.05,
                        tree_method = "hist",
                        max_bin = 255,
                        booster = "gbtree",
                        objective = "binary:logistic",
                        verbose = 2)

In [99]:
write_to_submission_file(model.predict(dtest), 'xgboost_clustering_hist_dropout.csv', index=X_test.index.values)

In [None]:
# monitor training performance
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model no training data
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import sys
import xgboost as xgb

def score(params):
    print "Training with params : "
    print params
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print "\tScore {0}\n\n".format(score)
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.quniform('max_depth', 1, 13, 1),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'num_class' : 9,
             'eval_metric': 'mlogloss',
             'objective': 'multi:softprob',
             'nthread' : 6,
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print best


X, y = load_train()
print "Splitting data into train and valid ...\n\n"
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)

In [82]:
def write_to_submission_file(predicted, out_file, target='y', index=None, index_label="ID"):
    if index is None:
        index = np.arange(1, predicted.shape[0] + 1)
    predicted_df = pd.DataFrame(predicted,
                                index = index,
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)