In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
%pylab inline

In [None]:
# Read the training and test tables; the first row of each file is the header.
# NOTE(review): train.csv is parsed as tab-separated while test.csv is
# comma-separated -- confirm that this matches the actual files.
data = pd.read_csv('train.csv', sep='\t', header=0)
test = pd.read_csv('test.csv', sep=',', header=0)
# Show (rows, columns) of the training table as the cell output.
data.shape

In [None]:
from sklearn import preprocessing, model_selection
from tqdm import tqdm

# Label-encode the object-typed columns among test.columns[2:4].
# Each encoder is fitted on the union of the train and test values so that a
# category present in only one of the two frames still gets a code.
# The fitted encoders are kept in `encoders`, keyed by column name.
encoders = {}
for column in tqdm(test.columns[2:4]):
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the
    # equivalent dtype to compare against.
    if test[column].dtype == object:
        encoder = preprocessing.LabelEncoder()
        # `Series.append` was removed in pandas 2.0; `pd.concat` is the
        # supported way to stack the two columns.
        encoder.fit(pd.concat([data[column], test[column]], ignore_index=True))
        test[column] = encoder.transform(test[column])
        data[column] = encoder.transform(data[column])
        encoders[column] = encoder

# Drop every training row that still contains a missing value.
data = data.dropna()

In [None]:
# Preview the first rows of the training frame.
print(data.head())

In [None]:
# Preview the first rows of the test frame.
print(test.head())

In [None]:
# Print the (rows, columns) shape of the test frame, then of the training frame.
print(test.shape, "    ", data.shape)

In [None]:
# Build the model inputs.
# The axis/columns are passed by keyword: pandas 2.0 no longer accepts the
# axis as a second positional argument to `DataFrame.drop`.
data = data.drop(columns='DSSR')
# Training features are every remaining column except the target 'mg'.
X_train = data.drop(columns='mg')
y_train = data['mg']
# Test features: the test frame without its columns at positions 0, 1 and 3.
X_test = test.drop(test.columns[[0, 1, 3]], axis=1)

In [None]:
from sklearn.preprocessing import scale

# Standardise the features (zero mean, unit variance per column).
# NOTE(review): the statistics are computed over the train and test rows
# stacked together, not over the training rows alone -- confirm this is intended.
n_train = X_train.shape[0]
X = scale(np.vstack((X_train, X_test)).astype('float'), with_mean=True, with_std=True)
# Split the scaled matrix back into its train and test parts.
X_train, X_test = X[:n_train], X[n_train:]

In [None]:
# Print the shape of the test matrix, then of the training matrix.
print(X_test.shape, "    ", X_train.shape)

In [None]:
# Show the first three rows of the training matrix.
print(X_train[:3])

In [None]:
# Show the first three rows of the test matrix.
print(X_test[:3])

In [None]:
def write_answer(predictions, path="result.csv"):
    """Write predictions to a two-column CSV file.

    The file starts with the header line ``Id,mg``; each following line holds
    the zero-based position of a prediction and its ``str()`` representation.

    Parameters
    ----------
    predictions : iterable
        Predicted values in row order. Any iterable works (NumPy array, list,
        pandas Series), not only objects exposing ``.shape``.
    path : str, optional
        Destination file, overwritten if it exists. Defaults to
        ``"result.csv"``, the location used before this parameter existed.
    """
    with open(path, "w") as fout:
        fout.write("Id,mg\n")
        # enumerate supplies the row id, so no length/shape lookup is needed.
        fout.writelines(
            str(row_id) + "," + str(prediction) + "\n"
            for row_id, prediction in enumerate(predictions)
        )

In [None]:
%%time
estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=2, n_estimators=3000, 
                              colsample_bylevel = X_train.shape[1]**(1/2)/X_train.shape[1],  n_jobs=-1)
estimator.fit(X_train, y_train)
predictions = estimator.predict(X_test)

In [None]:
# Save the test-set predictions to disk via write_answer.
write_answer(predictions)

In [None]:
# Print the index of the feature with the largest value in feature_importances_.
print(np.argmax(estimator.feature_importances_))

In [None]:
from xgboost import plot_importance
# Plot the feature importances of `estimator` with xgboost's built-in helper.
plot_importance(estimator)

In [None]:
from sklearn import metrics, model_selection

# `sklearn.cross_validation` and `sklearn.grid_search` were removed in
# scikit-learn 0.20; their functionality lives in `sklearn.model_selection`.
# Fall back to it so that both names stay defined on current installs.
try:
    from sklearn import cross_validation, grid_search
except ImportError:
    cross_validation = grid_search = model_selection

In [None]:
# Hyper-parameter grid for the grid search.
# Every value must be a sequence of candidates: GridSearchCV rejects a bare
# scalar, so the fixed n_estimators is wrapped in a one-element list.
parameters_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4],
    'n_estimators': [500],
    # Either sqrt(n_features) / n_features of the columns per level, or all of them.
    'colsample_bylevel': [X_train.shape[1] ** (1 / 2) / X_train.shape[1], 1],
}

In [None]:
# 10 stratified random splits, each holding out 20% of the rows.
# `n_splits` belongs to the `model_selection` splitter, which takes no labels
# in its constructor (the labels are supplied later at fit/split time); the
# legacy `cross_validation` class does not accept `n_splits` at all.
# A fixed random_state makes the splits reproducible across runs.
cv = model_selection.StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

In [None]:
# Base XGBoost classifier with default settings, used as the estimator for the grid search.
classifier = xgb.XGBClassifier()

In [None]:
# Exhaustive search over `parameters_grid`, scored by F1 on the `cv` splits.
# GridSearchCV is taken from `model_selection`; the old `grid_search` module
# was removed in scikit-learn 0.20.
# NOTE(review): the plain 'f1' scorer assumes a binary target -- confirm that
# 'mg' is binary, otherwise use 'f1_macro' / 'f1_weighted'.
grid_cv = model_selection.GridSearchCV(classifier, parameters_grid, scoring='f1',
                                       cv=cv, n_jobs=4, pre_dispatch=8)

In [None]:
%%time
# Run the grid search on the training matrix; %%time reports how long the cell takes.
grid_cv.fit(X_train, y_train)

In [None]:
# Display the estimator selected by the grid search.
grid_cv.best_estimator_

In [None]:
# Print the best cross-validated score and the parameter combination that achieved it.
print(grid_cv.best_score_)
print(grid_cv.best_params_)

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; on such installs
# bind the same name to `sklearn.model_selection`, which provides cross_val_score.
try:
    from sklearn import cross_validation
except ImportError:
    from sklearn import model_selection as cross_validation

In [None]:
# Print the shapes of the training matrix and the target vector.
print(X_train.shape, y_train.shape)

In [None]:
%%time
xgb_scoring = []
n_trees = [1] + list(range(100,200,50))
for n_tree in tqdm(n_trees):
    estimator = xgb.XGBClassifier(n_estimators=n_tree)
    score = cross_validation.cross_val_score(estimator, X_train, y_train, 
                                             scoring = 'F1', cv = 5)    
    xgb_scoring.append(score)
xgb_scoring = np.asmatrix(xgb_scoring)

In [None]:
# Plot the mean cross-validated score against the number of trees.
# The per-row mean is flattened to 1-D so the call works whether
# `xgb_scoring` is an ndarray or an np.matrix (whose mean is a column matrix).
mean_scores = np.asarray(xgb_scoring.mean(axis=1)).ravel()
pylab.plot(n_trees, mean_scores, marker='.', label='XGBoost')
pylab.grid(True)
pylab.xlabel('n_trees')
pylab.ylabel('score')
# The cross-validation metric is F1, not accuracy, so the title says so.
pylab.title('F1 score')
pylab.legend(loc='lower right')