In [0]:
import xgboost as xgb
import pandas as pd
import numpy as np
%pylab inline

In [0]:
# Remote locations of the training and test tables.
TRAIN_URL = 'https://www.dropbox.com/s/lo1duo1e4q12z8r/train.csv?dl=1'
TEST_URL = 'https://www.dropbox.com/s/qiy76n81kgxz3ky/test.csv?dl=1'

# NOTE(review): the training file is parsed as tab-separated while the test
# file is parsed as comma-separated — confirm this matches the actual files.
data = pd.read_csv(TRAIN_URL, sep='\t', header=0)
test = pd.read_csv(TEST_URL, sep=',', header=0)

# Show (rows, columns) of the training frame as the cell output.
data.shape

In [0]:
# tqdm supplies the progress bar wrapped around the label-encoding loop below.
!pip install tqdm

In [0]:
from sklearn import preprocessing, model_selection
from tqdm import tqdm

# Label-encode the object-typed columns among test.columns[2:4], applying the
# same encoder to both frames. Fitted encoders are kept in `encoders`, keyed by
# column name, so the mapping can be inspected or inverted later.
encoders = {}
for column in tqdm(test.columns[2:4]):
    # The builtin `object` is what the removed `np.object` alias pointed to,
    # so this comparison works on both old and current NumPy releases.
    if test.dtypes[column] == object:
        encoders[column] = preprocessing.LabelEncoder()
        # Fit on train and test values together so that every category present
        # in either frame receives a code. `pd.concat` replaces `Series.append`,
        # which no longer exists in pandas 2.x.
        fit_column = pd.concat([data[column], test[column]])
        encoders[column].fit(fit_column)
        test[column] = encoders[column].transform(test[column])
        data[column] = encoders[column].transform(data[column])

# Discard training rows that still contain missing values.
data = data.dropna()

In [0]:
# Build the model inputs:
#   * 'DSSR' is removed from the training frame,
#   * 'mg' is the target and is split off into y_train,
#   * the test features are the test frame without its columns at positions 0, 1 and 3.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally,
# and this matches the style already used for X_test.
data = data.drop('DSSR', axis=1)
X_train = data.drop('mg', axis=1)
y_train = data['mg']
X_test = test.drop(test.columns[[0, 1, 3]], axis=1)

In [0]:
from sklearn.preprocessing import scale

# Standardise train and test features together (zero mean, unit variance per
# column), then cut the stacked matrix back into its two parts.
n_train_rows = X_train.shape[0]
X = np.vstack((X_train, X_test)).astype('float')
X = scale(X, with_mean=True, with_std=True)
X_train, X_test = X[:n_train_rows], X[n_train_rows:]

In [0]:
def write_answer(predictions, path="result.csv"):
    """Write predictions to a two-column CSV file.

    The file starts with the header ``Id,mg`` and then holds one line per
    prediction in the form ``<position>,<value>``, where ``<position>`` is the
    zero-based position of the prediction in ``predictions``.

    Parameters
    ----------
    predictions : array-like
        Predicted values; anything ``np.asarray`` accepts (NumPy array, list,
        pandas Series, ...). Values are written positionally.
    path : str, optional
        Destination file. Defaults to ``"result.csv"``; an existing file at
        that path is overwritten.
    """
    # Converting once lets lists and Series be handled exactly like arrays and
    # guarantees positional (not label-based) access to the values.
    values = np.asarray(predictions)
    with open(path, "w") as fout:
        fout.write("Id,mg\n")
        fout.writelines(
            str(position) + "," + str(value) + "\n"
            for position, value in enumerate(values)
        )

In [0]:
%%time
estimator = xgb.XGBClassifier(learning_rate=0.2, max_depth=2, n_estimators=3000, 
                              colsample_bytree = X_train.shape[1]**(1/2)/X_train.shape[1],  n_jobs=-1, reg_alpha=0.005)
estimator.fit(X_train, y_train)
predictions = estimator.predict(X_test)

In [0]:
# Save the baseline model's test-set predictions to result.csv.
write_answer(predictions)

In [0]:
# Index of the largest entry in the fitted model's feature_importances_.
print(np.argmax(estimator.feature_importances_))

In [0]:
from xgboost import plot_importance
# Plot the feature importances of the fitted baseline model.
plot_importance(estimator)

In [0]:
from sklearn import cross_validation, grid_search, metrics

In [0]:
parameters_grid = {
    'learning_rate' : [0.05, 0.1, 0.2],
    'max_depth' : [2, 3, 4, 5, 10],
    'n_estimators' : [5000],
    'colsample_bylevel' : [X_train.shape[1]**(1/2)/X_train.shape[1], 1],
}

In [0]:
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter = 10, test_size = 0.2)

In [0]:
classifier = xgb.XGBClassifier()

In [0]:
grid_cv = grid_search.GridSearchCV(classifier, parameters_grid, scoring = 'f1', cv = cv, n_jobs=4, pre_dispatch=8, verbose=16)

In [0]:
%%time
# Run the grid search over parameters_grid using the splits defined by `cv`.
grid_cv.fit(X_train, y_train)

In [0]:
# Display the estimator selected by the grid search.
grid_cv.best_estimator_

In [0]:
# Report the best cross-validated score and the parameter combination behind it.
print(grid_cv.best_score_)
print(grid_cv.best_params_)

In [0]:
%%time
# Take the grid-search winner, fit it on the full training set and predict the test set.
# NOTE(review): with GridSearchCV's default refit behaviour, best_estimator_ is
# presumably already fitted on all of X_train, which would make this fit redundant — confirm.
estimator = grid_cv.best_estimator_
estimator.fit(X_train, y_train)
predictions = estimator.predict(X_test)

In [0]:
# Overwrite result.csv with the predictions of the grid-search model.
write_answer(predictions)

In [0]:
# Index of the largest entry in the grid-search model's feature_importances_.
print(np.argmax(estimator.feature_importances_))

In [0]:
from xgboost import plot_importance
# Plot the feature importances of the grid-search model.
plot_importance(estimator)

In [0]:
# Hand-tuned alternative to the grid-search model: slightly deeper trees and
# many more boosting rounds. Each tree samples a fraction of the columns equal
# to n_features ** (1 / 2) / n_features.
n_features = X_train.shape[1]
classifier = xgb.XGBClassifier(
    n_estimators=10000,
    max_depth=3,
    learning_rate=0.2,
    reg_alpha=0.005,
    colsample_bytree=n_features ** (1 / 2) / n_features,
    n_jobs=-1,
)

In [0]:
from sklearn.model_selection import cross_val_score
# Score the hand-tuned classifier with F1 on every split produced by `cv`;
# `scores` holds one value per split.
scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring='f1')
print(scores)

In [0]:
# Refit on the full training set and overwrite result.csv only when the mean
# cross-validated F1 clears the 0.35 threshold.
# `scores.mean` must be called: without the parentheses the bound method itself
# is compared with the float, which raises TypeError on Python 3.
if scores.mean() > 0.35:
    classifier.fit(X_train, y_train)
    # Predict with the classifier fitted on the line above; the earlier
    # `estimator` is a different model (the grid-search winner).
    predictions = classifier.predict(X_test)
    write_answer(predictions)