In [1]:
import numpy
import math
import csv

from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics, preprocessing   #Additional scikit-learn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search



In [2]:
def import_features(filepath, n_features=264):
    """Read a comma-separated file of numeric features into a list of rows.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read. Every row must hold at least
        ``n_features`` columns that ``float()`` can parse.
    n_features : int or None, optional
        Number of leading columns to keep from each row. Defaults to 264,
        the width this loader previously hard-coded. Pass ``None`` to keep
        every column present in each row.

    Returns
    -------
    list of list of float
        One inner list per CSV row, in file order.

    Raises
    ------
    IndexError
        If a row has fewer than ``n_features`` columns.
    ValueError
        If a kept cell cannot be converted to ``float``.
    """
    data = []
    with open(filepath, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            width = len(row) if n_features is None else n_features
            # Index explicitly (rather than slicing) so a short row still
            # fails loudly instead of yielding a silently truncated sample.
            data.append([float(row[i]) for i in range(width)])
    return data

In [3]:
def import_y(filepath):
    """Load the label file at ``filepath`` via ``numpy.loadtxt``.

    Returns whatever array ``numpy.loadtxt`` produces with its default
    settings; no further conversion is applied here.
    """
    labels = numpy.loadtxt(filepath)
    return labels

In [4]:
def export_data_accuracy(y):
    """Write predicted labels to ``pred_label_accuracy.csv``.

    The file is created in the current working directory (overwriting any
    existing one). It starts with a ``Sample_id,Sample_label`` header, and
    each following row pairs a 1-based sample id with ``str(y[i])``.
    """
    with open('pred_label_accuracy.csv', 'w') as out:
        sink = csv.writer(out, delimiter=',')
        sink.writerow(['Sample_id', 'Sample_label'])
        # Sample ids are 1-based, positions in ``y`` are 0-based.
        sink.writerows([str(pos + 1), str(y[pos])] for pos in range(len(y)))

In [5]:
def export_data_log_loss(y):
    """Write per-class values to ``pred_label_loss.csv``.

    The file is created in the current working directory (overwriting any
    existing one). The header is ``Sample_id`` followed by ``Class_1`` ..
    ``Class_10``. Each following row holds a 1-based sample id and then the
    string form of every entry of ``y[i]``, in order.
    """
    header = [
        'Sample_id',
        'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
        'Class_6', 'Class_7', 'Class_8', 'Class_9', 'Class_10',
    ]
    with open('pred_label_loss.csv', 'w') as out:
        sink = csv.writer(out, delimiter=',')
        sink.writerow(header)
        for pos in range(len(y)):
            sample = y[pos]
            # 1-based id first, then one column per entry of this sample.
            cells = [str(sample[k]) for k in range(len(sample))]
            sink.writerow([str(pos + 1)] + cells)

In [6]:
def scale_features(Xtrain, Xtest):
    """Standardize both feature sets with a scaler fitted on ``Xtrain`` only.

    A ``preprocessing.StandardScaler`` is fitted on ``Xtrain`` and that same
    fitted scaler is then applied to ``Xtrain`` and ``Xtest``.

    Returns the pair ``(scaled_train, scaled_test)``.
    """
    scaler = preprocessing.StandardScaler()
    scaler.fit(Xtrain)
    scaled_train = scaler.transform(Xtrain)
    scaled_test = scaler.transform(Xtest)
    return scaled_train, scaled_test

In [7]:
def modelfit(alg, x, y, test_set, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit ``alg`` on the training data and predict on ``test_set``.

    Parameters
    ----------
    alg : estimator
        Object exposing ``fit(x, y)``, ``predict(data)`` and
        ``predict_proba(data)``. It is fitted in place.
    x : array-like
        Training features passed to ``alg.fit``.
    y : array-like
        Training labels passed to ``alg.fit``.
    test_set : array-like
        Samples to predict after fitting.
    performCV, printFeatureImportance, cv_folds : optional
        Accepted for backward compatibility but currently ignored: this
        function runs no cross-validation and produces no feature-importance
        report.

    Returns
    -------
    tuple
        ``(alg.predict(test_set), alg.predict_proba(test_set))``.
    """
    # Fit the algorithm on the training data.
    alg.fit(x, y)

    # Predict on the held-out samples (not on the training set).
    test_predictions = alg.predict(test_set)
    test_predprob = alg.predict_proba(test_set)

    return test_predictions, test_predprob

In [8]:
# Load the training matrix and its labels. import_y returns the result of
# numpy.loadtxt, so each label is cast to a plain int before fitting.
features = import_features('train_data.csv')
y = list(map(int, import_y('train_labels.csv')))

# Samples to predict; y_pred is initialised empty and not used in this cell.
test_set = import_features('test_data.csv')
y_pred = []

# Gradient boosting with depth-5 trees and a fixed random_state.
gbm0 = GradientBoostingClassifier(random_state=10, max_depth=5)
pred, prob = modelfit(gbm0, features, y, test_set)

# Write the predicted labels and the per-class probabilities to CSV.
export_data_accuracy(pred)
export_data_log_loss(prob)