In [1]:
import numpy as np
from matplotlib import pyplot as plt
from numpy import genfromtxt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


from sklearn import svm, datasets
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

# Getting training data

In [3]:
training_data = np.array(load_data("training_data.txt", skiprows = 1))

In [4]:
y_train = training_data[:, 0]
X_train = training_data[:, 1:]

In [5]:
test_data = np.array(load_data("test_data.txt"))
X_test = test_data

In [6]:
def binary_clf_error(y_pred, y_correct):
    '''Predicts binary classification error'''
    binarytrain_number = (y_pred!=y_correct).sum()
    return binarytrain_number/len(y_pred)

In [7]:
#This was the optimal regularizing term found.
log_reg = LogisticRegression(C = 0.00001)
_ = log_reg.fit(X_train, y_train)

In [8]:
#Splitting Data
X_training, X_testing, y_training, y_testing = train_test_split(X_train, y_train, test_size=0.1, shuffle = True)

In [9]:
X_training.shape

(18000, 1000)

# Adaboost

In [17]:
dt_stump = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1)
ada_discrete = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=0.6, n_estimators=40, algorithm="SAMME.R")
ada_discrete.fit(X_training, y_training)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.6, n_estimators=40, random_state=None)

In [18]:
binary_clf_error(ada_discrete.predict(X_training), y_training)

0.17888888888888888

In [19]:
binary_clf_error(ada_discrete.predict(X_testing), y_testing)

0.1885

# Gradient Boosting

In [76]:
grd = GradientBoostingClassifier(n_estimators=100)
grd.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [77]:
binary_clf_error(grd.predict(X_training), y_training)

0.18566666666666667

In [78]:
binary_clf_error(grd.predict(X_testing), y_testing)

0.17949999999999999