In [1]:
import numpy as np
from matplotlib import pyplot as plt
from numpy import genfromtxt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


from sklearn import svm, datasets
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

# Getting training data

In [22]:
training_data = np.array(load_data("training_data.txt", skiprows = 1))

In [89]:
y_train = training_data[:, 0]
X_train = training_data[:, 1:]


In [90]:
test_data = np.array(load_data("test_data.txt"))
X_test = test_data


# Setting aside Valdiation Set

In [130]:
X_training, X_testing, y_training, y_testing = train_test_split(X_train, y_train, test_size=0.1, shuffle = True)

# Doing Logistic Regression

In [131]:
def binary_clf_error(y_pred, y_correct):
    '''Predicts binary classification error'''
    binarytrain_number = (y_pred!=y_correct).sum()
    return binarytrain_number/len(y_pred)

#Doing Logistic regression
log_reg = LogisticRegression(C = 0.15)
_ = log_reg.fit(X_training, y_training)
y_pred_log = log_reg.predict(X_training)
y_test_pred_log = log_reg.predict(X_testing)
y_actual_test_log = log_reg.predict(X_test)

In [132]:
binary_clf_error(y_pred_log, y_training)

0.12566666666666668

In [133]:
binary_clf_error(y_test_pred_log, y_testing)

0.14449999999999999

# Doing Ridge Regression

In [134]:
y_regression_train = y_training
y_regression_train[y_regression_train==0] = -1

y_regression_test = y_testing
y_regression_test[y_regression_test==0] = -1

#This is approximately the best one found
#Test error is like 0.1519
streng = 130

ridge = Ridge(alpha = 130)
ridge.fit(X_training, y_regression_train)   

y_pred_ridge = np.sign(ridge.predict(X_training))
y_test_pred_ridge = np.sign(ridge.predict(X_testing))
y_actual_test_ridge = np.sign(ridge.predict(X_test))

#Finding error
binary_clf_error(y_pred_ridge, y_regression_train)

0.13038888888888889

In [135]:
binary_clf_error(y_test_pred_ridge, y_regression_test)

0.15049999999999999

# Doing Lasso Regression

In [127]:
#This is approximately the best one found
#It got like 85% test accuracy.
streng = 0.00055

lasso = Lasso(alpha = streng)
lasso.fit(X_training, y_regression_train) 

y_pred_lasso = np.sign(lasso.predict(X_training))
y_test_pred_lasso = np.sign(lasso.predict(X_testing))
y_actual_test_lasso = np.sign(lasso.predict(X_test))

#Finding error
binary_clf_error(y_pred_lasso, y_regression_train)

0.13300000000000001

In [129]:
binary_clf_error(y_test_pred_lasso, y_regression_test)

0.13650000000000001

In [141]:
binary_clf_error(np.sign(np.mean([y_pred_lasso, y_pred_ridge, y_pred_log], axis =0)), y_regression_train)

0.12811111111111112

In [159]:
binary_clf_error(np.sign(np.mean([y_test_pred_lasso, y_test_pred_ridge, y_test_pred_log], axis =0)), y_regression_test)

0.15049999999999999

50

In [147]:
y_pred_lasso

array([-1.,  1., -1., ..., -1.,  1.,  1.])

In [71]:
c_values = np.linspace(0.0, 0.2, num=5)
training_error_list = []
validation_error_list = []

for c_value in c_values:
    #Initializing Kfold cv object
    kf = KFold(n_splits=15, shuffle = True)
    log_reg = LogisticRegression(C = 0.15)
    _ = log_reg.fit(X_training, y_training)
    
    
    training_error = []
    validation_error = []
    
    for train_index, test_index in kf.split(X_train):
        #Running Logistic regression on training partition
        
        training_error = (1-c_value)*y_test_pred_log+c_value*y_test_pred_ridge
        
        #Finding training error 
        training_error.append(binary_clf_error(log_reg.predict(X_training), y_training))

        #Finding validation error
        validation_error.append(binary_clf_error(log_reg.predict(X_testing), y_testing))
    training_error_list.append(np.mean(training_error))
    validation_error_list.append(np.mean(validation_error))

In [72]:
c_values, validation_error_list

(array([ 0.45 ,  0.475,  0.5  ,  0.525,  0.55 ]),
 [0.15075016880157072,
  0.15070071865792534,
  0.15110006887029107,
  0.15104844426998804,
  0.15015073108606988])