In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import zero_one_loss, make_scorer


In [2]:
# Input file and schema constants, grouped so a reader can find them in one place.
DATA = 'hw2_lssvm_all.dat'
# Name of the label column after loading (the last column of `data`).
# NOTE(review): LABEL is not referenced by any other visible cell.
LABEL = 11

# The file is parsed with a single space as delimiter and no header row, so pandas
# names the columns 0, 1, 2, ...  Column 0 is then discarded
# (presumably an empty column caused by leading whitespace on each line — TODO confirm).
data = pd.read_csv(DATA, sep=' ', header=None).drop(columns=0)

# Kept as the LAST expression of the cell: Jupyter only renders the value of the
# final expression, so a `.head()` placed mid-cell is silently discarded.
data.head()


In [3]:
# Training split: the first 400 rows of `data`.
# Every column except the last is a feature; the last column holds the label.
trainingData = data.iloc[:400]
trainingX = trainingData.iloc[:, :-1]
trainingY = trainingData.iloc[:, -1]

In [4]:
# Sanity check of the split: preview the first rows of the training feature columns.
trainingX.head()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,4.115,5.02,-7.879,-11.78,2.004,-0.353,-0.735,3.561,2.441,-9.822
1,-3.557,0.997,2.932,7.672,5.43,-0.137,1.635,-5.19,-0.394,-7.667
2,6.417,5.878,5.066,-7.209,-6.953,7.639,-2.937,-1.023,3.963,-11.069
3,-2.247,6.532,6.437,2.293,6.302,2.187,3.429,-3.453,9.172,-4.548
4,3.708,5.834,3.676,-4.403,-5.296,9.08,-3.11,-3.294,3.189,-8.51


In [5]:
# Sanity check of the split: preview the first training labels (the last column of `data`).
trainingY.head()


0    1
1    1
2    1
3    1
4    1
Name: 11, dtype: int64

In [6]:
# Test split: every row of `data` from position 400 onwards.
# Same column layout as the training split: features first, label in the last column.
testingData = data.iloc[400:]
testingX = testingData.iloc[:, :-1]
testingY = testingData.iloc[:, -1]



In [7]:
def CalculatedSampleError(ridge : "RidgeClassifier", xs : pd.DataFrame, ys : pd.DataFrame) -> float:
    """Return the 0/1 error of ``ridge`` on ``(xs, ys)``.

    The error is the fraction of samples whose predicted label differs from the
    true label.

    Parameters
    ----------
    ridge : fitted classifier
        Only its ``predict(xs)`` method is used, so any object exposing that
        method works (the annotation is quoted so the definition itself does not
        need the sklearn name in scope).
    xs : feature rows, passed unchanged to ``ridge.predict``.
    ys : true labels, one per row of ``xs``. A Series, a 1-D array or a
        single-column DataFrame are all accepted.

    Returns
    -------
    float
        Number of misclassified samples divided by the number of samples.

    Raises
    ------
    ValueError
        If ``ys`` is empty, or the number of predictions differs from the
        number of labels.
    """
    # Flatten both sides: a single-column DataFrame would otherwise broadcast
    # (n,) against (n, 1) into an n x n matrix and corrupt the count.
    predicted = np.asarray(ridge.predict(xs)).ravel()
    actual = np.asarray(ys).ravel()

    if actual.size == 0:
        raise ValueError("cannot compute an error rate on zero samples")
    if predicted.shape != actual.shape:
        raise ValueError(
            f"got {predicted.size} predictions for {actual.size} labels"
        )

    # Compare labels directly instead of testing the sign of prediction * label:
    # the sign trick is only valid for +1/-1 labels and silently counts any
    # sample whose product is 0 (e.g. a 0/1 encoding) as correct.
    incorrect = np.count_nonzero(predicted != actual)
    return incorrect / actual.size

def RidgedClassification(alpha=None):
    """Fit one RidgeClassifier on the training split and report its errors.

    Parameters
    ----------
    alpha : float, optional
        Regularization strength. When omitted, the notebook-level variable
        ``lam`` is used, so existing no-argument calls behave as before.
        Passing it explicitly removes the dependency on that global.

    Returns
    -------
    (Ein, Eout) : tuple
        Error on ``(trainingX, trainingY)`` and on ``(testingX, testingY)``,
        both computed by ``CalculatedSampleError``.
    """
    if alpha is None:
        # Backward-compatible fallback: read the loop variable of the calling cell.
        alpha = lam

    # the alpha used in sklearn corresponds to the lambda used in our course
    ridge = RidgeClassifier(alpha=alpha)
    ridge.fit(trainingX, trainingY)

    Ein = CalculatedSampleError(ridge, trainingX, trainingY)
    Eout = CalculatedSampleError(ridge, testingX, testingY)
    return Ein, Eout
    
def CalculatedSampleErrorAggregation(ridges : list, xs : pd.DataFrame, ys : pd.DataFrame) -> float:
    """Return the 0/1 error of the uniform-vote ensemble of ``ridges`` on ``(xs, ys)``.

    Each classifier's ``predict(xs)`` output is summed per sample and the
    ensemble label is the sign of that sum. Labels (and therefore predictions)
    are assumed to be +1 / -1, which is what makes summing votes meaningful.

    A tied vote (sum exactly 0, possible whenever the number of classifiers is
    even) is resolved to +1. This makes the ensemble a well-defined classifier:
    testing ``sum * y < 0`` instead would count a tie as correct for *both*
    labels and under-report the error.

    Parameters
    ----------
    ridges : list of fitted classifiers; only ``predict(xs)`` is used.
    xs : feature rows, passed unchanged to every classifier.
    ys : true +1 / -1 labels, one per row of ``xs`` (Series, 1-D array or
        single-column DataFrame).

    Returns
    -------
    float
        Number of misclassified samples divided by the number of samples.

    Raises
    ------
    ValueError
        If ``ridges`` is empty or ``ys`` is empty.
    """
    if len(ridges) == 0:
        raise ValueError("cannot aggregate an empty list of classifiers")

    actual = np.asarray(ys).ravel()
    if actual.size == 0:
        raise ValueError("cannot compute an error rate on zero samples")

    # One row of votes per classifier, summed column-wise into a per-sample total.
    voteTotals = np.sum(
        [np.asarray(ridge.predict(xs)).ravel() for ridge in ridges], axis=0
    )

    # Sign of the vote total, with ties (total == 0) broken towards +1.
    aggregated = np.where(voteTotals >= 0, 1, -1)

    incorrect = np.count_nonzero(aggregated != actual)
    return incorrect / actual.size

def BaggedRidgeClassification(alpha=None, iterations=250, seed=None):
    """Train a bagged ensemble of RidgeClassifiers and report its errors.

    Each ensemble member is fit on a bootstrap sample of the training split:
    as many rows as the training split has, drawn with replacement.

    Parameters
    ----------
    alpha : float, optional
        Regularization strength. When omitted, the notebook-level variable
        ``lam`` is used, so existing no-argument calls behave as before.
    iterations : int, default 250
        Number of bootstrap rounds, i.e. classifiers in the ensemble.
    seed : int, optional
        Seed for a private ``random.Random`` used for the resampling, making
        the run reproducible. When omitted, the global ``random`` module state
        is used, exactly as before.

    Returns
    -------
    (Ein, Eout) : tuple
        Ensemble error on ``(trainingX, trainingY)`` and on
        ``(testingX, testingY)``, both computed by
        ``CalculatedSampleErrorAggregation``.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if alpha is None:
        # Backward-compatible fallback: read the loop variable of the calling cell.
        alpha = lam

    rng = random if seed is None else random.Random(seed)

    # Derive the bootstrap size from the data instead of a literal 400, so the
    # resampling stays valid if the size of the training split ever changes.
    sampleCount = len(trainingX)

    ridges = []
    for _ in range(iterations):
        # Bootstrap: positions drawn uniformly with replacement.
        baggedIndex = rng.choices(range(sampleCount), k=sampleCount)
        baggedX = trainingX.iloc[baggedIndex]
        baggedY = trainingY.iloc[baggedIndex]

        # the alpha used in sklearn corresponds to the lambda used in our course
        ridge = RidgeClassifier(alpha=alpha)
        ridge.fit(baggedX, baggedY)
        ridges.append(ridge)

    Ein = CalculatedSampleErrorAggregation(ridges, trainingX, trainingY)
    Eout = CalculatedSampleErrorAggregation(ridges, testingX, testingY)
    return Ein, Eout


# Testing Ridge Regression

In [8]:
# Regularization strengths (lambda) to compare; also reused by the bagging cell below.
lambdas = [0.05, 0.5, 5, 50, 500]
for lam in lambdas:
    # RidgedClassification() is called without arguments, so it picks up the
    # current value of the notebook-level `lam`. The loop variable must therefore
    # keep this exact name and live at module scope.
    Ein, Eout = RidgedClassification()
    print(f"RidgeClassifier regression with lambda: {lam}\n\t\t Ein = {Ein}\tEout = {Eout}")


RidgeClassifier regression with lambda: 0.05
		 Ein = 0.3175	Eout = 0.36
RidgeClassifier regression with lambda: 0.5
		 Ein = 0.3175	Eout = 0.36
RidgeClassifier regression with lambda: 5
		 Ein = 0.3175	Eout = 0.36
RidgeClassifier regression with lambda: 50
		 Ein = 0.32	Eout = 0.37
RidgeClassifier regression with lambda: 500
		 Ein = 0.3225	Eout = 0.37


# Testing Ridge Regression with Bagged Data (250 iters.)

In [9]:
# Same sweep as the single-classifier experiment, but with the bagged ensemble.
# Relies on `lambdas` having been defined by an earlier cell.
# NOTE(review): the bootstrap sampling uses `random` and no seed is set in the
# visible cells, so the printed numbers are expected to vary between runs.
for lam in lambdas:
    # Called without arguments: the function reads the notebook-level `lam`,
    # so the loop variable must keep this exact name.
    Ein, Eout = BaggedRidgeClassification()
    print(f"BaggedRidgeClassifier regression with lambda: {lam}\n\t\t Ein = {Ein}\tEout = {Eout}")




BaggedRidgeClassifier regression with lambda: 0.05
		 Ein = 0.315	Eout = 0.37
BaggedRidgeClassifier regression with lambda: 0.5
		 Ein = 0.32	Eout = 0.37
BaggedRidgeClassifier regression with lambda: 5
		 Ein = 0.3175	Eout = 0.36
BaggedRidgeClassifier regression with lambda: 50
		 Ein = 0.32	Eout = 0.37
BaggedRidgeClassifier regression with lambda: 500
		 Ein = 0.3225	Eout = 0.37
