In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
import tensorflow as tf
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pickle
from statsmodels.regression.quantile_regression import QuantReg
import random

In [11]:
def split_data(data, test_ratio):
    """Randomly split ``data`` into flattened training and testing sets.

    Every entry ``data[i]`` is read through two keys: ``"quantile"``, a 2-D
    value that exposes ``.shape`` and ``[row][col]`` indexing, and ``"class"``,
    its label.  Each quantile grid is flattened column by column, so element
    ``[j][k]`` lands at position ``k * n_rows + j``.  The grid shape is taken
    from ``data[0]`` and applied to every entry.

    Parameters
    ----------
    data : indexable by ``0 .. len(data) - 1``
        The samples to split.
    test_ratio : float
        Fraction of the samples held out; ``int(len(data) * test_ratio)``
        entries are drawn with ``random.sample``.

    Returns
    -------
    tuple
        ``(training_data, training_labels, testing_data, testing_labels)``.
        The data items are lists of flat feature lists.  Training rows keep
        their original order; testing rows follow the sampled order.  Four
        empty lists are returned when ``data`` is empty.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the requested test size is
        negative or larger than ``len(data)``.
    """
    n_samples = len(data)
    if n_samples == 0:
        # Nothing to split, and there is no data[0] to read the shape from.
        return [], [], [], []

    # The grid shape is loop-invariant, so look it up once.
    shape = data[0]["quantile"].shape
    n_rows, n_cols = shape[0], shape[1]

    def flatten(quantile):
        # Column-major order: index k * n_rows + j holds quantile[j][k].
        return [quantile[j][k] for k in range(n_cols) for j in range(n_rows)]

    test_size = int(n_samples * test_ratio)
    test = random.sample(range(0, n_samples), test_size)
    # A set gives O(1) membership checks; the list keeps the sampled order.
    held_out = set(test)

    training_keys = [key for key in range(n_samples) if key not in held_out]

    training_data = [flatten(data[key]["quantile"]) for key in training_keys]
    training_labels = [data[key]["class"] for key in training_keys]
    testing_data = [flatten(data[key]["quantile"]) for key in test]
    testing_labels = [data[key]["class"] for key in test]

    return training_data, training_labels, testing_data, testing_labels

In [24]:
from sklearn import svm

# Linear-kernel support vector classifier; C is scikit-learn's regularisation
# parameter (larger C means weaker regularisation).
clf = svm.SVC(C=1000, kernel='linear')

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load pickles from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
with open('../data/Quantiles.pickle', 'rb') as quantiles_file:
    data = pickle.load(quantiles_file)
def iteration(model, test_ratio=0.1, dataset=None):
    """Fit ``model`` on one random split and score it on the held-out part.

    The split comes from ``split_data``.  The function prints how many
    'Blazar' and 'CV' samples ended up in the test set, fits the model on the
    training part and predicts the testing part.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        The classifier to train and evaluate; it is fitted in place.
    test_ratio : float, optional
        Fraction of samples held out for testing (default 0.1).
    dataset : optional
        Samples to split; when omitted, the module-level ``data`` is used.

    Returns
    -------
    tuple of float
        ``(BZ_acc, CV_acc, overall_acc)``: the fraction of true 'Blazar' test
        samples predicted correctly, the same for 'CV', and the fraction of
        all test samples predicted correctly.  A value is ``nan`` when its
        denominator is zero (e.g. the random split held out no sample of that
        class) instead of raising ``ZeroDivisionError``.
    """
    def fraction(hits, total):
        # An accuracy over zero samples is undefined; report nan, don't crash.
        return hits / total if total else float('nan')

    samples = data if dataset is None else dataset
    training_data, training_labels, testing_data, testing_labels = split_data(samples, test_ratio)

    # Convert once so the comparisons below are explicit element-wise
    # array operations rather than list-vs-array reflected comparisons.
    testing_labels = np.asarray(testing_labels)
    n_blazar = np.count_nonzero(testing_labels == 'Blazar')
    n_cv = np.count_nonzero(testing_labels == 'CV')
    print("Testing: %d Blazars and %d CV" % (n_blazar, n_cv))

    model.fit(training_data, training_labels)
    pred = np.asarray(model.predict(testing_data))
    correct = pred == testing_labels

    overall_acc = fraction(np.count_nonzero(correct), len(testing_labels))
    # A correct prediction of class X implies the true label is X, so these
    # are per-class accuracies over the true members of each class.
    BZ_acc = fraction(np.count_nonzero((pred == 'Blazar') & correct), n_blazar)
    CV_acc = fraction(np.count_nonzero((pred == 'CV') & correct), n_cv)
    return (BZ_acc, CV_acc, overall_acc)




In [25]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble built on the SVM above.  n_estimators is passed by keyword
# because recent scikit-learn releases reject it as a positional argument.
# NOTE(review): `ada` is constructed but the loop below evaluates `clf`;
# confirm which model was meant to be scored.
ada = AdaBoostClassifier(clf, n_estimators=100, algorithm='SAMME')

# Repeat the random split / fit / score cycle `num` times.
# NOTE(review): no random seed is set, so the printed numbers differ per run.
num = 10
avg = 0
for i in range(num):
    # Unpack into `overall` so the running mean in `avg` is not overwritten
    # by each pass's overall accuracy.
    (bz, cv, overall) = iteration(clf)
    avg += overall / num
    print(bz, cv)

Testing: 24 Blazars and 55 CV
0.7916666666666666 0.8363636363636363
Testing: 25 Blazars and 54 CV
0.72 0.8703703703703703
Testing: 24 Blazars and 55 CV
0.7083333333333334 0.7636363636363637
Testing: 18 Blazars and 61 CV
0.7777777777777778 0.8360655737704918
Testing: 23 Blazars and 56 CV
0.5652173913043478 0.8571428571428571
Testing: 26 Blazars and 53 CV
0.5769230769230769 0.8113207547169812
Testing: 19 Blazars and 60 CV
0.7894736842105263 0.7833333333333333
Testing: 20 Blazars and 59 CV
0.8 0.7966101694915254
Testing: 21 Blazars and 58 CV
0.6666666666666666 0.7931034482758621
Testing: 21 Blazars and 58 CV
0.9047619047619048 0.7931034482758621
