# In[1]:
import sklearn.svm
import numpy as np
import csv
import math
import sys
np.set_printoptions(threshold=sys.maxsize)
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import svm
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


def read_data(path='train.csv'):
    """Load the training set from a CSV file.

    The first row is treated as a header and skipped. Every remaining row is
    converted to floats; the last column becomes the label vector and all
    preceding columns become the feature matrix.

    Args:
        path: CSV file to read. Defaults to ``'train.csv'`` in the current
            working directory, which is what a bare ``read_data()`` call reads.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds every column but the last and
        ``y`` holds the last column, both as float arrays.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if a cell cannot be converted to float.
    """
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # discard the header row
        rows = list(reader)
    # Builtin ``float`` instead of the ``np.float`` alias, which recent NumPy
    # releases no longer provide.
    data = np.array(rows, dtype=float)
    X = data[:, :-1]
    y = data[:, -1]

    return X, y

def read_test_data(path='test.csv'):
    """Load the held-out test set from a CSV file.

    The first row is treated as a header and skipped. Every remaining row is
    converted to floats; the last column becomes the label vector and all
    preceding columns become the feature matrix.

    Args:
        path: CSV file to read. Defaults to ``'test.csv'`` in the current
            working directory, which is what a bare ``read_test_data()``
            call reads.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds every column but the last and
        ``y`` holds the last column, both as float arrays.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if a cell cannot be converted to float.
    """
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # discard the header row
        rows = list(reader)
    # Builtin ``float`` instead of the ``np.float`` alias, which recent NumPy
    # releases no longer provide.
    data = np.array(rows, dtype=float)
    X = data[:, :-1]
    y = data[:, -1]

    return X, y

# Load both splits once; populate() further down reads X and y from module scope.
X, y = read_data()
X_test, y_test = read_test_data()
# Zero-filled square matrix with one more row and column than there are training labels.
array = np.zeros((y.size + 1,) * 2)

def kernal(x_i, x_j, quant):
    """Evaluate ``exp(-quant * ||x_i - x_j||**2)`` for two vectors.

    Args:
        x_i: first vector (anything supporting ``-`` with ``x_j``).
        x_j: second vector.
        quant: multiplier applied to the squared Euclidean distance.

    Returns:
        The kernel value as a Python float.
    """
    distance = np.linalg.norm(x_i - x_j)
    return math.exp(-quant * distance**2)


def populate(array, gamma, C):
    """Fill ``array`` in place with the bordered kernel matrix.

    Reads the module-level ``X`` and ``y`` (they are not parameters). With
    ``n = y.size``:

    * rows ``1..n`` get ``1`` in column 0 and ``kernal(X[j], X[i], gamma)``
      in column ``j + 1`` for row ``i + 1``;
    * row 0 gets ``1`` in every column from 1 up to ``array.shape[0] - 1``;
    * ``1/C`` is added to each diagonal entry from index 1 up to
      ``array.shape[0] - 1``.

    ``array[0][0]`` is never written.

    Args:
        array: 2-D array modified in place.
        gamma: width argument forwarded to ``kernal``.
        C: its reciprocal is added to the diagonal.
    """
    n_train = y.size
    for row in range(1, n_train + 1):
        array[row, 0] = 1
        for col in range(1, n_train + 1):
            array[row, col] = kernal(X[col - 1], X[row - 1], gamma)

    for k in range(1, array.shape[0]):
        array[0, k] = 1
        array[k, k] += (1 / C)


def predict(train_X, data, solution, C):
    """Classify one sample with a solved coefficient vector.

    The score is ``solution[0]`` plus the sum over ``i >= 1`` of
    ``solution[i] * kernal(train_X[i - 1], data, C)``.

    Args:
        train_X: training samples, indexed in step with ``solution[1:]``.
        data: the sample to classify.
        solution: coefficient vector; entry 0 is added as an offset.
        C: forwarded to ``kernal`` as its ``quant`` argument (``results`` in
            this file passes its ``gamma`` here).

    Returns:
        ``1`` when the score is non-negative, otherwise ``-1``.
    """
    score = 0
    for idx in range(1, solution.size):
        score += solution[idx] * kernal(train_X[idx - 1], data, C)
    score += solution[0]

    return 1 if score >= 0 else -1

def results(train_X, train_Y, test_X, test_Y, solution, gamma):
    """Predict every training and test sample and score both sets.

    Args:
        train_X: training samples (also used as the kernel reference set).
        train_Y: training labels; only its ``size`` and values for scoring
            are used.
        test_X: test samples.
        test_Y: test labels.
        solution: coefficient vector passed through to ``predict``.
        gamma: kernel width passed through to ``predict``.

    Returns:
        ``(train_error, test_error)`` as returned by this file's ``eval``.
    """
    train_pred = np.array(
        [predict(train_X, train_X[k], solution, gamma) for k in range(train_Y.size)],
        dtype=float,
    )
    test_pred = np.array(
        [predict(train_X, test_X[k], solution, gamma) for k in range(test_Y.size)],
        dtype=float,
    )

    LSSVM_train_error = eval(train_pred, train_Y)
    LSSVM_test_error = eval(test_pred, test_Y)
    return LSSVM_train_error, LSSVM_test_error


def eval(predicted, actual):
    """Print and return the misclassification rate (1 - accuracy).

    Note that this definition shadows the builtin ``eval`` for the rest of
    the module.

    Args:
        predicted: predicted labels.
        actual: ground-truth labels.

    Returns:
        ``1 - metrics.accuracy_score(actual, predicted)``.
    """
    error_rate = 1 - metrics.accuracy_score(actual, predicted)
    print(error_rate)
    return error_rate



# populate(array, 1, 1)
# new_y = np.append(0,y)
# solution = np.linalg.solve(array, new_y)
# results(X,y, X_test,y_test,solution, 1)

def LSSVM(X,y, X_test, y_test, gamma, C):
    """Build the bordered kernel system, solve it, and report the errors.

    A zero ``(y.size + 1)``-square matrix is filled by ``populate``, the
    right-hand side is ``y`` with a leading 0, and the linear system is
    solved with ``np.linalg.solve``.

    NOTE(review): ``populate`` reads the module-level ``X`` and ``y``, not
    the arguments given here; the two agree only when this function is called
    with those same globals.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        gamma: kernel width.
        C: value whose reciprocal ``populate`` adds to the diagonal.

    Returns:
        ``(train_error, test_error)`` from ``results``.
    """
    dim = y.size + 1
    system = np.zeros((dim, dim))
    populate(system, gamma, C)

    rhs = np.append(0, y)
    solution = np.linalg.solve(system, rhs)

    return results(X, y, X_test, y_test, solution, gamma)

# Run LS-SVM with C = 1 for four kernel widths; each call prints its train and test error.
for gamma_value in (10, 100, 1000, 10000):
    LSSVM(X, y, X_test, y_test, gamma_value, 1)


def rbf_svm(X,y, X_test, y_test,gamma, c):
    """Fit an RBF-kernel ``SVC`` and return its train and test error rates.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        gamma: passed to ``SVC`` as ``gamma``.
        c: passed to ``SVC`` as ``C``.

    Returns:
        ``(train_error, test_error)``, each computed as ``1 - accuracy``.
    """
    model = svm.SVC(C=c, kernel='rbf', gamma=gamma)
    model.fit(X, y)

    train_error = 1 - metrics.accuracy_score(y, model.predict(X))
    test_error = 1 - metrics.accuracy_score(y_test, model.predict(X_test))
    return train_error, test_error


def linear_svm(X,y, X_test, y_test,gamma, c):
    """Fit a linear-kernel ``SVC`` and return its train and test error rates.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        gamma: passed to ``SVC`` as ``gamma``. NOTE(review): scikit-learn
            documents ``gamma`` for the rbf/poly/sigmoid kernels, so it
            presumably has no effect here — confirm.
        c: passed to ``SVC`` as ``C``.

    Returns:
        ``(train_error, test_error)``, each computed as ``1 - accuracy``.
    """
    model = svm.SVC(C=c, kernel='linear', gamma=gamma)
    model.fit(X, y)

    train_error = 1 - metrics.accuracy_score(y, model.predict(X))
    test_error = 1 - metrics.accuracy_score(y_test, model.predict(X_test))
    return train_error, test_error


def poly_svm(X,y, X_test, y_test,gamma, c):
    """Fit a polynomial-kernel ``SVC`` and return its train and test error rates.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        gamma: passed to ``SVC`` as ``gamma``.
        c: passed to ``SVC`` as ``C``.

    Returns:
        ``(train_error, test_error)``, each computed as ``1 - accuracy``.
    """
    model = svm.SVC(C=c, kernel='poly', gamma=gamma)
    model.fit(X, y)

    train_error = 1 - metrics.accuracy_score(y, model.predict(X))
    test_error = 1 - metrics.accuracy_score(y_test, model.predict(X_test))
    return train_error, test_error







def make_gamma_plot(X,y, X_test, y_test, C):
    """Sweep gamma at a fixed C and box-plot the errors of all four models.

    Gamma starts at ``10**-4`` and is multiplied by 10 after each of seven
    steps. At every step the LS-SVM, linear, polynomial and RBF models are
    fitted and their train/test errors stored. The LS-SVM training errors
    and the full table are printed, and a box plot with one box per column
    is drawn on the current matplotlib figure.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        C: regularisation value handed unchanged to every model.

    Returns:
        None.
    """
    n_steps = 7
    # Column order here is the left-to-right order of the boxes in the plot.
    columns = [
        'LS-SVM-Train', 'LS-SVM-Test',
        'Rbf-SVM-Train', 'Rbf-SVM-Test',
        'Linear-SVM-Train', 'Linear-SVM-Test',
        'Polynomial-SVM-Train', 'Polynomial-SVM-Test',
    ]
    errors = {name: np.zeros(n_steps) for name in columns}

    gamma = 10**-4
    for i in range(n_steps):
        errors['LS-SVM-Train'][i], errors['LS-SVM-Test'][i] = LSSVM(X, y, X_test, y_test, gamma, C)
        errors['Linear-SVM-Train'][i], errors['Linear-SVM-Test'][i] = linear_svm(X, y, X_test, y_test, gamma, C)
        errors['Polynomial-SVM-Train'][i], errors['Polynomial-SVM-Test'][i] = poly_svm(X, y, X_test, y_test, gamma, C)
        errors['Rbf-SVM-Train'][i], errors['Rbf-SVM-Test'][i] = rbf_svm(X, y, X_test, y_test, gamma, C)
        gamma = gamma * 10

    print(errors['LS-SVM-Train'])
    df = pd.DataFrame(errors)
    df.boxplot()
    plt.xticks(rotation=90)

    print(df)

    
def make_c_plot(X,y, X_test, y_test, gamma):
    """Sweep C at a fixed gamma and box-plot the errors of all four models.

    C starts at ``10**-4`` and is multiplied by 10 after each of seven steps.
    At every step the LS-SVM, linear, polynomial and RBF models are fitted
    and their train/test errors stored. The LS-SVM training errors are
    printed, and a box plot with one box per column is drawn on the current
    matplotlib figure.

    Args:
        X: training samples.
        y: training labels.
        X_test: test samples.
        y_test: test labels.
        gamma: kernel width handed unchanged to every model.

    Returns:
        None.
    """
    n_steps = 7
    # Column order here is the left-to-right order of the boxes in the plot.
    columns = [
        'LS-SVM-Train', 'LS-SVM-Test',
        'Rbf-SVM-Train', 'Rbf-SVM-Test',
        'Linear-SVM-Train', 'Linear-SVM-Test',
        'Polynomial-SVM-Train', 'Polynomial-SVM-Test',
    ]
    errors = {name: np.zeros(n_steps) for name in columns}

    C = 10**-4
    for i in range(n_steps):
        errors['LS-SVM-Train'][i], errors['LS-SVM-Test'][i] = LSSVM(X, y, X_test, y_test, gamma, C)
        errors['Linear-SVM-Train'][i], errors['Linear-SVM-Test'][i] = linear_svm(X, y, X_test, y_test, gamma, C)
        errors['Polynomial-SVM-Train'][i], errors['Polynomial-SVM-Test'][i] = poly_svm(X, y, X_test, y_test, gamma, C)
        errors['Rbf-SVM-Train'][i], errors['Rbf-SVM-Test'][i] = rbf_svm(X, y, X_test, y_test, gamma, C)
        C = C * 10

    print(errors['LS-SVM-Train'])
    df = pd.DataFrame(errors)
    df.boxplot()
    plt.xticks(rotation=90)
    
        




# Gamma sweep with C held at 1, then C sweep with gamma held at 1.
make_gamma_plot(X, y, X_test, y_test, C=1)
make_c_plot(X, y, X_test, y_test, gamma=1)


# Recorded cell output: FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

# In[None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import copy
# Ask IPython to render matplotlib figures inline.
# NOTE(review): get_ipython is not defined or imported in this file, so it is
# presumably injected by the IPython/Jupyter runtime — confirm before running
# this as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')

import sklearn
from sklearn.datasets import fetch_california_housing

# Default font size for every figure drawn after this point.
plt.rcParams['font.size'] = 14


# In[14]:


# Download data

# Fetch the dataset once; everything below works from these module-level names.
tmp = sklearn.datasets.fetch_california_housing()

X = tmp['data']
y = tmp['target']
feature_names = tmp['feature_names']
num_samples = X.shape[0]

# Map each feature name to its column of X.
data = {}
for n, feature in enumerate(feature_names):
    data[feature] = X[:, n]
    


# In[15]:


# Create stumps

# Bin edges by proportion: for every feature, take the sorted values at
# 0%, 10%, ..., 100% of the samples (11 edges, 10% of the data between
# consecutive edges).
bins = {}
# np.intp rather than np.int16: 16-bit indices cannot represent positions
# beyond 32767, so larger datasets would index the wrong rows.
bin_idx = (np.arange(0,1.1,0.1)*num_samples).astype(np.intp)
bin_idx[-1] = bin_idx[-1]-1  # the 100% position is one past the end; pull it back
for feature in (feature_names):
    bins[feature] = np.sort(data[feature])[bin_idx]

# decision stumps as weak learners:
# column n is 1 where the feature value is strictly greater than the n-th
# bin edge and 0 otherwise (a threshold test, one per edge except the last)
stumps = {}
for feature in feature_names:
    stumps[feature] = np.zeros([num_samples,len(bins[feature])-1])
    for n in range(len(bins[feature])-1):
        stumps[feature][:,n] = data[feature]>bins[feature][n]

# stack the weak learners into a matrix, with a leading all-ones column
H = np.hstack([stumps[feature] for feature in feature_names])
H = np.hstack([np.ones([num_samples,1]),H])
# prepare the vector for storing weights (one per column of H)
alphas = np.zeros(H.shape[1])
bins[feature_names[1]]


# ### AdaBoost

# In[16]:


num_iterations = 30
MSE = np.zeros(num_iterations) # track mean square error


# In[17]:


# Greedy coordinate updates: each pass picks the column of H most correlated
# with the residual and moves its weight by the least-squares step.
for iteration in range(num_iterations):
    f = np.dot(H,alphas)  # the current f(x)
    # r = residual; the MSE stored at index k is the error BEFORE update k
    r = y-f; MSE[iteration] = np.mean(r**2)
    # scale by the actual sample count instead of a hard-coded 20640
    s = 2/num_samples * np.absolute(np.dot(r,H))
    idx = np.argmax(s)# optimal direction to move in
    # amount to move in optimal direction: <h, r> / <h, h>
    alphas[idx] = alphas[idx] + (np.dot(H[:,idx],r) / np.dot(H[:,idx], H[:,idx]))


# ### Plot Results

# In[18]:


# Slice the flat weight vector back into one chunk per feature.
# Index 0 belongs to the all-ones column, so feature weights start at 1.
alphasf = {}
start = 1
for feature in feature_names:
    width = stumps[feature].shape[1]
    alphasf[feature] = alphas[start:start + width]
    start += width
alphasf['mean'] = alphas[0]

alphasf[feature_names[0]]
stumps[feature_names[0]].shape


# In[19]:


# One scatter per feature: feature value against the mean-centred target.
for feature in feature_names:
    plt.close("all")
    centered_target = y - np.mean(y)
    plt.plot(data[feature], centered_target, '.', alpha=0.5, color=[0.9, 0.9, 0.9])

    plt.title(feature)
    # x-range runs from the first bin edge to the second-to-last one
    feature_bins = bins[feature]
    plt.xlim([feature_bins[0], feature_bins[-2]])
    plt.xlabel(feature)
    plt.ylabel('contribution to house price')
    plt.show()


# In[20]:


# Same scatter as before, overlaid with each feature's mean-centred fitted contribution.
for feature in feature_names:
    plt.close("all")
    plt.plot(data[feature], y - np.mean(y), '.', alpha=0.5, color=[0.9, 0.9, 0.9])
    # weighted sum of this feature's stumps, one value per sample
    contribution = np.dot(stumps[feature], alphasf[feature])
    plt.plot(data[feature], contribution - np.mean(contribution), ".", alpha = .5)
    #plt.step(bins[feature][:10], alphasf[feature])

    plt.title(feature)
    plt.xlim([bins[feature][0], bins[feature][-2]])
    plt.xlabel(feature)
    plt.ylabel('contribution to house price')
    plt.show()


# ### Variable Importance

# In[21]:


def create_bins(X, y, data):
    """Fit the stump model on ``data`` and report its final tracked MSE.

    For every feature in ``data`` the sorted values at 0%, 10%, ..., 100% of
    the samples become bin edges, and each edge except the last yields one
    threshold column (1 where the value is strictly greater than the edge).
    The columns, preceded by an all-ones column, are fitted to ``y`` with 30
    greedy updates: each picks the column most correlated with the residual
    and moves its weight by the least-squares step.

    The sample count comes from ``y`` and the feature list from the keys of
    ``data`` (in iteration order), so the function does not depend on
    module-level state.

    Args:
        X: unused; kept so existing calls keep working.
        y: 1-D array of targets, one per sample.
        data: mapping from feature name to a 1-D array with ``len(y)`` values.

    Returns:
        The last tracked MSE, which is the error measured just before the
        final update. The same value is printed.
    """
    n_samples = len(y)
    features = list(data)

    # np.intp rather than np.int16: 16-bit indices cannot represent positions
    # beyond 32767, so larger datasets would index the wrong rows.
    bin_idx = (np.arange(0, 1.1, 0.1) * n_samples).astype(np.intp)
    bin_idx[-1] -= 1  # the 100% position is one past the end; pull it back
    bins = {name: np.sort(data[name])[bin_idx] for name in features}

    # One column per bin edge except the last: value > edge.
    stumps = {
        name: (np.asarray(data[name])[:, None] > bins[name][:-1]).astype(float)
        for name in features
    }

    # Leading all-ones column followed by every feature's stump columns.
    H = np.hstack([np.ones([n_samples, 1])] + [stumps[name] for name in features])
    alphas = np.zeros(H.shape[1])

    num_iterations = 30
    MSE = np.zeros(num_iterations)
    for iteration in range(num_iterations):
        r = y - np.dot(H, alphas)  # residual of the current fit
        MSE[iteration] = np.mean(r**2)
        s = 2 / n_samples * np.absolute(np.dot(r, H))
        idx = np.argmax(s)  # column most correlated with the residual
        column = H[:, idx]
        alphas[idx] += np.dot(column, r) / np.dot(column, column)

    final_mse = MSE[-1]
    print(final_mse)
    return final_mse


# In[22]:


# Shuffle one feature at a time and refit; create_bins prints the resulting MSE.
for feature in feature_names:
    new_data = copy.deepcopy(data)
    # the copy holds the same values as data[feature], so permuting either is equivalent
    new_data[feature] = np.random.permutation(data[feature])
    create_bins(X, y, new_data)

    


# ### Boosted Decision Trees

# In[44]:


from sklearn.ensemble import GradientBoostingRegressor
# NOTE(review): this submodule presumably exists only in older scikit-learn
# releases (newer ones expose partial-dependence plotting under
# sklearn.inspection) — confirm against the installed version.
from sklearn.ensemble.partial_dependence import plot_partial_dependence


# In[45]:


# Squared-error loss is the regressor's default, so it is left implicit:
# the "ls" spelling is rejected by newer scikit-learn releases, while the
# default works on old and new versions alike.
clf = GradientBoostingRegressor()
clf.fit(X,y)


# In[46]:


# Partial-dependence panels for every feature, three per row, on one 10x10 figure.
plt.close("all")
plt.figure(figsize=[10, 10])
ax = plt.gca()
plot_partial_dependence(
    clf,
    X,
    feature_names,
    feature_names=feature_names,
    n_cols=3,
    ax=ax,
)
plt.tight_layout()
plt.show()


# ### Linear Regression

# In[47]:


from sklearn.linear_model import LinearRegression


# In[48]:


# Ordinary linear regression on the same X and y; fit() returns the estimator itself.
clf2 = LinearRegression().fit(X, y)


# #### Comparison in MSE

# In[49]:


# MSE of the linear regression model on the data it was fitted to.
np.mean(np.square(y - clf2.predict(X)))


# In[50]:


# MSE of the gradient-boosted model on the data it was fitted to.
np.mean(np.square(y - clf.predict(X)))
