In [11]:
import numpy as np
from scipy.stats import norm
from scipy.stats import multivariate_normal
# Toy dataset: ten observations (rows) with three integer features (columns).
data = np.array([
    [1, 1, 0],
    [1, 1, 5],
    [0, 2, 4],
    [1, 2, 3],
    [2, 0, 7],
    [1, 1, 1],
    [2, 0, 2],
    [0, 2, 9],
    [2, 0, 0],
    [1, 2, 1]
])
# One integer target per observation, in the same row order as `data`.
target = np.array([1, 3, 2, 0, 6, 4, 5, 7, 2, 4])

# The first N_TRAIN rows are used for training, the remaining rows for testing.
N_TRAIN = 8

# Basic slicing returns views that share memory with `data` / `target`.
# The decision-tree section overwrites the training arrays in place, so take
# copies here to keep the source arrays (and any re-run of this cell's
# consumers) unaffected by those edits.
data_train = data[:N_TRAIN, :].copy()
data_test = data[N_TRAIN:, :].copy()
target_train = target[:N_TRAIN].copy()
target_test = target[N_TRAIN:].copy()

# Alternative two-feature dataset, kept disabled (uses every row for training).
#data = np.array([[41.9, 29.1],[43.4, 29.3],[43.9, 29.5],[44.5, 29.7],[47.3, 29.9],[47.5, 30.3],[47.9, 30.5],[50.2, 30.7],[52.8, 30.8],[53.2, 30.9],[56.7, 31.5],[57.0, 31.7],[63.5, 31.9],[65.3, 32.0],[71.1, 32.1],[77.0, 32.5],[77.8, 32.9]])
#target = np.array([251.3, 251.3, 248.3, 267.5, 273.0, 276.5, 270.3, 274.9, 285.0, 290.0, 297.0, 302.5, 304.5, 309.3, 321.7, 330.7, 349.0])
#data_train = data
#target_train = target

# Polynomial Regression

## Basis Function

In [12]:
def basis_function(data,i,j):
    """Evaluate the j-th polynomial basis function on sample ``i``.

    Parameters
    ----------
    data : indexable collection of samples; ``data[i]`` is the feature vector.
    i : index of the sample to evaluate.
    j : exponent (index of the basis function).

    Returns
    -------
    The integer ``1`` when ``j == 0`` (bias term); otherwise the Euclidean
    norm of ``data[i]`` raised to the power ``j``.
    """
    if j != 0:
        # Each sample is reduced to a single scalar (its L2 norm) first.
        sample_norm = np.linalg.norm(data[i])
        return sample_norm ** j
    return 1

## Design Matrix

In [13]:
M = 3  # highest polynomial degree; the design matrix has M + 1 columns
n = len(data_train)  # number of training samples (rows of the design matrix)

# Row r holds basis_function(data_train, r, p) for p = 0..M. Entries are
# rounded to 3 decimals and stored as float32, one row per training sample.
design_matrix = np.round(
    np.array([
        [basis_function(data_train, row, power) for power in range(M + 1)]
        for row in range(n)
    ]),
    decimals=3,
).astype(np.float32)
print("Design Matrix\n",design_matrix)

Design Matrix
 [[  1.      1.414   2.      2.828]
 [  1.      5.196  27.    140.296]
 [  1.      4.472  20.     89.443]
 [  1.      3.742  14.     52.383]
 [  1.      7.28   53.    385.846]
 [  1.      1.732   3.      5.196]
 [  1.      2.828   8.     22.627]
 [  1.      9.22   85.    783.661]]


## Weights

In [14]:
# Ordinary least-squares fit of the polynomial weights: minimise
# ||design_matrix @ w - target_train||^2.
#
# The previous version pre-allocated a placeholder `weights` array (of length n,
# not M + 1) that was immediately overwritten, and solved the normal equations
# by explicitly inverting X^T X in float32. Forming and inverting X^T X squares
# the condition number of the problem, which is costly in single precision, so
# the system is solved directly with a least-squares routine in float64 instead.
weights, *_ = np.linalg.lstsq(design_matrix.astype(np.float64), target_train, rcond=None)
# Keep 4 decimals, as before, so the reported weights stay easy to compare.
weights = np.round(weights,decimals=4)
print("w -> ", weights)

w ->  [ 4.5939 -1.6959  0.3396 -0.0134]


In [15]:
import math
# Root-mean-square error of the fitted polynomial on the training samples.
# Basis values are rounded to 3 decimals (as in the design matrix) and each
# squared residual is rounded to 3 decimals before being accumulated.
squared_errors = []
for sample in range(len(data_train)):
    phi = np.round(
        np.array([basis_function(data_train, sample, power) for power in range(M+1)]),
        decimals=3,
    )
    residual = target_train[sample] - np.matmul(phi.T, weights)
    squared_errors.append(np.round(residual**2,decimals=3))
# sum() starts from 0 and adds left to right, i.e. the same accumulation order
# as a running total initialised to 0.
RMSE = math.sqrt(sum(squared_errors)/n)
print("RMSE ->", RMSE)

RMSE -> 1.5697133496278868


# Decision Tree

In [16]:
# Binarise the training set for the decision tree:
#   * target        -> 0 if the target is below 4, else 1
#   * third feature -> 0 if it is <= its training median, else 1
#
# The previous version edited `data_train` / `target_train` element by element
# in place. Because those arrays were slices of `data` / `target`, the edits
# also rewrote the source arrays, and running the cell a second time turned
# every target into 0 (all already-binarised values are < 4). The arrays are
# now re-derived from the source data and rebound to fresh copies, so the
# source arrays stay intact and the cell can be re-run safely.
n_train = len(data_train)  # the training set is the first n_train rows of `data`
data_train = data[:n_train, :].copy()

split = np.median(data_train[:,2])
# `>= 4` is the complement of `< 4`; astype keeps the original integer dtype.
target_train = (target[:n_train] >= 4).astype(target.dtype)
# `> split` is the complement of `<= split`.
data_train[:,2] = (data_train[:,2] > split).astype(data_train.dtype)

In [17]:
from math import log2
from scipy.stats import entropy

def _label_entropy(labels):
    """Return the Shannon entropy (in bits) of the empirical label distribution."""
    _, counts = np.unique(labels, return_counts=True)
    if len(counts) <= 1:
        # An empty or single-class subset carries no uncertainty.
        return np.float64(0.0)
    return entropy(counts / len(labels), base=2)


def select_feature(data_train,target_train, variables):
    """Pick the feature with the highest information gain.

    The information gain of feature ``i`` is ``H(target) - H(target | feature i)``
    with entropies measured in bits. Class labels are taken from the values
    actually present in ``target_train`` (they are not restricted to 0/1).

    Parameters
    ----------
    data_train : 2-D array-like, one row per sample and one column per feature.
    target_train : 1-D array-like of class labels, one per sample.
    variables : list where ``variables[i]`` lists the candidate values of
        column ``i``. Values that match no sample contribute nothing.

    Returns
    -------
    tuple ``(index, gain)``: the index of the first feature reaching the
    maximum information gain, and that gain.

    Side effects
    ------------
    Prints the list of information gains, one entry per feature.

    Raises
    ------
    ValueError
        If ``variables`` is empty (there is no feature to choose from).
    """
    data_train = np.asarray(data_train)
    target_train = np.asarray(target_train)
    n_samples = len(target_train)

    c_entropy = _label_entropy(target_train)

    InfoG = []
    for i, values in enumerate(variables):
        column = data_train[:, i]
        cond_entropy = 0
        for cl in values:
            mask = column == cl
            n_match = np.count_nonzero(mask)
            if n_match == 0:
                # No sample takes this value: its weight is zero, and skipping
                # it avoids dividing by an empty subset.
                continue
            cond_entropy += (n_match / n_samples) * _label_entropy(target_train[mask])
        InfoG.append(c_entropy - cond_entropy)
    print(InfoG)
    best_gain = max(InfoG)
    # list.index returns the first maximal entry, so ties go to the lowest index.
    return InfoG.index(best_gain), best_gain

def get_variables(data):
    """List the distinct values of every column of ``data``.

    Parameters
    ----------
    data : 2-D array indexed as ``data[row, column]``.

    Returns
    -------
    list of lists: entry ``i`` holds the distinct values of column ``i`` in
    order of first appearance.
    """
    n_columns = len(data[0])
    variables = []
    for col in range(n_columns):
        seen = []
        for value in data[:, col]:
            if value not in seen:
                seen.append(value)
        variables.append(seen)
    return variables

# Collect the distinct values of each training column, then rank the features
# by information gain. `res` is the (feature index, gain) pair returned by
# select_feature, which also prints the per-feature gains itself.
variables = get_variables(data_train)
res = select_feature(data_train, target_train, variables=variables)
print(res)


[0.34436093777043353, 0.31127812445913294, 0.0]
(0, 0.34436093777043353)
