In [1]:
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### Problem 1.1

In [2]:
def data_load(string):
    """Read a numeric .csv file with a header row into a NumPy array.

    Warning: there are many flavours of .csv file. This loader assumes
    comma-separated headings in the first row and comma-separated numbers in
    every remaining row. The headings are printed as a side effect.

    Parameters
    ----------
    string : str
        Path of the .csv file to read.

    Returns
    -------
    numpy.ndarray
        Float array with one column per sample (shape: n_fields x n_rows).

    Raises
    ------
    ValueError
        If the file is empty, a field cannot be parsed as a float, or the
        data rows do not all have the same number of fields.
    """
    csv_path = string
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires. The file is closed when the `with` block exits.
    with open(csv_path, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        headings = next(reader, None)
        if headings is None:
            raise ValueError("csv file {!r} is empty".format(csv_path))
        print ("Reading csv file with headers:\n  ","\n   ".join(headings),"\n")
        # csv.reader yields [] for blank lines (e.g. a trailing newline at the
        # end of the file); keeping them would make the array ragged.
        x = [row for row in reader if row]
    return(np.array(x,dtype=float).T)   # returns data with one column for each multidimensional sample

In [3]:
# Load the HTRU_2 table. data_load returns the transposed array, i.e. one
# column per sample and one row per csv field.
x = data_load("HTRU_2.csv")

Reading csv file with headers:
   MeanIP
   StdIP
   KurtosisIP
   SkewIP
   MeanDM
   StdDM
   KurtosisDM
   SkewDM
   Class 



In [4]:
# Flip back to one row per sample so the cells below can slice columns 0-7 as
# features and column 8 as the class label.
# NOTE(review): this cell is not idempotent -- running it a second time flips x
# back to (fields, samples) and breaks the column slicing in later cells.
x = x.transpose()

In [5]:
# 5-fold splitter shared by every model evaluation below.
# NOTE(review): KFold does not shuffle by default and does not stratify by
# class, so each fold is a contiguous block of rows in file order. With a
# heavily imbalanced label this may explain the large fold-to-fold std of
# precision/recall -- consider StratifiedKFold or shuffle=True with a fixed seed.
kf = KFold(n_splits=5)

In [6]:
# One summary row per (model, hyper-parameter) configuration, appended in the
# order: SVM grid, decision-tree grid, random-forest grid.
table = []

# Fixed seed for the tree-based models so that Restart & Run All reproduces
# the same numbers (tree building and bootstrapping are randomized otherwise).
RANDOM_STATE = 0


def cross_validate(clf, data, splitter):
    """Fit and score `clf` on every fold produced by `splitter`.

    Parameters
    ----------
    clf : estimator with fit(X, y) / predict(X)
        Refit from scratch on the training part of each fold.
    data : numpy.ndarray
        One row per sample; columns 0-7 are used as features and column 8
        as the binary class label.
    splitter : object with split(data)
        Yields (train_index, test_index) pairs, e.g. a KFold instance.

    Returns
    -------
    list of float
        [mean accuracy, mean precision, mean recall,
         accuracy std, precision std, recall std] over the folds.
    """
    accuracy, precision, recall = [], [], []
    for train_index, test_index in splitter.split(data):
        clf.fit(data[train_index, :8], data[train_index, 8])
        y_true = data[test_index, 8]
        y_pred = clf.predict(data[test_index, :8])
        accuracy.append(accuracy_score(y_true, y_pred))
        precision.append(precision_score(y_true, y_pred))
        recall.append(recall_score(y_true, y_pred))
    return [np.mean(accuracy), np.mean(precision), np.mean(recall),
            np.std(accuracy), np.std(precision), np.std(recall)]


# SVM
SVM_pa_C = [0.1, 1, 10]
for pa_C in SVM_pa_C:
    table.append(cross_validate(LinearSVC(C=pa_C, dual=False), x, kf))

# Decision Tree
dt_max_depth = [3,4,6]
for pa_max_depth in dt_max_depth:
    clf = DecisionTreeClassifier(max_depth=pa_max_depth, random_state=RANDOM_STATE)
    table.append(cross_validate(clf, x, kf))

# Random Forest
num_tree = [5,11,13]
for pa_num_tree in num_tree:
    clf = RandomForestClassifier(n_estimators=pa_num_tree, max_depth=5,
                                 random_state=RANDOM_STATE)
    table.append(cross_validate(clf, x, kf))

In [7]:
# Column headers: the three fold means first, then the three fold standard
# deviations -- the same order in which each row of `table` was assembled.
col_name = (['mean {}'.format(metric) for metric in ('accuracy', 'precision', 'recall')]
            + ['{} std'.format(metric) for metric in ('accuracy', 'precision', 'recall')])

# Row labels, in the order the rows were appended to `table`:
# linear SVM grid, then decision-tree grid, then random-forest grid.
idx_name = (['Linear SVM with C = {}'.format(c) for c in (0.1, 1, 10)]
            + ['Decision Tree with max_depth = {}'.format(depth) for depth in (3, 4, 6)]
            + ['Random Forest with num_tree = {}'.format(trees) for trees in (5, 11, 13)])

# Last expression of the cell, so the notebook renders the frame.
pd.DataFrame(data=table, index=idx_name, columns=col_name)

Unnamed: 0,mean accuracy,mean precision,mean recall,accuracy std,precision std,recall std
Linear SVM with C = 0.1,0.977373,0.897046,0.780514,0.008142,0.111252,0.063226
Linear SVM with C = 1,0.977708,0.898326,0.782744,0.008086,0.111765,0.0645
Linear SVM with C = 10,0.977708,0.898326,0.782744,0.008086,0.111765,0.0645
Decision Tree with max_depth = 3,0.977987,0.848144,0.840712,0.006209,0.120281,0.042894
Decision Tree with max_depth = 4,0.978602,0.866008,0.823791,0.006496,0.117845,0.05656
Decision Tree with max_depth = 6,0.978657,0.861063,0.816999,0.005731,0.127188,0.076631
Random Forest with num_tree = 5,0.977317,0.873871,0.781326,0.006173,0.133071,0.086046
Random Forest with num_tree = 11,0.977372,0.880527,0.788608,0.006907,0.107719,0.077544
Random Forest with num_tree = 13,0.977987,0.883338,0.793359,0.006522,0.111242,0.071455


### Problem 1.2

$
\text{In this dataset, there are $1639$ positive examples and $16259$ negative examples, so the ratio of negative examples is}\\
\text{approximately $90.84\%$, which is really high. As a result, even if we blindly predict each example to be negative, we can}\\
\text{still get an accuracy of $0.9084$ that is higher than all the mean precisions and mean recalls in the above table. In other}\\
\text{words, it is really easy to get true negatives but hard to get true positives. Since precision and recall are computed only}\\
\text{over the predicted positives and the actual positives, respectively, whereas accuracy is computed over all examples, of which}\\
\text{about $90.84\%$ are negative, it is reasonable that accuracy has a high value compared to precision and recall.}
$

### Problem 1.3

$
\text{In my opinion, the decision-tree classifier with max_depth=3 performs the best. Accuracy of each classifier is really close to}\\
\text{each other and really high, and it is relatively easy to get a high accuracy according to problem 1.2. Thus, mean accuracy should}\\
\text{not be the key metric for judging which classifier is the best. Since the proportion of positive examples in this dataset is just about}\\
\text{$9.16\%$, which is really low, the trained classifier will tend to predict negative. Therefore, I think the capability to have the}\\ 
\text{actual positive examples predicted as positive should be emphasized. In other words, mean recall should be the key metric to judge}\\
\text{which classifier is the best. As a result, the decision-tree classifier with max_depth=3 should be the best one, because it has the}\\ 
\text{highest mean recall value and its mean recall value is considerably higher than other classifiers' mean recall values. Even though}\\ 
\text{the mean precision of the decision-tree classifier with max_depth=3 is the lowest among these classifiers, it is still high enough}\\ 
\text{to allow it to be the best. To conclude, the decision-tree classifier with max_depth=3 performs the best, because it has the highest}\\
\text{mean recall value, equivalently, the strongest capability to have the actual positive examples predicted as positive.}
$

### Problem 2.1

$
z_1 = x\cdot w_1 + b_1,\ a_1 = \sigma(z_1)\\
z_4 = a_1\cdot w_4 + a_2\cdot w_5 + a_3\cdot w_6 + b_4,\ a_4 = \sigma(z_4)\\
\hat{y} = a_4 = \sigma(z_4) = \sigma(a_1\cdot w_4 + a_2\cdot w_5 + a_3\cdot w_6 + b_4)\\
\ \ = \sigma(\sigma(z_1)\cdot w_4 + \sigma(z_2)\cdot w_5 + \sigma(z_3)\cdot w_6 + b_4)\\
\ \ = \sigma(\sigma(x\cdot w_1 + b_1)\cdot w_4 + \sigma(x\cdot w_2 + b_2)\cdot w_5 
+ \sigma(x\cdot w_3 + b_3)\cdot w_6 + b_4)\\
$

### Problem 2.2

$
L_i = (\hat{y}_i - y_i)^2\\
\frac{\partial L_i}{\partial w_1} = \frac{\partial L_i}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial a_4}
\frac{\partial a_4}{\partial z_{4i}} \frac{\partial z_{4i}}{\partial a_1} \frac{\partial a_1}{\partial z_{1i}} 
\frac{\partial z_{1i}}{\partial w_1}\\
\frac{\partial L_i}{\partial w_4} = \frac{\partial L_i}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial a_4}
\frac{\partial a_4}{\partial z_{4i}} \frac{\partial z_{4i}}{\partial w_4}\\
\frac{\partial L_i}{\partial b_1} = \frac{\partial L_i}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial a_4}
\frac{\partial a_4}{\partial z_{4i}} \frac{\partial z_{4i}}{\partial a_1} \frac{\partial a_1}{\partial z_{1i}} 
\frac{\partial z_{1i}}{\partial b_1}\\
\frac{\partial L_i}{\partial b_4} = \frac{\partial L_i}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial a_4}
\frac{\partial a_4}{\partial z_{4i}} \frac{\partial z_{4i}}{\partial b_4}\\
$

$
\because \frac{\partial L_i}{\partial \hat{y}_i} = 2 (\hat{y}_i-y_i),\
\frac{\partial \hat{y}_i}{\partial a_4} = 1\\
\because \frac{\partial a_4}{\partial z_{4i}} = \sigma(z_{4i})(1-\sigma(z_{4i})),\
\frac{\partial z_{4i}}{\partial a_1} = w_4\\
\because \frac{\partial a_1}{\partial z_{1i}} = \sigma(z_{1i})(1-\sigma(z_{1i})),\
\frac{\partial z_{1i}}{\partial w_1} = x_i\\
\because \frac{\partial z_{4i}}{\partial w_4} = a_1 = \sigma(z_{1i}),\
\frac{\partial z_{1i}}{\partial b_1} = 1,\
\frac{\partial z_{4i}}{\partial b_4} = 1
$

$
\therefore \frac{\partial L_i}{\partial w_1} = 2 (\hat{y}_i-y_i)\cdot \sigma(z_{4i})(1-\sigma(z_{4i}))\cdot w_4\cdot 
\sigma(z_{1i})(1-\sigma(z_{1i}))\cdot x_i\\
\therefore \frac{\partial L_i}{\partial w_4} = 2 (\hat{y}_i-y_i)\cdot \sigma(z_{4i})(1-\sigma(z_{4i}))\cdot \sigma(z_{1i})\\
\therefore \frac{\partial L_i}{\partial b_1} = 2 (\hat{y}_i-y_i)\cdot \sigma(z_{4i})(1-\sigma(z_{4i}))\cdot w_4\cdot 
\sigma(z_{1i})(1-\sigma(z_{1i}))\\
\therefore \frac{\partial L_i}{\partial b_4} = 2 (\hat{y}_i-y_i)\cdot \sigma(z_{4i})(1-\sigma(z_{4i}))
$

### Problem 2.3

$
w_1^+ = w_1 - \eta \frac{\partial L_1}{\partial w_1}\\
\text{from Problem 2.2, we have}\\
w_1^+ = w_1 - \eta \frac{\partial L_1}{\partial \hat{y}_1} \frac{\partial \hat{y}_1}{\partial a_4}
\frac{\partial a_4}{\partial z_{41}} \frac{\partial z_{41}}{\partial a_1} \frac{\partial a_1}{\partial z_{11}} 
\frac{\partial z_{11}}{\partial w_1}\\
\ \ \ \ \ = w_1 - \eta \cdot [2 (\hat{y}_1-y_1)\cdot \sigma(z_{41})(1-\sigma(z_{41}))\cdot w_4\cdot 
\sigma(z_{11})(1-\sigma(z_{11}))\cdot x_1]
$

### Problem 2.4

$
w_1^+ = w_1 - \eta \frac{\partial L}{\partial w_1} = w_1 - \eta \cdot \frac{1}{n} \sum_{i=1}^{n}\frac{\partial L_i}{\partial w_1}\\
\text{from Problem 2.2, we have}\\
w_1^+ = w_1 - \eta \cdot \frac{1}{n} \sum_{i=1}^{n}\frac{\partial L_i}{\partial w_1}
= w_1 - \eta \cdot \frac{1}{n} \sum_{i=1}^{n}
\frac{\partial L_i}{\partial \hat{y}_i} \frac{\partial \hat{y}_i}{\partial a_4}
\frac{\partial a_4}{\partial z_{4i}} \frac{\partial z_{4i}}{\partial a_1} \frac{\partial a_1}{\partial z_{1i}} 
\frac{\partial z_{1i}}{\partial w_1}\\
\ \ \ \ \ = w_1 - \frac{2\eta}{n} \sum_{i=1}^{n}  (\hat{y}_i-y_i)\cdot \sigma(z_{4i})(1-\sigma(z_{4i}))\cdot w_4\cdot 
\sigma(z_{1i})(1-\sigma(z_{1i}))\cdot x_i
$

### Problem 2.5

$
\text{Sigmoid activation}\\
\bullet \text{advantage: It cannot blow up the activations, because the output of the sigmoid function is bounded in $(0,1)$.}\\
\bullet \text{disadvantage: It has the vanishing gradient problem in deep networks, because the derivative of sigmoid function is}\\
\ \ \ \text{almost zero when the input is greater than $6$ or less than $-6$.}\\
\text{ReLU activation}\\
\bullet \text{advantage: It is more computationally efficient than sigmoid activation, because it just needs to compute $\max(0,z)$,}\\
\ \ \ \text{instead of performing expensive exponential computations as in sigmoid activation.}\\
\bullet \text{disadvantage: It can blow up the activations, because its output is unbounded above and can be arbitrarily large.}
$