In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)


# Directory with the CSV datasets; the path is relative, so it is resolved
# against the working directory the notebook is run from.
DATASETS_DIR = Path('datasets')


def load_cancer_dataset():
    """Read ``cancer.csv`` and separate the ``label`` column from the rest.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays, where ``y`` is the ``label`` column
        and ``X`` contains every other column.
    """
    frame = pd.read_csv(DATASETS_DIR / 'cancer.csv')
    labels = frame['label'].to_numpy()
    features = frame.drop(columns='label').to_numpy()
    return features, labels


def load_blobs_dataset():
    """Read ``blobs.csv`` and return all of its columns as one NumPy array."""
    frame = pd.read_csv(DATASETS_DIR / 'blobs.csv')
    points = frame.to_numpy()
    return points


def load_spam_dataset():
    """Read ``spam.csv`` and separate the ``label`` column from the rest.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays, where ``y`` is the ``label`` column
        and ``X`` contains every other column.
    """
    frame = pd.read_csv(DATASETS_DIR / 'spam.csv')
    labels = frame['label'].to_numpy()
    features = frame.drop(columns='label').to_numpy()
    return features, labels

In [2]:
def draw_table(title, rows, columns, data, layout_height=None):
    """Render a Plotly table in the notebook.

    Args:
        title: Figure title.
        rows: Row captions, shown in an extra first column. Pass an empty
            sequence or ``None`` to omit that column.
        columns: Column captions.
        data: Cell values: either a single row (1-D, one item per column) or
            a 2-D ``n_rows x n_columns`` sequence.
        layout_height: Optional figure height forwarded to ``go.Layout``.
    """
    # Promote a single 1-D row to shape (1, n_columns) first. After the
    # transpose every entry is a proper column vector, so NumPy never has to
    # build a ragged array out of the row captions and scalar cells.
    cell_columns = np.atleast_2d(np.asarray(data)).T

    if rows is not None and len(rows) > 0:
        header = np.array(['', *columns])
        cells = np.array([list(rows), *cell_columns])
    else:
        header = np.array(columns)
        cells = cell_columns

    layout = go.Layout(title=title, height=layout_height)
    table = go.Table(
        header=dict(
            values=header.reshape(-1, 1),
            align=['center']
        ),
        cells=dict(
            # -1 lets NumPy infer the number of table rows, which also works
            # when no row captions were given.
            values=cells.reshape(len(header), -1, 1),
            align=['center']
        )
    )

    figure = go.Figure(data=[table], layout=layout)
    py.iplot(figure)

In [3]:
def min_max_scale(X):
    """Linearly rescale every column of ``X`` to the ``[0, 1]`` range.

    Args:
        X: Array-like of numbers; columns are scaled independently.

    Returns:
        np.ndarray: Float array of the same shape as ``X``. A column whose
        values are all equal maps to zeros instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    low = X.min(axis=0)
    span = X.max(axis=0) - low
    # A constant column has zero span; dividing by 1 keeps it at 0 rather
    # than producing 0/0 = NaN, which would poison every distance later on.
    span = np.where(span == 0, 1.0, span)
    return (X - low) / span

# kNN with Leave-one-out error

In [4]:
def knn_loo_err(X, Y, k, scaler=None):
    """Leave-one-out error of k-nearest-neighbours for every ``k'`` in ``1..k``.

    Each sample is classified by majority vote among its ``k'`` nearest
    *other* samples (Euclidean distance). Vote ties go to the smallest
    encoded label.

    Args:
        X: ``(n_samples, n_features)`` array.
        Y: Labels of any sortable type, one per sample.
        k: Largest neighbourhood size to evaluate.
        scaler: Optional callable applied to ``X`` before distances are
            computed.

    Returns:
        np.ndarray: ``k`` error rates; entry ``i`` is the error for ``i + 1``
        neighbours.

    Raises:
        ValueError: If there are not more than ``k`` samples.
    """
    X = np.asarray(X)
    if scaler is not None:
        X = scaler(X)
    _, Y = np.unique(Y, return_inverse=True)

    n_samples = X.shape[0]
    if k >= n_samples:
        raise ValueError(f'k={k} needs more than {k} samples, got {n_samples}')
    n_classes = Y.max() + 1

    hits = np.zeros(k)
    for idx in range(n_samples):
        dist = np.linalg.norm(X - X[idx], axis=1)
        # Exclude the held-out sample by index. Dropping "the nearest point"
        # instead can drop a duplicate and let the sample vote for itself.
        dist[idx] = np.inf
        nearest = np.argpartition(dist, k - 1)[:k]
        nearest = nearest[np.argsort(dist[nearest], kind='stable')]

        # Add one neighbour at a time so all k' = 1..k are scored in one pass.
        votes = np.zeros(n_classes, dtype=int)
        for i, label in enumerate(Y[nearest]):
            votes[label] += 1
            hits[i] += votes.argmax() == Y[idx]
    return 1 - hits / n_samples

In [5]:
# Tabulate the leave-one-out kNN error for k = 1..max_k on both labelled
# datasets, first on the raw features and then after min-max scaling.
max_k = 10
k = np.arange(1, max_k + 1)

loo_experiments = [
    ('LOO for Cancer Dataset', load_cancer_dataset, None),
    ('LOO for Spam Dataset', load_spam_dataset, None),
    ('LOO for normalized Cancer Dataset', load_cancer_dataset, min_max_scale),
    ('LOO for normalized Spam Dataset', load_spam_dataset, min_max_scale),
]
for table_title, loader, scaler in loo_experiments:
    data = [f'{err:.5f}' for err in knn_loo_err(*loader(), max_k, scaler=scaler)]
    draw_table(table_title, ['LOO error'], k, data, layout_height=250)

# Clustering

In [6]:
def draw_clusters(title, X, clusters):
    """Scatter-plot the first two columns of ``X`` with one trace per cluster.

    Args:
        title: Figure title.
        X: 2-D array of points; columns 0 and 1 are used as x and y.
        clusters: Cluster id per point. Ids greater than 0 are named
            ``cluster #<id>``; every other id is named ``noise``.
    """
    traces = []
    for cluster_id in np.unique(clusters):
        members = X[clusters == cluster_id]
        if cluster_id > 0:
            trace_name = f'cluster #{cluster_id}'
        else:
            trace_name = 'noise'
        traces.append(
            go.Scatter(x=members[:, 0], y=members[:, 1], mode='markers', name=trace_name)
        )

    figure = go.Figure(data=traces, layout=go.Layout(title=title))
    py.iplot(figure)

In [7]:
def get_loss(X, clusters):
    """Within-cluster sum of squared distances to the cluster centroids.

    Args:
        X: ``(n_samples, n_features)`` array of points.
        clusters: Cluster id per point. Ids may be arbitrary integers; they
            do not have to be the contiguous range ``0..k-1``.

    Returns:
        float: Sum over all points of the squared Euclidean distance to the
        mean of the point's cluster (0 for empty input).
    """
    X = np.asarray(X, dtype=float)
    if len(X) == 0:
        return 0.0
    points = X.reshape(len(X), -1)

    # Re-index the ids to 0..k-1 so gaps (e.g. a cluster that ended up
    # empty) cannot cause an out-of-range lookup or a 0/0 centroid.
    _, member_of = np.unique(np.asarray(clusters).ravel(), return_inverse=True)
    counts = np.bincount(member_of)
    sums = np.zeros((len(counts), points.shape[1]))
    np.add.at(sums, member_of, points)
    centers = sums / counts[:, np.newaxis]

    return float(((points - centers[member_of]) ** 2).sum())

### k-means

In [8]:
def k_means(X, k, verbose=False, random_state=None):
    """Cluster ``X`` into ``k`` groups with a sequential (one point at a time) k-means.

    Every iteration picks one random point and moves it to the cluster with
    the nearest centroid, updating running per-cluster sums and counts. Every
    1000 iterations the loss is recomputed, and the loop stops once it has
    improved by less than ``1e-3`` since the previous check.

    Args:
        X: ``(n_samples, n_features)`` array of points.
        k: Number of clusters, ``1 <= k <= n_samples``.
        verbose: Print the loss at every convergence check.
        random_state: Optional seed for a private ``RandomState``. With the
            default ``None`` the global ``np.random`` state is used.

    Returns:
        np.ndarray: Cluster id in ``0..k-1`` for every point. Every id is
        used at least once.

    Raises:
        ValueError: If ``k`` is not in ``[1, n_samples]``.
    """
    X = np.asarray(X, dtype=float)
    N = len(X)
    if not 1 <= k <= N:
        raise ValueError(f'k must be in [1, {N}], got {k}')
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    clusters = rng.randint(low=0, high=k, size=N)
    # Pin k distinct points to k distinct clusters so that no cluster starts
    # empty; an empty cluster would have a 0/0 centroid.
    clusters[rng.choice(N, size=k, replace=False)] = np.arange(k)

    num_of_points = np.bincount(clusters, minlength=k).reshape(k, 1)
    sum_of_mass = np.array([X[clusters == c].sum(axis=0) for c in range(k)])

    iteration = 0
    loss = get_loss(X, clusters)
    while True:
        iteration += 1
        if iteration % 1000 == 0:
            new_loss = get_loss(X, clusters)
            if verbose:
                print(f'clusters={k}, iteration={iteration}, loss={new_loss}')
            if loss - new_loss < 1e-3:
                break
            loss = new_loss

        i = rng.choice(N)
        x = X[i]
        source = clusters[i]

        dist = np.linalg.norm(sum_of_mass / num_of_points - x, axis=1)
        target = np.argmin(dist)

        # Never move the last member out of a cluster: that keeps every
        # centroid defined and the ids contiguous.
        if target != source and num_of_points[source, 0] > 1:
            sum_of_mass[source] -= x
            num_of_points[source] -= 1
            clusters[i] = target
            sum_of_mass[target] += x
            num_of_points[target] += 1

    return clusters


def test_k_means(X):
    """Run verbose k-means on ``X`` for 2, 3, 4 and 5 clusters and plot each result.

    Cluster ids are shifted by one before plotting, so they start at 1.
    """
    for n_clusters in (2, 3, 4, 5):
        assignment = k_means(X, n_clusters, True)
        plot_title = f'k-means for {n_clusters} clusters'
        draw_clusters(plot_title, X, assignment + 1)

In [9]:
# Cluster the blobs dataset with k-means for k = 2..5 and plot every result.
test_k_means(load_blobs_dataset())

clusters=2, iteration=1000, loss=439.46488821005056
clusters=2, iteration=2000, loss=386.0793728414442
clusters=2, iteration=3000, loss=385.87887619451396
clusters=2, iteration=4000, loss=385.32288453516503
clusters=2, iteration=5000, loss=385.32288453516503


clusters=3, iteration=1000, loss=302.08057733554074
clusters=3, iteration=2000, loss=207.66438764923166
clusters=3, iteration=3000, loss=199.3042894827296
clusters=3, iteration=4000, loss=199.23568023168872
clusters=3, iteration=5000, loss=199.23568023168872


clusters=4, iteration=1000, loss=241.9047714869347
clusters=4, iteration=2000, loss=142.5121931792548
clusters=4, iteration=3000, loss=135.59744710091113
clusters=4, iteration=4000, loss=135.59744710091113


clusters=5, iteration=1000, loss=174.97774766433244
clusters=5, iteration=2000, loss=122.82193605722415
clusters=5, iteration=3000, loss=119.50586473470594
clusters=5, iteration=4000, loss=119.15438680579133
clusters=5, iteration=5000, loss=119.07651478315519
clusters=5, iteration=6000, loss=118.97446687499252
clusters=5, iteration=7000, loss=118.63669570242217
clusters=5, iteration=8000, loss=118.45226729308662
clusters=5, iteration=9000, loss=118.45226729308662


### DBSCAN

In [10]:
# Edge codes used in the neighbourhood graph built by dbscan():
# graph[v, u] == CONNECTED          -> u is a core point within eps of v,
# graph[v, u] == DENSITY_CONNECTED  -> u is a non-core point within eps of v,
# graph[v, u] == 0                  -> u is not within eps of v.
CONNECTED = 1
DENSITY_CONNECTED = 2

def select_cluster(v, graph, clusters, cluster_num):
    """Label everything reachable from core point ``v`` with ``cluster_num``.

    Follows ``CONNECTED`` edges transitively and labels the targets of
    ``DENSITY_CONNECTED`` edges without expanding them. The traversal uses an
    explicit stack, so large clusters cannot hit Python's recursion limit.

    Args:
        v: Index of the starting point.
        graph: ``(N, N)`` NumPy array of edge codes.
        clusters: Length-``N`` NumPy array of cluster ids, modified in place.
        cluster_num: Id to assign.
    """
    clusters[v] = cluster_num
    pending = [v]
    while pending:
        row = graph[pending.pop()]
        unlabelled = clusters != cluster_num

        # Non-core neighbours join the cluster but are not expanded further.
        clusters[(row == DENSITY_CONNECTED) & unlabelled] = cluster_num

        # Core neighbours are labelled right away, so each one is pushed
        # (and later expanded) exactly once.
        new_cores = np.flatnonzero((row == CONNECTED) & unlabelled)
        clusters[new_cores] = cluster_num
        pending.extend(new_cores.tolist())


def dbscan(X, eps, m):
    """Density-based clustering of the rows of ``X``.

    A point is a core point when strictly more than ``m`` points (itself
    included) lie closer than ``eps`` to it. Clusters are grown from core
    points with ``select_cluster``.

    Args:
        X: ``(n_samples, n_features)`` array of points.
        eps: Neighbourhood radius (strict ``<`` comparison).
        m: Neighbour-count threshold for core points.

    Returns:
        np.ndarray: ``int32`` cluster id per point; ids start at 1 and 0
        marks points that were not assigned to any cluster.
    """
    X = np.asarray(X, dtype=float)
    N = len(X)

    # Compute each row of distances once and reuse it for both the core
    # test and the graph.
    within_eps = np.zeros((N, N), dtype=bool)
    for v in range(N):
        within_eps[v] = np.linalg.norm(X - X[v], axis=1) < eps
    is_core = within_eps.sum(axis=1) > m

    # The edge code depends on whether the *target* point u is a core point.
    graph = np.zeros((N, N), dtype=np.int8)
    graph[within_eps & is_core[np.newaxis, :]] = CONNECTED
    graph[within_eps & ~is_core[np.newaxis, :]] = DENSITY_CONNECTED

    clusters = np.zeros(N, dtype=np.int32)
    cluster_num = 0
    for v in range(N):
        if is_core[v] and clusters[v] == 0:
            cluster_num += 1
            select_cluster(v, graph, clusters, cluster_num)

    return clusters

def test_dbscan(X):
    """Run DBSCAN on ``X`` with four preset ``(eps, m)`` pairs and plot each result.

    Each preset is keyed by a cluster count (2-5); the key is informational
    only and is not passed to ``dbscan``.
    NOTE(review): the presets look like the outcome of an offline grid search
    over eps in [0.2, 0.3] and m in 5..29 on the blobs data. Confirm before
    reusing them on other data.
    """
    presets = {
        2: (0.215, 6),
        3: (0.295, 15),
        4: (0.22, 8),
        5: (0.2975, 18),
    }
    for eps, m in presets.values():
        assignment = dbscan(X, eps, m)
        draw_clusters(f'dbscan with eps={eps}, m={m}', X, assignment)

In [11]:
# Run DBSCAN on the blobs dataset with the preset (eps, m) pairs and plot the clusters.
test_dbscan(load_blobs_dataset())

### Agglomerative Clustering

In [12]:
def ward_metric(A, B):
    """Ward linkage distance between two clusters of points.

    Uses the closed form ``2 * |A| * |B| / (|A| + |B|) * ||mean(A) - mean(B)||^2``.
    This is the value the Lance-Williams recurrence for Ward linkage yields
    when the distance between two single points is their squared Euclidean
    distance, but it is computed in a single pass over each cluster.

    Args:
        A: ``(a, n_features)`` array of points, ``a >= 1``.
        B: ``(b, n_features)`` array of points, ``b >= 1``.

    Returns:
        float: The Ward distance. For two single points it equals their
        squared Euclidean distance.

    Raises:
        ValueError: If either cluster is empty.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    a, b = len(A), len(B)
    if a == 0 or b == 0:
        raise ValueError('ward_metric needs two non-empty clusters')

    gap = A.mean(axis=0) - B.mean(axis=0)
    return 2 * a * b / (a + b) * np.sum(gap ** 2)


def average_metric(A, B):
    """Mean Euclidean distance over all pairs ``(a, b)`` with ``a`` in ``A`` and ``b`` in ``B``.

    Args:
        A: ``(a, n_features)`` array of points.
        B: ``(b, n_features)`` array of points.

    Returns:
        float: Sum of all pairwise distances divided by ``len(A) * len(B)``.
    """
    total = sum(np.linalg.norm(B - point, axis=1).sum() for point in A)
    return total / len(A) / len(B)


def max_metric(A, B):
    """Largest Euclidean distance between any point of ``A`` and any point of ``B``.

    Args:
        A: ``(a, n_features)`` array of points.
        B: ``(b, n_features)`` array of points, ``b >= 1``.

    Returns:
        The maximum pairwise distance (0 when ``A`` is empty).
    """
    dist = 0
    for a in A:
        # Keep the running maximum; plain assignment would make the result
        # depend only on the last point of A.
        dist = max(dist, np.linalg.norm(B - a, axis=1).max())
    return dist

In [13]:
def agglomerative_clustering(X, metric, k):
    """Bottom-up hierarchical clustering of the rows of ``X`` into ``k`` clusters.

    Starts with one cluster per point and repeatedly merges the two clusters
    with the smallest ``metric`` value until ``k`` clusters remain.

    Args:
        X: ``(n_samples, n_features)`` array of points.
        metric: Callable ``metric(A, B) -> float`` that receives the points
            of two clusters and returns their distance.
        k: Number of clusters to stop at.

    Returns:
        np.ndarray: Integer cluster id in ``0..k-1`` per point, numbered in
        the order of the lowest surviving cluster slot.
    """
    N = len(X)
    clusters = [np.array([i]) for i in range(N)]
    exist = np.ones(N, dtype=bool)

    # The diagonal stays at +inf so a cluster is never merged with itself,
    # whatever the scale of the real distances.
    dist = np.full((N, N), np.inf)
    for i in range(N):
        for j in range(i + 1, N):
            dist[i, j] = dist[j, i] = metric(X[clusters[i]], X[clusters[j]])

    for n in range(N, k, -1):
        alive = np.flatnonzero(exist)  # exactly n entries at this point
        flat = np.argmin(dist[np.ix_(alive, alive)])
        row, col = divmod(flat, n)
        i, j = alive[row], alive[col]

        clusters[i] = np.hstack((clusters[i], clusters[j]))
        exist[j] = False

        # Only distances involving the merged cluster i have changed.
        for other in np.flatnonzero(exist):
            if other != i:
                dist[i, other] = dist[other, i] = metric(X[clusters[i]], X[clusters[other]])

    result = np.zeros(N, dtype=int)
    for label, slot in enumerate(np.flatnonzero(exist)):
        result[clusters[slot]] = label
    return result


def test_agglomerative_clustering(X, metric, metric_name):
    """Run agglomerative clustering on ``X`` for 2, 3, 4 and 5 clusters and plot each result.

    Args:
        X: Points to cluster.
        metric: Cluster distance callable passed to ``agglomerative_clustering``.
        metric_name: Text inserted into the plot titles.
    """
    for n_clusters in (2, 3, 4, 5):
        assignment = agglomerative_clustering(X, metric, n_clusters)
        plot_title = f'agglomerative clustering with {metric_name} metric for {n_clusters} clusters'
        # Shift ids by one so that they start at 1 in the plot.
        draw_clusters(plot_title, X, assignment + 1)

In [14]:
# Agglomerative clustering of the blobs dataset (k = 2..5) with max_metric as the cluster distance.
test_agglomerative_clustering(load_blobs_dataset(), max_metric, '"max"')

In [15]:
# Agglomerative clustering of the blobs dataset (k = 2..5) with average_metric as the cluster distance.
test_agglomerative_clustering(load_blobs_dataset(), average_metric, '"average"')

In [16]:
# Agglomerative clustering of the blobs dataset (k = 2..5) with ward_metric as the cluster distance.
test_agglomerative_clustering(load_blobs_dataset(), ward_metric, '"ward"')

### k-means with purity metric for cancer dataset

In [17]:
def k_means_purity(X, y, ks=(2, 3, 5, 10)):
    """Purity of k-means clusterings of ``X`` with respect to the labels ``y``.

    Purity is the fraction of points that carry the majority label of the
    cluster they were assigned to.

    Args:
        X: ``(n_samples, n_features)`` array of points.
        y: True label per point (any sortable type).
        ks: Cluster counts to evaluate; defaults to ``(2, 3, 5, 10)``.

    Returns:
        list: One purity value per entry of ``ks``, in the same order.
    """
    _, y = np.unique(y, return_inverse=True)
    result = []
    for k in ks:
        clusters = k_means(X, k)
        # Size of the majority class inside each cluster, summed over clusters.
        majority_total = sum(
            np.bincount(y[clusters == c]).max() for c in np.unique(clusters)
        )
        result.append(majority_total / len(X))
    return result

In [18]:
# Purity of k-means clusterings of the cancer dataset, shown as a one-row table.
# The column captions repeat the cluster counts that k_means_purity evaluates.
data = [f'{p:.5f}' for p in k_means_purity(*load_cancer_dataset())]
draw_table('Clusterization purity for Cancer Dataset', ['purity'], [f'{k} clusters' for k in [2, 3, 5, 10]], data, 250)

# Rules and Curves

In [19]:
def draw_roc_curve(fprs, tprs, feature_nums, dataset_name, height=400):
    """Plot ROC curves side by side, one panel per curve.

    The panels share the figure width equally with a gap of 0.05 (in paper
    coordinates) between neighbours, so any small number of curves is laid
    out correctly. With three curves the panels are [0, 0.3], [0.35, 0.65]
    and [0.7, 1.0].

    Args:
        fprs: Sequence of false-positive-rate arrays, one per curve.
        tprs: Sequence of true-positive-rate arrays, one per curve.
        feature_nums: Identifier per curve, used in the legend.
        dataset_name: Name inserted into the figure title.
        height: Figure height in pixels.
    """
    curves = list(zip(fprs, tprs, feature_nums))
    traces = [go.Scatter(x=fpr, y=tpr, mode='lines', name=f'feature #{feature}', xaxis=f'x{i+1}', yaxis=f'y{i+1}')
              for i, (fpr, tpr, feature) in enumerate(curves)]

    n_panels = len(curves)
    gap = 0.05
    panel_width = (1.0 - gap * (n_panels - 1)) / max(n_panels, 1)
    axes = {}
    for i in range(n_panels):
        start = round(i * (panel_width + gap), 6)
        # Clamp the right edge: float round-off must not push it past 1.0,
        # which is outside the valid domain range.
        end = min(round(start + panel_width, 6), 1.0)
        axes[f'xaxis{i+1}'] = dict(domain=[start, end])
        axes[f'yaxis{i+1}'] = dict(domain=[0.0, 1.0], anchor=f'x{i+1}')

    layout = go.Layout(
        title=f'ROC curves for the {dataset_name} dataset',
        width=950,
        height=height,
        **axes
    )
    figure = go.Figure(data=traces, layout=layout)
    py.iplot(figure)


def find_best_features(X, Y, dataset_name):
    """Rank single features by ROC AUC and plot the ROC curves of the best three.

    Each column of ``X`` is used directly as a score for the encoded labels.
    A column's score is ``max(auc, 1 - auc)``, so a feature that separates
    the classes in the opposite direction ranks as high as one that
    separates them directly. The three best features are printed with their
    scores, best first, and drawn with ``draw_roc_curve``.

    Args:
        X: ``(n_samples, n_features)`` array.
        Y: Label per sample.
        dataset_name: Name forwarded to ``draw_roc_curve``.
    """
    _, y = np.unique(Y, return_inverse=True)
    columns = X.T

    scores = np.zeros(len(columns))
    for idx, column in enumerate(columns):
        auc = roc_auc_score(y, column)
        scores[idx] = max(auc, 1 - auc)

    ranking = np.argsort(scores)[::-1]
    fprs, tprs, feature_nums = [], [], []
    for idx in ranking[:3]:
        fpr, tpr, _ = roc_curve(y, columns[idx])
        fprs.append(fpr)
        tprs.append(tpr)
        feature_nums.append(idx)
        print(f'feature #{idx}: auc score={scores[idx]}')
    draw_roc_curve(fprs, tprs, feature_nums, dataset_name)

In [20]:
# Report the three most discriminative single features of each labelled dataset.
for roc_dataset_name, roc_loader in (('Cancer', load_cancer_dataset), ('Spam', load_spam_dataset)):
    find_best_features(*roc_loader(), roc_dataset_name)

feature #22: auc score=0.9754505575815232
feature #20: auc score=0.9704428941387877
feature #23: auc score=0.9698284974367105


feature #51: auc score=0.8290461207554874
feature #55: auc score=0.8041625681254704
feature #54: auc score=0.7882004153012556


# Validation

In [28]:
def dataset_split(dataset_name, X, y, random_state=None):
    """Show the class balance of an 80/20 train/validation split as a table.

    The split itself is not returned; only the per-class shares of both
    parts are drawn with ``draw_table``.

    Args:
        dataset_name: Table title.
        X: Feature matrix.
        y: Label per sample.
        random_state: Seed forwarded to ``train_test_split``.
    """
    labels, y = np.unique(y, return_inverse=True)
    _, _, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)

    def class_shares(part):
        # minlength keeps one entry per class even when a class is missing
        # from this part, so both table rows always have equal length.
        counts = np.bincount(part, minlength=len(labels))
        return [f'{p:.5f}' for p in counts / len(part)]

    data = np.array([class_shares(y_train), class_shares(y_val)])
    draw_table(dataset_name, ['train', 'validation'], labels, data, 300)

In [29]:
# Class balance of the seeded 80/20 split for both labelled datasets.
split_seed = 13
for split_title, split_loader in (('Cancer Dataset', load_cancer_dataset), ('Spam Dataset', load_spam_dataset)):
    dataset_split(split_title, *split_loader(), split_seed)

# Trees

In [23]:
class DecisionTree:
    """Greedy classification tree with binary ``x[feature] < threshold`` splits.

    Splits are chosen by maximising the decrease in Gini impurity. Labels
    are encoded to ``0..K-1`` inside ``fit`` and decoded again in the
    leaves, so ``predict`` returns original label values while
    ``predict_probabilities`` returns one column per encoded class.
    """

    class Node:
        """Tree node; a leaf has ``feature == -1`` and carries ``label`` and ``probs``."""

        def __init__(self):
            self.left = None       # subtree taken when x[feature] < threshold
            self.right = None      # subtree taken otherwise
            self.feature = -1      # split column index, -1 for a leaf
            self.threshold = 0
            self.label = None      # predicted class (leaves only)
            self.probs = None      # class frequencies of the leaf (leaves only)

        def apply_labels(self, labels):
            """Replace every encoded leaf label in this subtree by ``labels[encoded]``."""
            if self.label is not None:
                self.label = labels[self.label]
            else:
                self.left.apply_labels(labels)
                self.right.apply_labels(labels)

        def predict(self, x):
            """Return the label of the leaf that sample ``x`` falls into."""
            if self.feature == -1:
                return self.label
            return self.left.predict(x) if x[self.feature] < self.threshold else self.right.predict(x)

        def predict_probs(self, x):
            """Return the class-frequency vector of the leaf that sample ``x`` falls into."""
            if self.feature == -1:
                return self.probs
            return self.left.predict_probs(x) if x[self.feature] < self.threshold else self.right.predict_probs(x)

        def print_node(self):
            """Print this subtree depth-first, announcing every descent and return."""
            print(f'node: feature_num={self.feature}, threshold={self.threshold}, label={self.label}')
            if self.left is not None:
                print('go left')
                self.left.print_node()
            if self.right is not None:
                print('go right')
                self.right.print_node()
            print('go up')

    class Criterion:
        """Split-quality criteria; each returns ``(best_threshold, best_score)``."""

        # Two sorted feature values closer than this are treated as equal
        # and never separated by a threshold.
        EPS = 1e-10

        @staticmethod
        def gini(x, y):
            """Find the threshold on feature ``x`` with the largest Gini impurity decrease.

            Args:
                x: 1-D array of feature values.
                y: Label per sample.

            Returns:
                tuple: ``(threshold, gain)`` where samples with
                ``x < threshold`` go left. Returns ``(-1, -1)`` when no two
                samples have distinguishable feature values.
            """
            labels, y = np.unique(y, return_inverse=True)
            N, K = len(x), len(labels)

            order = np.argsort(x)
            x_sort, y_sort = x[order], y[order]

            # left_counts[s - 1, c] = samples of class c among the first s
            # sorted samples, for every split size s = 1..N-1.
            one_hot = np.zeros((N, K))
            one_hot[np.arange(N), y_sort] = 1
            totals = one_hot.sum(axis=0)
            left_counts = np.cumsum(one_hot, axis=0)[:-1]
            right_counts = totals - left_counts

            n_left = np.arange(1, N)
            n_right = N - n_left

            # A child's impurity is measured on the child's own class
            # proportions, so the counts are divided by the child size and
            # not by the parent size N.
            parent = 1 - ((totals / N) ** 2).sum() if N else 0.0
            gini_left = 1 - ((left_counts / n_left[:, np.newaxis]) ** 2).sum(axis=1)
            gini_right = 1 - ((right_counts / n_right[:, np.newaxis]) ** 2).sum(axis=1)
            gain = parent - (n_left / N) * gini_left - (n_right / N) * gini_right

            # A threshold is only usable between two different feature values.
            usable = (x_sort[1:] - x_sort[:-1]) > DecisionTree.Criterion.EPS
            if not usable.any():
                return -1, -1
            gain[~usable] = -np.inf

            best = int(np.argmax(gain))
            return x_sort[best + 1], gain[best]

    def __init__(self, max_depth=None):
        """Create an unfitted tree; ``max_depth=None`` means unlimited depth."""
        self.root = None
        self.max_depth = max_depth
        self.criterion = DecisionTree.Criterion.gini
        self.n_classes_ = 2  # updated by fit(); never below 2

    @staticmethod
    def _make_leaf(node, counts):
        """Turn ``node`` into a leaf predicting the majority class of ``counts``."""
        node.feature = -1
        node.label = int(np.argmax(counts))
        node.probs = counts / counts.sum()

    def _find_split(self, X, y):
        """Return the ``(feature, threshold)`` with the best positive score, or ``(-1, -1)``."""
        best_score = 0
        best_feature, best_threshold = -1, -1
        for i, x in enumerate(X.T):
            threshold, score = self.criterion(x, y)
            if score > best_score:
                best_score = score
                best_feature, best_threshold = i, threshold
        return best_feature, best_threshold

    def _fit(self, X, y, node, depth=0, v=0):
        """Grow the subtree rooted at ``node`` from samples ``X`` with encoded labels ``y``.

        ``v`` is a heap-style node index kept for signature compatibility;
        it is only passed down and never read.
        """
        counts = np.bincount(y, minlength=self.n_classes_)
        is_pure = np.count_nonzero(counts) == 1
        at_depth_limit = self.max_depth is not None and depth == self.max_depth
        if is_pure or at_depth_limit:
            self._make_leaf(node, counts)
            return

        feature, threshold = self._find_split(X, y)
        if feature == -1:
            # No split improves the impurity (e.g. all features constant).
            self._make_leaf(node, counts)
            return

        node.feature, node.threshold = feature, threshold
        goes_left = X[:, feature] < threshold
        node.left = DecisionTree.Node()
        node.right = DecisionTree.Node()

        self._fit(X[goes_left], y[goes_left], node.left, depth + 1, 2 * v + 1)
        self._fit(X[~goes_left], y[~goes_left], node.right, depth + 1, 2 * v + 2)

    def fit(self, X, y):
        """Fit the tree to feature matrix ``X`` and labels ``y`` (any sortable label type)."""
        X = np.asarray(X)
        labels, y = np.unique(y, return_inverse=True)
        # Keep at least two probability columns so the binary output shape
        # stays (n, 2) even if only one class is present in the training data.
        self.n_classes_ = max(2, len(labels))
        self.root = DecisionTree.Node()
        self._fit(X, y, self.root)
        self.root.apply_labels(labels)

    def predict(self, X):
        """Return an object array with the predicted original label of every row of ``X``."""
        y = np.array([None] * len(X))
        for i, x in enumerate(X):
            y[i] = self.root.predict(x)
        return y

    def predict_probabilities(self, X):
        """Return a ``(len(X), n_classes)`` array of leaf class frequencies.

        Column ``c`` belongs to the ``c``-th smallest label seen in ``fit``.
        """
        y = np.zeros((len(X), self.n_classes_))
        for i, x in enumerate(X):
            y[i] = self.root.predict_probs(x)
        return y

In [24]:
def fit_tree(dataset_name, X, y, random_state=None):
    """Plot train and validation accuracy of ``DecisionTree`` for depths 1 to 10.

    Args:
        dataset_name: Figure title.
        X: Feature matrix.
        y: Label per sample.
        random_state: Seed forwarded to ``train_test_split`` (80/20 split).
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)

    def accuracy(model, features, target):
        # Share of samples whose predicted label equals the true label.
        return sum(model.predict(features) == target) / len(target)

    accuracy_train, accuracy_val = [], []
    for depth in range(1, 11):
        tree = DecisionTree(depth)
        tree.fit(X_train, y_train)
        accuracy_train.append(accuracy(tree, X_train, y_train))
        accuracy_val.append(accuracy(tree, X_val, y_val))

    depth_axis = np.arange(1, 11)
    trace = [
        go.Scatter(x=depth_axis, y=accuracy_train, mode='lines', name='train'),
        go.Scatter(x=np.arange(1, 11), y=accuracy_val, mode='lines', name='validation'),
    ]
    layout = go.Layout(title=dataset_name, xaxis=dict(title='depth'), yaxis=dict(title='accuracy'))
    py.iplot(go.Figure(data=trace, layout=layout))

In [25]:
# Accuracy-versus-depth curves for both labelled datasets on the seeded split.
tree_seed = 13
for tree_title, tree_loader in (('Cancer Dataset', load_cancer_dataset), ('Spam Dataset', load_spam_dataset)):
    fit_tree(tree_title, *tree_loader(), tree_seed)

In [26]:
def draw_roc_curve(fprs, tprs, feature_nums, dataset_name, height=500):
    """Plot ROC curves side by side, one panel per curve.

    The panels share the figure width equally with a gap of 0.05 (in paper
    coordinates) between neighbours, so any small number of curves is laid
    out correctly. With two curves the panels are [0, 0.475] and
    [0.525, 1.0].

    Args:
        fprs: Sequence of false-positive-rate arrays, one per curve.
        tprs: Sequence of true-positive-rate arrays, one per curve.
        feature_nums: Identifier per curve, used in the legend.
        dataset_name: Name inserted into the figure title.
        height: Figure height in pixels.
    """
    curves = list(zip(fprs, tprs, feature_nums))
    traces = [go.Scatter(x=fpr, y=tpr, mode='lines', name=f'feature #{feature}', xaxis=f'x{i+1}', yaxis=f'y{i+1}')
              for i, (fpr, tpr, feature) in enumerate(curves)]

    n_panels = len(curves)
    gap = 0.05
    panel_width = (1.0 - gap * (n_panels - 1)) / max(n_panels, 1)
    axes = {}
    for i in range(n_panels):
        start = round(i * (panel_width + gap), 6)
        # Clamp the right edge: float round-off must not push it past 1.0,
        # which is outside the valid domain range.
        end = min(round(start + panel_width, 6), 1.0)
        axes[f'xaxis{i+1}'] = dict(domain=[start, end])
        axes[f'yaxis{i+1}'] = dict(domain=[0.0, 1.0], anchor=f'x{i+1}')

    layout = go.Layout(
        title=f'ROC curves for the {dataset_name} dataset',
        width=950,
        height=height,
        **axes
    )
    figure = go.Figure(data=traces, layout=layout)
    py.iplot(figure)


def draw_roc_curves_for_class_probs(dataset_name, X, y, depth, random_state=None):
    """Plot one ROC curve per class from a decision tree's leaf probabilities.

    A ``DecisionTree`` of the given depth is fitted on 80% of the data. For
    every class ``i`` the predicted probability of class ``i`` on the
    remaining 20% is scored one-vs-rest, with class ``i`` as the positive
    class.

    Args:
        dataset_name: Name forwarded to ``draw_roc_curve``.
        X: Feature matrix.
        y: Label per sample.
        depth: Maximum tree depth.
        random_state: Seed forwarded to ``train_test_split``.
    """
    # Encode once, before splitting, so class index i means the same class
    # in the training part, the validation part and the probability columns.
    _, y = np.unique(y, return_inverse=True)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
    fprs, tprs, feature_nums = [], [], []

    tree = DecisionTree(depth)
    tree.fit(X_train, y_train)

    probs = tree.predict_probabilities(X_val)
    for i in range(probs.shape[1]):
        # The scores in column i are probabilities of class i, so class i
        # must be the positive class; otherwise the curve comes out inverted.
        is_class_i = (y_val == i).astype(int)
        fpr, tpr, _ = roc_curve(is_class_i, probs[:, i])
        fprs.append(fpr)
        tprs.append(tpr)
        feature_nums.append(i)

    draw_roc_curve(fprs, tprs, feature_nums, dataset_name)

In [27]:
# Per-class ROC curves of a depth-limited tree for each labelled dataset, on the seeded split.
roc_seed = 13
for roc_title, roc_data_loader, tree_depth in (
    ('Cancer Dataset', load_cancer_dataset, 6),
    ('Spam Dataset', load_spam_dataset, 10),
):
    draw_roc_curves_for_class_probs(roc_title, *roc_data_loader(), tree_depth, roc_seed)