<a href="https://colab.research.google.com/github/ChengyangHuang/Personalized_Regression/blob/main/Disease_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os

if not os.path.exists("/content/hcvdat0.csv"):
    !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from scipy.spatial import KDTree
from scipy.spatial import distance_matrix

import torch

# np.random.seed(0)


def load_dataset(impute=True, binary=True):
    dataset = pd.read_csv("/content/hcvdat0.csv", usecols=range(1, 14))
    X = []
    sex_dict = {'m': 0,
                'f': 1}
    for name, data in dataset.items():
        if name == "Category":
            y = data.to_list()
            y = [x.split('=')[0] for x in y]
            y = np.array([int(x) if x.isdigit() else 5 for x in y])
            if binary:
                y = np.array([0 if x == 0 or x == 5 else 1 for x in y])
        elif name == "Sex":
            X.append([int(sex_dict[x]) for x in data])
        else:
            X.append([float(x) for x in data])

    X = np.array(X).T
    if impute == True:
        imputer = KNNImputer(n_neighbors=2)
        X = imputer.fit_transform(X)
    return X, y


def plot_parameters(theta_gt, theta_est):
    fig, ax = plt.subplots(1)
    ax.scatter(theta_gt[:, 0], theta_gt[:, 1], label="True Parameters")
    ax.scatter(theta_est[:, 0], theta_est[:, 1], label="EST. Parameters")
    ax.legend()
    return fig, ax


def print_metrics(dic):
    message = f"\n"
    for k, v in dic.items():
        message += f"{k}:\t{v}\n"
    print(message)


def evaluate_method(method, X_test, y_test, method_name):
    y_pred = method.predict(X_test)
    y_prob = method.predict_proba(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred)
    auroc = roc_auc_score(y_test, y_prob[:, 1])
    performance_dict = {"method_name": method_name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "fscore": fscore,
                        "auroc": auroc}
    return performance_dict

In [7]:
class PR():
    def __init__(self, theta_pop, dataset, args):
        # Training Parameters
        self.sigma_theta = args.sigma_theta
        self.lambd = args.theta_regularizer
        self.gamma = args.distance_regularizer
        self.nu = args.phi_regularizer
        self.alpha = args.learning_rate
        self.c = args.lr_decay
        self.q = args.latent_dim
        self.log_steps = args.log_steps
        self.n_neighbors = args.n_neighbors
        self.show_logs = args.show_logs

        self.theta_pop = torch.from_numpy(theta_pop)
        self.p = self.theta_pop.shape[0]

        # Training Data
        X_train, y_train, theta_train, U_train = dataset
        self.X_train = torch.from_numpy(X_train)
        self.y_train = torch.from_numpy(y_train)
        self.U_train = torch.from_numpy(U_train)
        self.U_tree = KDTree(U_train) 
        self.U_distance_mat = torch.from_numpy(distance_matrix(U_train, U_train, p=2))
        self.theta_train = torch.from_numpy(theta_train)
        self.n = self.X_train.shape[0]
        self.k = self.U_train.shape[1]

        # Variable Initialization
        PI = np.random.multivariate_normal(theta_pop, self.sigma_theta*np.eye(self.p), size=self.n)
        self.PI = torch.from_numpy(PI)
        self.Z, self.Q = self.__init_ZnQ(PI)
        self.phi = torch.ones((self.k, ))

        # Loss Functions
        self.__sample_specific_loss = torch.nn.MSELoss(reduction='sum')
        self.__parameter_regularizer = torch.nn.L1Loss()
        self.__phi_regularizer = torch.nn.MSELoss()
        self.__distance_loss = torch.nn.MSELoss()


    def train(self, n_epoch=5):
        for epoch in range(n_epoch):
            Z = self.Z.clone().requires_grad_(True)
            Q = self.Q.clone().requires_grad_(True)
            phi = self.phi.clone().requires_grad_(True)
            PI = self.__update_PI(Z, Q) 

            y_pred = (self.X_train * PI).sum(dim=1, keepdim=True)

            # Calculate Loss
            l_loss = self.__sample_specific_loss(y_pred, self.y_train)
            # D_loss = self.__distance_matching_regularizer(Z, phi)
            D_loss = 0
            theta_loss = self.__parameter_regularizer(PI, torch.zeros_like(PI))
            phi_loss = self.__phi_regularizer(phi, torch.ones_like(phi))
            loss = l_loss + self.gamma * D_loss + self.lambd * theta_loss + self.nu * phi_loss
            
            # Update phi
            loss.backward()
            self.phi -= self.alpha * phi.grad

            # Update Z
            alpha_cust = self.alpha / torch.linalg.norm(self.PI-self.theta_pop, 
                                                        float('inf'), dim=1, keepdim=True)
            self.Z -= alpha_cust * Z.grad

            # Update Q
            self.Q -= self.alpha * Q.grad

            # Update alpha
            self.alpha *= self.c

            # Update theta, PI
            self.PI = self.__update_PI(self.Z, self.Q)

            if self.show_logs and epoch % self.log_steps == 0:
                message = f"Epoch {epoch+1} - Total loss: {loss}\tDistance loss:{D_loss}"
                print(message)
                plot_parameters(self.theta_train, self.PI)
                plt.show()
        

    def predict(self, X_test, U_test):
        X = torch.from_numpy(X_test)
        _, idx_sets = self.U_tree.query(U_test, k=self.n_neighbors)
        y_pred = []
        for i in range(X_test.shape[0]):
            idxs = idx_sets[i]
            theta = self.PI[idxs, :].mean(axis=0)
            y = torch.dot(X[i], theta).numpy()
            y_pred.append(y)  
        return y_pred


    def __init_ZnQ(self, PI):
        pca = PCA(n_components=self.q, whiten=False)
        Z = pca.fit_transform(PI)
        Q = pca.components_
        return torch.from_numpy(Z), torch.from_numpy(Q)
    

    def __update_PI(self, Z, Q):
        return torch.mm(Z, Q) + self.theta_pop
    

    def __distance_matching_regularizer(self, Z, phi):
        #TODO
        
        _Z = Z.detach().numpy()
        tree = KDTree(_Z)
        _, J_sets = tree.query(_Z, k=6)
        d_loss = 0
        for i in range(J_sets.shape[0]):
            J_set = J_sets[i][1:]
            dU = phi * self.U_distance_mat[i, J_set]
            dZ = torch.linalg.norm(Z[i] - Z[J_set, :], ord=2, dim=1)
            d_loss += self.__distance_loss(dZ, dU)
        return d_loss


In [8]:
class PR_Arguments():
    sigma_theta = 0.001
    learning_rate = 4e-3
    lr_decay = 1-1e-4
    latent_dim = 2
    n_epoch = 2000 #2000
    theta_regularizer = 0.01
    distance_regularizer = 0.01
    phi_regularizer = 0

    n_neighbors = 3
    log_steps = 100
    show_logs = False


def main():
    X, y = load_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # dataset_train = (X_train, y_train, theta_train, U_train)

    dict_list = []
    ## Logistic Regression
    lr = LogisticRegression().fit(X_train, y_train)
    dict_lr = evaluate_method(lr, X_test, y_test, "Logistic Regression")
    dict_list.append(dict_lr)
    # theta_lr = np.array(lr.coef_)
    # print(theta_lr)
    # plot_parameters(theta_train, theta_lr)

    # ## Other Baseline methods
    # DNN
    dnn = MLPClassifier(hidden_layer_sizes=(50,)).fit(X_train, y_train.squeeze())
    dict_dnn = evaluate_method(dnn, X_test, y_test, "Deep Neural Networks")
    dict_list.append(dict_dnn)
    theta_dnn = np.array(dnn.coefs_[0].T)
    # print(theta_dnn.shape)
    # plot_parameters(theta_train, theta_dnn)

    # ## Personalized Regression
    # pr_args = PR_Arguments()
    # pr = PR(theta_lr.squeeze(), dataset_train, pr_args)
    # pr.train(pr_args.n_epoch)
    # y_pred_pr = pr.predict(X_test, U_test)
    # mse_pr = mean_squared_error(y_test, y_pred_pr)
    # plot_parameters(theta_train, pr.PI)
    

    for x in dict_list:
        print_metrics(x)


if __name__ == "__main__":
    main()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



method_name:	Logistic Regression
accuracy:	0.9113300492610837
precision:	[0.91847826 0.84210526]
recall:	[0.98255814 0.51612903]
fscore:	[0.9494382 0.64     ]
auroc:	0.9253563390847712


method_name:	Deep Neural Networks
accuracy:	0.9507389162561576
precision:	[0.96022727 0.88888889]
recall:	[0.98255814 0.77419355]
fscore:	[0.97126437 0.82758621]
auroc:	0.9351087771942985

