# VIC Clustering Validation 

## Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
rs = 123
import time
#Preprocessing
from sklearn import preprocessing
import category_encoders as ce
import glob
#Sci-kit libraries
from sklearn import metrics
from sklearn import utils
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold


## VIC Implementation for 3 clusters

In [43]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

In [119]:
def VIC_3_classes(folder_path, classifier, k = 10, target_file = 'csv_partitions/sc_c3_th0_0.csv'):
    """Score a 3-cluster partition by how well a classifier can predict its labels.

    For every CSV matched by ``folder_path`` (restricted to ``target_file``
    unless that is ``None``), the cluster label is binarised one-vs-rest,
    a ``OneVsRestClassifier`` wrapping ``classifier`` is trained with k-fold
    cross-validation, and the micro-averaged ROC AUC of each fold is recorded.

    Data layout read from each CSV (as indexed by the code below):
      * last column                      -> cluster label, binarised with classes [0, 1, 2]
      * columns 2 .. third-from-last     -> features (``iloc[:, 2:-2]``)
      * first feature column             -> label-encoded before scaling

    Parameters
    ----------
    folder_path : str
        Glob pattern locating the partition files, e.g. ``'csv_partitions/*.csv'``.
    classifier : estimator
        Unfitted scikit-learn classifier exposing ``decision_function`` or
        ``predict_proba``; it is wrapped in ``OneVsRestClassifier``.
    k : int, default 10
        Number of cross-validation folds.
    target_file : str or None, default 'csv_partitions/sc_c3_th0_0.csv'
        Only the glob result equal to this path is evaluated. The comparison
        ignores the path-separator style, so it behaves the same on Windows
        and POSIX. Pass ``None`` to evaluate every matched file.

    Returns
    -------
    avg_auc : float
        Mean of all recorded micro-averaged AUCs.
    auc_scores : list of float
        One micro-averaged AUC per fold (per evaluated file).

    Raises
    ------
    FileNotFoundError
        If no file was evaluated (empty glob or ``target_file`` not among the matches).
    """
    import os  # local import: only needed for separator-independent path comparison

    def _canonical(path):
        # Treat '\\' and '/' alike so a Windows-style path also matches on POSIX.
        return os.path.normcase(os.path.normpath(path.replace('\\', '/')))

    wanted = None if target_file is None else _canonical(target_file)

    auc_scores = []
    for file in glob.glob(folder_path):
        if wanted is not None and _canonical(file) != wanted:
            continue
        print(file)
        # read as pandas data frame
        db = pd.read_csv(file)
        # missing values: infinities are treated as missing, then everything missing becomes 0
        db = db.replace([np.inf, -np.inf], np.nan)
        db = db.fillna(0)
        # define X and y
        y = db.iloc[:, -1].values
        X = db.iloc[:, 2:-2].values
        # encode the first feature column (categorical) as integer codes
        labelencoder = preprocessing.LabelEncoder()
        X[:, 0] = labelencoder.fit_transform(X[:, 0])
        # Binarize the output with 3 classes -> indicator matrix with one column per class
        y = label_binarize(y, classes=[0, 1, 2])
        n_classes = y.shape[1]
        # Pipeline - Scale and Evaluate with CV
        # NOTE(review): folds are contiguous and unshuffled; if the rows are ordered
        # by cluster, a test fold may lack a class entirely - confirm the row order
        # or consider shuffle=True with a fixed random_state.
        kf = model_selection.KFold(n_splits = k)
        for train_index, test_index in kf.split(X):
            # split train-test
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # scale data; the scaler is fitted on the training fold only to avoid leakage
            min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
            X_train = min_max_scaler.fit_transform(X_train)
            X_test = min_max_scaler.transform(X_test)
            # fit model
            model = OneVsRestClassifier(classifier).fit(X_train, y_train)
            # Not every classifier has decision_function (e.g. random forests, k-NN,
            # naive Bayes); fall back to class probabilities as the ranking score.
            if hasattr(model, 'decision_function'):
                y_score = model.decision_function(X_test)
            else:
                y_score = model.predict_proba(X_test)
            # Compute ROC curve and ROC area for each class
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
            for i in range(n_classes):
                fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])
            # Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
            print(roc_auc)
            # append result to auc_scores
            auc_scores.append(roc_auc['micro'])

    if not auc_scores:
        # Previously this path crashed with UnboundLocalError on `avg_auc`.
        raise FileNotFoundError(
            "No partition file was evaluated: pattern %r matched no file equal to %r"
            % (folder_path, target_file)
        )
    avg_auc = sum(auc_scores) / len(auc_scores)
    return avg_auc, auc_scores

In [121]:
# Datasets: glob pattern passed to VIC_3_classes to locate the partition CSV files
folder = 'csv_partitions/*.csv'

In [122]:
# Evaluate the partition with a logistic-regression base classifier (k keeps its default of 10);
# avg_auc receives the mean and auc_scores the per-fold micro-averaged AUC values.
avg_auc, auc_scores = VIC_3_classes(folder, LogisticRegression())

csv_partitions\sc_c3_th0_0.csv
{0: 0.7948899472261552, 1: 0.5624495999007505, 2: 0.6320242656449553, 'micro': 0.7279238095238095}
{0: 0.5578147823546596, 1: 0.6040476081622701, 2: 0.6358654240665846, 'micro': 0.7665254792844239}
{0: 0.7209574053747831, 1: 0.5860159173249391, 2: 0.6345432893750276, 'micro': 0.7266512732358253}
{0: 0.6678642896125159, 1: 0.531700937950938, 2: 0.6400413728803682, 'micro': 0.7326204766621991}
{0: 0.7838402436001841, 1: 0.580642589492147, 2: 0.66700298297947, 'micro': 0.7509633034205466}
{0: 0.7347213168187745, 1: 0.6035007495116818, 2: 0.643852978453739, 'micro': 0.7486506468154536}
{0: 0.65205288796103, 1: 0.5808723843206602, 2: 0.6071977177967962, 'micro': 0.721084508478527}
{0: 0.7198951952696244, 1: 0.5653589328164983, 2: 0.6384074463118581, 'micro': 0.7270063661791272}
{0: 0.6951155462184875, 1: 0.5661090953815879, 2: 0.6645255474452555, 'micro': 0.7519120389254705}
{0: 0.7450105042016807, 1: 0.5835657355339154, 2: 0.6460091843428821, 'micro': 0.75671