In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# from sklearn.manifold import TSNE

from xgboost import XGBClassifier

import pandas as pd
import numpy as np
from hmeasure import h_score
import matplotlib.pyplot as plt
from MulticoreTSNE import MulticoreTSNE as TSNE #need to install additional packages for this, otherwise use sklearn.manifold.TSNE
from scipy.io import arff
from rich import print
import joblib
import scipy
import json
import random
import time

from ks_metric import ks_score


In [100]:
def set_seed(seed):
    """Seed every random number generator used by the experiments.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmarking switched off.

    Args:
        seed: Value handed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels, no autotuner picking algorithms per run.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def preprocessing(data, dataset_name, seed):
    """Clean a raw dataset and split it into train and test frames.

    Duplicate rows are always dropped (the caller's frame is left untouched).
    Dataset-specific handling:

    * ``"polish"``: missing values are replaced by the constant 0, the columns
      are renamed positionally to ``Attr1`` ... ``Attr64`` followed by
      ``label``, and ``label`` is cast to ``int``. The frame is rebuilt from
      the imputed array, so its index restarts at 0.
    * ``"norwegian"``: the columns ``v22``, ``year`` and ``org_number`` are
      removed.

    Args:
        data: DataFrame holding the features and the target column.
        dataset_name: Name of the dataset; selects the handling above.
        seed: ``random_state`` passed to ``train_test_split``.

    Returns:
        Tuple ``(df_train, df_test)`` from an 80/20 split.
    """
    data = data.drop_duplicates()

    if dataset_name == "polish":
        # A single fit_transform is enough: after the first pass no NaN is
        # left, so imputing the resulting array a second time is a no-op.
        imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
        imputed = imputer.fit_transform(data)
        column_names = [f"Attr{i}" for i in range(1, 65)] + ["label"]
        data = pd.DataFrame(imputed, columns=column_names)
        # convert class to int
        data['label'] = data['label'].astype(int)

    if dataset_name == "norwegian":
        data = data.drop(columns=['v22', 'year', 'org_number'])

    df_train, df_test = train_test_split(data, test_size=0.2, random_state=seed)

    return df_train, df_test


def load_data(dataset, seed):
    """Read one dataset from disk and return its preprocessed train/test split.

    Args:
        dataset: Configuration dict; only its ``"name"`` entry is read here.
        seed: Forwarded to ``preprocessing``.

    Returns:
        Tuple ``(df_train, df_test)`` as produced by ``preprocessing``.
    """
    name = dataset["name"]
    frame = pd.read_parquet(f"../data/{name}_dataset.parquet")

    # The Polish file names its target column 'Class'; the pipeline uses 'label'.
    if name == "polish":
        frame = frame.rename(columns={'Class': 'label'})

    return preprocessing(frame, name, seed)


def sampling(df_train_raw, sampling_technique, seed):
    """Rebalance the training frame with the requested resampling technique.

    Args:
        df_train_raw: Training DataFrame containing a ``label`` column. It is
            copied, never modified.
        sampling_technique: One of ``"RandomUnderSampler"``, ``"SMOTE"`` or
            ``"Raw"`` (no resampling).
        seed: ``random_state`` of the resampler.

    Returns:
        A new DataFrame with the resampled features followed by ``label``;
        for ``"Raw"`` an unmodified copy of the input.

    Raises:
        ValueError: If ``sampling_technique`` is not one of the supported names
            (previously this fell through and returned ``None``).
    """
    df_train = df_train_raw.copy()

    if sampling_technique == "Raw":
        return df_train

    if sampling_technique == "RandomUnderSampler":
        sampler = RandomUnderSampler(random_state=seed)
    elif sampling_technique == "SMOTE":
        # NOTE(review): `n_jobs` may not be accepted by newer imbalanced-learn
        # releases -- confirm against the installed version.
        sampler = SMOTE(random_state=seed, n_jobs=5)
    else:
        raise ValueError(
            f"Unknown sampling technique {sampling_technique!r}; "
            "expected 'RandomUnderSampler', 'SMOTE' or 'Raw'."
        )

    X_train, y_train = sampler.fit_resample(df_train.drop('label', axis=1), df_train['label'])
    return pd.concat([X_train, y_train], axis=1)
    

# transform data
def transform_data(df_train, df_test, batch_size = 32):
    """Split off the labels, standardise the features and build tensors.

    The scaler is fitted on the training features only and then applied to
    the test features. The input frames are not modified, so the function can
    be called repeatedly on the same frames.

    Args:
        df_train: Training DataFrame with a ``label`` column.
        df_test: Test DataFrame with a ``label`` column.
        batch_size: Unused; kept so existing calls remain valid.

    Returns:
        Tuple ``(X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
        X_train, y_train, X_test, y_test)``. The tensors are ``float32``;
        ``X_train``/``X_test`` are the scaled arrays returned by the scaler
        and ``y_train``/``y_test`` are plain lists.
    """
    # drop() returns new frames; pop() would strip 'label' from the caller's data.
    X_train = df_train.drop(columns='label')
    y_train = df_train['label'].tolist()

    X_test = df_test.drop(columns='label')
    y_test = df_test['label'].tolist()

    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    return X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, X_train, y_train, X_test, y_test


# define VAE architecture
class VAE(nn.Module):
    """Variational autoencoder with one tanh hidden layer on each side.

    Both the encoder and the decoder emit a mean and a log-variance: the
    encoder for the latent code, the decoder for the reconstruction.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer in encoder and decoder.
        latent_size: Dimensionality of the latent code.
    """

    def __init__(self, input_size, hidden_size, latent_size):
        super().__init__()

        # Encoder: input -> hidden -> (latent mean, latent log-variance)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2_mean = nn.Linear(hidden_size, latent_size)
        self.fc2_logvar = nn.Linear(hidden_size, latent_size)

        # Decoder: latent -> hidden -> (output mean, output log-variance)
        self.fc3 = nn.Linear(latent_size, hidden_size)
        self.fc4_mean = nn.Linear(hidden_size, input_size)
        self.fc4_logvar = nn.Linear(hidden_size, input_size)

    def encode(self, x):
        """Return the latent ``(mean, log-variance)`` for the batch ``x``."""
        hidden = torch.tanh(self.fc1(x))
        latent_mean = self.fc2_mean(hidden)
        latent_logvar = self.fc2_logvar(hidden)
        return latent_mean, latent_logvar

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` standard normal and ``std = exp(logvar / 2)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Return the reconstruction ``(mean, log-variance)`` for latent codes ``z``."""
        hidden = torch.tanh(self.fc3(z))
        output_mean = self.fc4_mean(hidden)
        output_logvar = self.fc4_logvar(hidden)
        return output_mean, output_logvar

    def forward(self, x):
        """Encode, sample and decode.

        Returns:
            ``((recon_mean, recon_logvar), mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        reconstruction = self.decode(self.reparameterize(mu, logvar))
        return reconstruction, mu, logvar

    
def vae_loss(recon_x_mean, recon_x_logvar, x, mu, logvar):
    """Return the VAE training loss: Gaussian reconstruction term plus KL term.

    The reconstruction term is the negative of a Gaussian log-likelihood built
    from the decoder's mean and log-variance; its constant is scaled by the
    feature count while the other two parts are means over all elements. The
    KL term is the mean over all elements of the closed-form divergence between
    the encoder Gaussian and a standard normal.

    Args:
        recon_x_mean: Decoder mean; ``size(1)`` is taken as the feature count.
        recon_x_logvar: Decoder log-variance.
        x: Input batch being reconstructed.
        mu: Encoder mean.
        logvar: Encoder log-variance.

    Returns:
        Scalar tensor ``reconstruction + KL``.
    """
    n_features = recon_x_mean.size(1)
    inverse_variance = 1.0 / recon_x_logvar.exp()

    # Three pieces of the Gaussian log-density, combined in the original order.
    constant_term = (-n_features / 2.0) * torch.log(2 * torch.tensor(np.pi))
    logvar_term = 0.5 * torch.mean(recon_x_logvar)
    error_term = torch.mean(0.5 * inverse_variance * torch.square(x - recon_x_mean))
    log_likelihood = constant_term - logvar_term - error_term
    reconstruction = torch.mean(-log_likelihood)  # negative log likelihood

    # Kullback-Leibler divergence between the encoder Gaussian and N(0, I)
    kl_divergence = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())

    return reconstruction + kl_divergence

def train(model, train_loader, optimizer, device, epoch, average_losses):
    """Run one training epoch and record its average loss.

    Args:
        model: Module whose forward pass returns
            ``((recon_mean, recon_logvar), mu, logvar)``.
        train_loader: Iterable of ``(features, target)`` batches; the target is
            ignored.
        optimizer: Optimizer stepped once per batch.
        device: Device the feature batches are moved to.
        epoch: Unused; kept so existing calls remain valid.
        average_losses: List that receives this epoch's average loss.

    Returns:
        The same ``average_losses`` list with one value appended.
    """
    model.train()
    weighted_loss_sum = 0.0
    samples_seen = 0
    for data, _ in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        (recon_mu, recon_logvar), mu, logvar = model(data)
        loss = vae_loss(recon_mu, recon_logvar, data, mu, logvar)
        loss.backward()
        optimizer.step()

        # `vae_loss` is already averaged over the batch, so weight it by the
        # batch length before dividing by the number of samples; dividing the
        # plain sum of batch means by the sample count would shrink the
        # reported value by roughly the batch size.
        batch_len = data.size(0)
        weighted_loss_sum += loss.item() * batch_len
        samples_seen += batch_len

    average_loss = weighted_loss_sum / samples_seen

    average_losses.append(average_loss)

    return average_losses

def get_latent_space(model, X_train_tensor, X_test_tensor, device):
    """Encode the train and test tensors and return the latent means.

    The model is switched to eval mode and gradients are disabled. Only the
    first element returned by ``model.encode`` is kept.

    Args:
        model: Object exposing ``eval()`` and ``encode(tensor)``.
        X_train_tensor: Training features.
        X_test_tensor: Test features.
        device: Device the tensors are moved to before encoding.

    Returns:
        Tuple ``(latent_space_train, latent_space_test)`` of NumPy arrays.
    """
    def _latent_mean(features):
        encoded_mean = model.encode(features.to(device))[0]
        return encoded_mean.detach().cpu().numpy()

    model.eval()
    with torch.no_grad():
        latent_space_train = _latent_mean(X_train_tensor)
        latent_space_test = _latent_mean(X_test_tensor)
    return latent_space_train, latent_space_test


def classify(X_train, y_train, X_test, classifier_name, hidden_size, seed):
    """Fit the requested classifier and score the test set.

    ``"RandomForest"`` and ``"XGBoost"`` are tuned with a 3-fold grid search
    and the refitted best estimator is returned; ``"LogisticRegression"`` and
    ``"MLP"`` are fitted once with fixed settings.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        classifier_name: ``"RandomForest"``, ``"XGBoost"``,
            ``"LogisticRegression"`` or ``"MLP"``.
        hidden_size: Width of each of the three hidden layers of the MLP;
            ignored by the other classifiers.
        seed: ``random_state`` of the classifier.

    Returns:
        Tuple ``(model, y_pred, y_pred_prob)``: the fitted estimator, the
        labels obtained by thresholding at 0.5, and the predicted probability
        of the class in column 1 of ``predict_proba``.

    Raises:
        ValueError: If ``classifier_name`` is not supported (previously this
            surfaced as an unbound-variable error at fit time).
    """
    grid_searched = ("RandomForest", "XGBoost")
    fixed_settings = ("LogisticRegression", "MLP")
    if classifier_name not in grid_searched + fixed_settings:
        raise ValueError(
            f"Unknown classifier {classifier_name!r}; expected one of "
            f"{', '.join(grid_searched + fixed_settings)}."
        )

    if classifier_name == "RandomForest":
        cf = RandomForestClassifier(random_state=seed)
        param_grid = {
            'n_estimators': [100, 300, 500],
            # 'auto' was dropped from the grid: current scikit-learn rejects
            # it, and where it was accepted it meant 'sqrt' for classifiers.
            'max_features': ['sqrt', 'log2'],
            'max_depth': [4, 6, 8],
            'criterion': ['entropy', 'gini'],
        }
        best_cf = GridSearchCV(cf, param_grid=param_grid, cv=3, verbose=True)

    elif classifier_name == "XGBoost":
        cf = XGBClassifier(random_state=seed)
        param_grid = {
            'n_estimators': [100, 300, 500],
            'max_depth': [4, 6, 8],
            'learning_rate': [0.1, 0.05, 0.01],
        }
        best_cf = GridSearchCV(cf, param_grid=param_grid, cv=3, verbose=True)

    elif classifier_name == "LogisticRegression":
        best_cf = LogisticRegression(random_state=seed, max_iter=500)

    else:  # "MLP"
        best_cf = MLPClassifier(hidden_layer_sizes=(hidden_size, hidden_size, hidden_size), max_iter=500, alpha=0.0001, random_state=seed, activation='relu', solver='adam',)

    # fit model
    best_cf.fit(X_train, y_train)

    # keep only the winning estimator of the grid search
    if classifier_name in grid_searched:
        best_cf = best_cf.best_estimator_

    # predict on test set
    y_pred_prob = best_cf.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_prob > 0.5).astype(int)

    return best_cf, y_pred, y_pred_prob


def calculate_metrics(y_test, y_pred, y_pred_prob):
    """Collect the evaluation metrics for one set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        y_pred_prob: Predicted scores, used for the threshold-free metrics.

    Returns:
        Dict with the keys ``accuracy_score``, ``precision_score``,
        ``recall_score``, ``f1_score``, ``auc_score``, ``h_measure`` and
        ``ks_score`` (the value of ``ks_score(...)`` divided by 100).
    """
    # Metrics computed from the predicted labels.
    label_scorers = (
        ('accuracy_score', accuracy_score),
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
    )
    result = {key: scorer(y_test, y_pred) for key, scorer in label_scorers}

    # Metrics computed from the predicted scores.
    result.update({
        'auc_score': roc_auc_score(y_test, y_pred_prob),
        'h_measure': h_score(np.array(y_test), np.array(y_pred_prob)),
        'ks_score': ks_score(y_test, y_pred_prob)/100,
    })
    return result


In [259]:
# Per-dataset experiment settings: the latent sizes tried for the VAE and the
# hidden-layer width shared by the VAE and the MLP classifier.

# Taiwanese dataset configs
taiwanse_dataset = dict(name="taiwanese", latent_sizes=[30, 45, 60], hidden_size=70)

# Norwegian dataset configs
norwegian_dataset = dict(name="norwegian", latent_sizes=[20, 25, 30], hidden_size=45)

# Polish dataset configs
polish_dataset = dict(name="polish", latent_sizes=[20, 30, 40], hidden_size=50)

# Order in which the experiment loop visits the datasets.
datasets = [taiwanse_dataset, norwegian_dataset, polish_dataset]

(6819, 95)

In [None]:
def _fit_and_score(features_train, features_test, y_train, y_test, classifier, hidden_size, seed, artifact_stem):
    """Fit one classifier, save the model and its score histogram, and return
    ``(metrics, y_pred, y_pred_prob)``."""
    best_cf, y_pred, y_pred_prob = classify(features_train, y_train, features_test, classifier, hidden_size, seed)
    # save model
    joblib.dump(best_cf, f"models/{artifact_stem}.pkl")
    # make and save histogram of predictions
    plt.hist(y_pred_prob, bins=20)
    plt.savefig(f"plots/{artifact_stem}.png")
    plt.close()
    return calculate_metrics(y_test, y_pred, y_pred_prob), y_pred, y_pred_prob


# Full experiment grid: seed x dataset x sampling technique x latent size x
# classifier. Every classifier is fitted twice per grid point, once on the
# scaled raw features and once on the VAE latent means.
results = []
seeds = [
    1,
    2,
    3,
    4
    ]
sampling_techniques = [
    "SMOTE",
    "RandomUnderSampler",
    "Raw"
    ]
classifiers = ["LogisticRegression", "RandomForest", "XGBoost", "MLP"]
epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seed in seeds: # choose seed
    set_seed(seed)
    for dataset in datasets: # choose dataset
        dataset_name = dataset["name"]
        hidden_size = dataset["hidden_size"]
        latent_sizes = dataset["latent_sizes"]
        for sampling_technique in sampling_techniques: # choose sampling technique
            # load data
            df_train_raw, df_test = load_data(dataset, seed)
            # sampling
            df_train_raw = sampling(df_train_raw, sampling_technique, seed)
            # transform data
            X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, X_train, y_train, X_test, y_test = transform_data(df_train_raw, df_test)

            # create dataloader
            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
            train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

            for latent_size in latent_sizes: # choose latent size
                # train VAE
                model = VAE(input_size=X_train.shape[1], hidden_size=hidden_size, latent_size=latent_size).to(device)
                optimizer = optim.Adagrad(model.parameters(), lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10)
                average_losses = []
                for epoch in range(1, epochs + 1):
                    average_losses = train(model, train_loader, optimizer, device, epoch, average_losses)

                # save model
                torch.save(model.state_dict(), f"models/{dataset_name}_{sampling_technique}_{latent_size}_{seed}.pt")

                # get latent representation
                latent_space_train, latent_space_test = get_latent_space(model, X_train_tensor, X_test_tensor, device)
                # TSNE transformation; only the test embedding is plotted, so
                # the train embedding (previously computed and discarded) is
                # no longer calculated.
                tsne = TSNE(n_components=2, random_state=42, n_jobs=5)
                latent_space_test_tsne = tsne.fit_transform(latent_space_test)

                # classification
                # NOTE(review): the raw-feature fit does not depend on
                # latent_size but is repeated for every latent size.
                for classifier in classifiers:
                    run_stem = f"{dataset_name}_{sampling_technique}_{latent_size}_{classifier}_{seed}"
                    config = {
                        'seed': seed,
                        'dataset': dataset_name,
                        'sampling_technique': sampling_technique,
                        'latent_size': latent_size,
                        'hidden_size': hidden_size,
                        'classifier': classifier,
                    }

                    # classification using 'raw' data; artefacts get a "_raw"
                    # suffix so the latent-space run below, which keeps the
                    # original file names, no longer overwrites them
                    result, y_pred, y_pred_prob = _fit_and_score(
                        X_train, X_test, y_train, y_test, classifier, hidden_size, seed, f"{run_stem}_raw")
                    output_data = {
                        'y_pred': y_pred,
                        'y_pred_prob': y_pred_prob,
                        'y_test': np.array(y_test),
                    }
                    # append results
                    results.append({
                        'config': dict(config),
                        'metrics': result,
                        'data': output_data,
                    })

                    # classification using latent space
                    result, y_pred, y_pred_prob = _fit_and_score(
                        latent_space_train, latent_space_test, y_train, y_test, classifier, hidden_size, seed, run_stem)
                    output_data = {
                        'y_pred': y_pred,
                        'y_pred_prob': y_pred_prob,
                        'y_test': y_test,
                    }
                    vae_data = {
                        'latent_space_train': latent_space_train,
                        'latent_space_test': latent_space_test,
                        'average_losses': average_losses,
                    }
                    # append results; the 'latent_space_data' key is what
                    # marks an entry as a latent-space run
                    results.append({
                        'config': dict(config),
                        'metrics': result,
                        'data': output_data,
                        'latent_space_data': vae_data,
                    })

                    # plot latent representations using y_test as color
                    plt.scatter(latent_space_test_tsne[:, 0], latent_space_test_tsne[:, 1], c=y_test, cmap='viridis',s=5)
                    plt.xlabel('t-SNE Dimension 1')
                    plt.ylabel('t-SNE Dimension 2')
                    plt.title(f'{dataset_name} - {sampling_technique} - {latent_size} - {classifier} - {seed}_true')
                    plt.savefig(f"plots/{run_stem}_true.png")
                    plt.close()

                    # plot latent representations using y_pred_prob as color
                    plt.scatter(latent_space_test_tsne[:, 0], latent_space_test_tsne[:, 1], c=y_pred_prob, cmap='viridis',s=5)
                    plt.xlabel('t-SNE Dimension 1')
                    plt.ylabel('t-SNE Dimension 2')
                    plt.title(f'{dataset_name} - {sampling_technique} - {latent_size} - {classifier} - {seed}_pred')
                    plt.savefig(f"plots/{run_stem}_pred.png")
                    plt.close()

                    # print confirmation message
                    print(f"{dataset_name} - {sampling_technique} - {latent_size} - {classifier} - {seed} - DONE")

# Convert NumPy arrays to plain lists so the results can be written as JSON.
# Entries from raw-feature runs have no 'latent_space_data' section, so that
# section is looked up with .get() instead of being indexed unconditionally
# (which raised KeyError on the first raw-feature entry).
for result in results:
    for section_name in ('data', 'latent_space_data'):
        section = result.get(section_name)
        if section is None:
            continue
        for key in list(section):
            if isinstance(section[key], np.ndarray):
                section[key] = section[key].tolist()

# save results
with open('results/results.json', 'w') as f:
    json.dump(results, f)

In [None]:
# Reload the saved experiment results from disk
with open('results/results.json') as f:
    data = json.load(f)

# Keep only the runs on the Norwegian dataset
results = [entry for entry in data if entry['config']['dataset'] == 'norwegian']


def calculate_average_metric(results, metric, latent_size, classifier, sampling_technique,vae = False):
    """Average one metric over all matching runs.

    Runs are matched on classifier and sampling technique. With ``vae`` set,
    only entries that carry a ``latent_space_data`` key are used and they must
    also match ``latent_size``; otherwise only entries without that key are
    used and ``latent_size`` is ignored.

    Args:
        results: List of result dicts with ``config`` and ``metrics`` entries.
        metric: Name of the metric inside ``metrics``.
        latent_size: Latent size to match (only read when ``vae`` is set).
        classifier: Classifier name to match.
        sampling_technique: Sampling technique to match.
        vae: Select latent-space runs instead of raw-feature runs.

    Returns:
        Tuple ``(mean, mean - std, mean + std)``, each rounded to 4 decimals.
    """
    def _is_selected(entry):
        # First pick the right kind of run, then compare its configuration.
        if vae:
            if 'latent_space_data' not in entry.keys():
                return False
        elif 'latent_space_data' in entry.keys():
            return False
        settings = entry['config']
        if settings['classifier'] != classifier:
            return False
        if settings['sampling_technique'] != sampling_technique:
            return False
        return (not vae) or settings['latent_size'] == latent_size

    values = [entry['metrics'][f'{metric}'] for entry in results if _is_selected(entry)]

    centre = np.mean(values)
    spread = np.std(values)

    return round(centre, 4), round(centre - spread, 4), round(centre + spread, 4)

In [None]:
# Average metrics for classifiers trained on the VAE train embeddings
for latent_size in (20, 25, 30):  # adjust latent sizes depending on the current dataset
    summary = calculate_average_metric(
        results,
        'ks_score',            # Metric to calculate
        latent_size,
        'LogisticRegression',  # Classifier
        'SMOTE',               # Sampling technique
        vae=True,
    )
    print(summary)


In [None]:
# Average metrics for classifiers trained on the raw train data
for classifier_name in ('LogisticRegression', 'RandomForest', 'XGBoost', 'MLP'):
    summary = calculate_average_metric(
        results,
        'ks_score',        # Metric to calculate
        '',                # latent size: leave empty, it is ignored for raw-data runs
        classifier_name,   # Classifier
        'SMOTE',           # Sampling technique
        vae=False,
    )
    print(summary)