In [None]:
# Data Preprocessing.
import pandas as pd
import numpy as np

# Sets up the directory structure used by the rest of this cell:
# DATA/PREPROCESSED/TEST, DATA/PREPROCESSED/TRAIN, DATA/ORIGINAL/.tmp and DATA/GENERATED.
# NOTE(review): `!` is an IPython shell escape and the {a,b,...} list relies on the
# shell's brace expansion -- presumably the notebook shell is bash; confirm.
!mkdir -p DATA/{PREPROCESSED/TEST,PREPROCESSED/TRAIN,ORIGINAL/.tmp,GENERATED}

# Save
import arff
def save_arff(df,filename):
    """Write `df` to an ARFF file and rewrite its Class attribute declaration.

    Args:
        df: DataFrame whose rows are dumped; it must have a `Class` column.
        filename: Target path. Every ".csv" occurrence in it is replaced by
            ".arff", so the CSV path of the same dataset can be passed as-is.

    Side effects: writes the ARFF file, edits it in place with `sed`, and
    prints the unique values found in the `Class` column.
    """
    # Derive the ARFF path from the (CSV) path that was passed in.
    filename=filename.replace(".csv",".arff")
    arff.dump(filename, df.values, relation='Relation', names=df.columns)
    # Report whether the Class column holds a single value or several.
    if len(df.Class.unique()) == 1: 
        print("=1 class:",df.Class.unique())
    else:
        print(">1 class:",df.Class.unique())
        
    # Ten class labels when the path contains "MNIST", two otherwise.
    _max = 10 if "MNIST" in filename else 2
    # Builds a brace-wrapped, comma-separated label list such as "{0,1}".
    classes = "{" + ",".join([str(x) for x in range(_max)]) + "}"
    # IPython shell escape: {classes} and {filename} are filled in from the
    # Python variables above, then sed replaces "Class string" with
    # "Class {0,1,...}" in the file, in place.
    # NOTE(review): presumably arff.dump declares the str-valued Class column
    # as "string", which is what this substitution targets -- confirm.
    !sed -i "s/Class string/Class {classes}/g" {filename}
    
    
def save_data(data=None, save_file=None, attribute_names=None, num_classes=1):
    """Save a preprocessed dataset as CSV and ARFF with a final `Class` column.

    Args:
        data: 2-D numpy array; the last `num_classes` columns hold the class
            information and everything before them the attributes.
        save_file: Path of the CSV to write. The ARFF file (and, for
            `num_classes > 1`, the "_for_DPWGAN.csv" file) is derived from it.
        attribute_names: Column names for the attribute columns.
        num_classes: Number of trailing class columns. 1 means a single
            score column that is thresholded at 0.5; more than 1 means one
            column per class, of which the largest one wins.

    Side effects: prints the mean/std of the resulting labels, writes the CSV,
    calls `save_arff`, and for `num_classes > 1` writes an extra CSV that keeps
    the expanded class columns.
    """
    # Invalid attribute names for weka: when the array has exactly 36 columns
    # the given names are replaced by attr_1, attr_2, ...
    # NOTE(review): presumably 36 columns identifies the KCCR dataset -- confirm.
    if data.shape[1] == 36:
        attribute_names = ["attr_" + str(i + 1) for i in range(len(attribute_names))]
    attributes = data[:, :-num_classes]
    classes = data[:, -num_classes:]

    if num_classes > 1:
        # MNIST class is the index of the largest entry in each row.
        _class = np.argmax(classes, axis=1)
    else:
        # Other classes follow x >= .5 -> 1 and x < .5 -> 0. An explicit
        # threshold is used because np.around rounds halves to the nearest
        # even value (0.5 -> 0) and can yield labels other than 0/1.
        # ravel() turns the (n, 1) slice into a plain label vector.
        _class = (classes >= 0.5).astype(int).ravel()
    print(save_file, "mean:", np.mean(_class), "| std:", np.std(_class), end=" | ")

    # Last column must be Class, stored as the string form of an int.
    df = pd.DataFrame(attributes, columns=attribute_names)
    df["Class"] = _class.astype(int).astype(str)
    df.to_csv(save_file, index=False)
    save_arff(df, save_file)

    # With several class columns (the MNIST data), also save the expanded
    # class columns for training on the wgan.
    if num_classes > 1:
        expanded = pd.DataFrame(attributes, columns=attribute_names)
        for i in range(num_classes):
            expanded["Class_{}".format(i)] = classes[:, i]
        expanded.to_csv(save_file.replace(".csv", "_for_DPWGAN.csv"), index=False)


# Splits data into a leading 70% training set and a trailing 30% testing set.
def split_np_array(array):
    """Return `(train, test)`: the first 70% of the rows and the remaining rows.

    The cut index is `int(.7 * number_of_rows)`, so the training part is
    rounded down. Rows keep their order; nothing is shuffled.
    """
    cut = int(.7 * array.shape[0])
    train_part = array[:cut]
    test_part = array[cut:]
    return train_part, test_part


# Standardize each column independently to 0<=x<=1 (min-max scaling).
def standardize_columns(array):
    """Min-max scale every column except the last one, in place.

    Each scaled column becomes `(x - min) / (max - min)`. A column whose
    values are all equal (including all zeros) has no range to scale by and
    is set to 0 instead of dividing by zero and filling the column with NaN.

    Args:
        array: 2-D numpy array. The last column is left untouched; callers in
            this file keep the class label there.

    Returns:
        The same array object, modified in place.
    """
    for col in range(array.shape[1] - 1):
        column = array[:, col]
        if column.size == 0:
            # No rows: np.amin/np.amax would raise on an empty column.
            continue
        low = np.amin(column)
        span = np.amax(column) - low
        if span == 0:
            # Constant column: x - min is 0 everywhere.
            array[:, col] = 0
        else:
            array[:, col] = (column - low) / span
    return array

    
def preprocess_MNIST_data():
    """
    Build the MNIST train/test files from the data loaded through keras.

    The keras train and test parts are concatenated (train first), each image
    is flattened to 28*28 values divided by 255, and the label is appended as
    ten one-hot columns. The combined array is then re-split by
    split_np_array and both parts are written with save_data. The training
    part is also published as the global `train_mnist`.
    """
    from keras.datasets import mnist
    global train_mnist
    # Load (keras downloads it if needed) the MNIST dataset.
    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    # One-hot encode the labels: row i gets a 1.0 in column labels[i].
    labels = np.array(list(y_train) + list(y_test), dtype=int)
    one_hot = np.zeros((len(labels), 10), dtype=float)
    one_hot[np.arange(len(labels)), labels] = 1.0

    # One row of 28*28 scaled pixel values per image.
    pixels = np.concatenate([X_train.flatten(), X_test.flatten()])
    pixels = np.reshape(pixels, (-1, 28*28)).astype(float)/255

    data_mnist = np.concatenate([pixels, one_hot], axis=1)
    train_mnist, test_mnist = split_np_array(data_mnist)

    # Create labels for the pixel columns: Cell_<row>_<column>.
    MNIST_attribute_names = ["Cell_{}_{}".format(row, col)
                             for row in range(28) for col in range(28)]

    # Save data: training part first, then the test part.
    outputs = (
        (train_mnist, "DATA/PREPROCESSED/TRAIN/MNIST_TRAIN.csv"),
        (test_mnist, "DATA/PREPROCESSED/TEST/MNIST_TEST.csv"),
    )
    for part, path in outputs:
        save_data(data=part, save_file=path,
                  attribute_names=MNIST_attribute_names, num_classes=10)
    print("Preprocessed MNIST data")
    print("Preprocessed MNIST data")
    

def preprocess_KCCR_data():
    """
    The file with original data is DATA/ORIGINAL/Kaggle_Cervical_Cancer_Risk.csv
    Note: First, used excel to replace some missing values in the "STDs: Time since first diagnosis" and "STDs: Time since last diagnosis"
    in pandas style psudo-code: 
    for row in KCCR:
        set KCCR_df["STDs: Time since first diagnosis"] = KCCR_df["Stds"] if KCCR_df["Stds"] == 0 
        set KCCR_df["STDs: Time since last diagnosis"] = KCCR_df["Stds"] if KCCR_df["Stds"] == 0
    The file with these changes is DATA/ORIGINAL/.tmp/Kaggle_Cervical_Cancer_Risk.csv
    """   
    global train_kccr
    # Replaces "?" with "" so that it loads into pandas with np.nan values.
    !cat DATA/ORIGINAL/.tmp/Kaggle_Cervical_Cancer_Risk.csv | sed s/\?//g >DATA/ORIGINAL/.tmp/Kaggle_Cervical_Cancer_Risk2.csv
    KCCR_df  = pd.read_csv("DATA/ORIGINAL/.tmp/Kaggle_Cervical_Cancer_Risk2.csv").astype(float)
    for x in list(KCCR_df): # For each column, replace nan values with the column average
        ave = KCCR_df[x].dropna().mean()
        KCCR_df[x]=KCCR_df[x].fillna(ave)
    KCCR_attribute_names  = list(KCCR_df)[:-1]
    
    
    data_kccr = KCCR_df.values.astype(float)
    data_kccr = standardize_columns(data_kccr)
    train_kccr, test_kccr  = split_np_array(data_kccr)
    print(len([x for x in data_kccr if x[-1]==1]),len([x for x in data_kccr]))
    # duplicate positive instances untill they compose 1/5 of data
    pos=[x for x in train_kccr if x[-1]==1]
    neg=[x for x in train_kccr if x[-1]==0]
    pos=pos*4
    train_kccr=np.asarray(neg+pos)
    np.random.shuffle(train_kccr)
    
    # save data
    save_data(data=train_kccr, save_file="DATA/PREPROCESSED/TRAIN/KCCR_TRAIN.csv", 
              attribute_names=KCCR_attribute_names, num_classes=1)
    save_data(data=test_kccr,  save_file="DATA/PREPROCESSED/TEST/KCCR_TEST.csv",  
              attribute_names=KCCR_attribute_names, num_classes=1)
    print("Preprocessed KCCR data")
    

def preprocess_KCCFD_data():
    """
    Preprocess the Kaggle Credit Card Fraud Detection (KCCFD) data.

    The file with original data is DATA/ORIGINAL/Kaggle_Credit_Card_Fraud_Detection.csv

    The first column is dropped, the remaining attribute columns are
    standardized, and the rows are split with split_np_array. In the training
    part the rows labelled 1 are duplicated until they make up roughly 1/5 of
    the rows; rows whose label is neither 0 nor 1 are not kept. Both parts are
    written with save_data and the training part is also published as the
    global `train_kccfd`.
    """
    global train_kccfd
    KCCFD_path = "DATA/ORIGINAL/Kaggle_Credit_Card_Fraud_Detection.csv"
    KCCFD_df = pd.read_csv(KCCFD_path)
    # Cut off the first column (the time variable); the last column is the class.
    KCCFD_attribute_names = list(KCCFD_df)[1:-1]
    # Cast to float (as the KCCR path does) so the in-place standardization
    # works on a float copy and cannot truncate values in an integer array.
    data_kccfd = KCCFD_df.values[:, 1:].astype(float)
    n_positive_total = int(np.sum(data_kccfd[:, -1] == 1))
    print("The number of positive values in the KCCFD dataset is:",
          n_positive_total)

    data_kccfd = standardize_columns(data_kccfd)
    train_kccfd, test_kccfd = split_np_array(data_kccfd)
    print(n_positive_total, len(data_kccfd))

    # Duplicate positive instances until they compose about 1/5 of the data.
    labels = train_kccfd[:, -1]
    pos = train_kccfd[labels == 1]
    neg = train_kccfd[labels == 0]
    if len(pos) > 0:
        # Never let the factor drop to 0: that would delete every positive
        # row whenever there are fewer than four negatives per positive.
        factor = max(1, len(neg) // (4 * len(pos)))
    else:
        # No positives in the training split: nothing to duplicate, and the
        # ratio would otherwise divide by zero.
        factor = 1
    print("int(len(neg)/(4*len(pos)))", factor)
    # Negatives first, then the whole positive block repeated `factor` times.
    train_kccfd = np.concatenate([neg] + [pos] * factor)
    np.random.shuffle(train_kccfd)

    # Save data.
    save_data(data=train_kccfd, save_file="DATA/PREPROCESSED/TRAIN/KCCFD_TRAIN.csv",
              attribute_names=KCCFD_attribute_names, num_classes=1)
    save_data(data=test_kccfd, save_file="DATA/PREPROCESSED/TEST/KCCFD_TEST.csv",
              attribute_names=KCCFD_attribute_names, num_classes=1)
    print("Preprocessed KCCFD data")
    
    
def preprocess_data():
    """Run every dataset preprocessing step: KCCR, then KCCFD, then MNIST."""
    steps = (
        preprocess_KCCR_data,
        preprocess_KCCFD_data,
        preprocess_MNIST_data,
    )
    for step in steps:
        step()


# Run the whole preprocessing pipeline when this cell is executed.
preprocess_data()