In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from e2ml.experimentation import perform_bayesian_optimization
from e2ml.preprocessing import PrincipalComponentAnalysis
from sklearn.decomposition import PCA
from e2ml.experimentation import acquisition_ei, acquisition_ucb
from e2ml.models import GaussianProcessRegression

In [13]:
def loadFullData() -> pd.DataFrame:
    initial_molluscs_data = pd.read_csv('../data/initial_molluscs_data.csv')
    first_batch = pd.read_csv("../data/batch1_labels.csv")
    full_data = pd.concat((initial_molluscs_data, first_batch))
    return full_data

def getOneHotEncoding(data, values=None):
    enc = np.zeros((len(data), len(values)))
    for i, x in enumerate(data):
        enc[i, np.where(values == x)[0][0]] = 1
    return enc

def reverse_one_hot(data, values):
    enc = []
    for x in data:
        if(x[0] == 1.):
            enc.append([values[0]])
        elif(x[1] == 1.):
            enc.append([values[1]])
        elif(x[2] == 1.):
            enc.append([values[2]])    
    return np.array(enc)

def reverse_one_hot_index(indices, values):
    enc = []
    for x in indices:
        print(x)
        enc.append([values[x]])    
    return np.array(enc)

def softmax(x):
    x = np.array(x)
    return np.exp(x) / np.exp(x).sum(axis=1).reshape(x.shape[0],-1)

def score_cross_entropy_loss(mdl, x, y):
    if(len(y.shape) == 1):
        values = np.sort(np.unique(y))
        y = getOneHotEncoding(y, values)
    y_pred = softmax(mdl.predict_proba(x))
    #return cross_entropy_loss(y, y_pred)
    return [log_loss(y[i], y_pred[i])*-1 for i in range(len(y_pred))]

def objectiveFunctionRFC(x, y_rfc):
    rfc = RandomForestClassifier()
    rfc.fit(x,y_rfc)
    return score_cross_entropy_loss(rfc, x, y_rfc)

def objectiveFunctionSVC(x,y_rfc):
    svc = SVC(kernel="rbf", probability=True)
    svc.fit(x, y_rfc)
    return score_cross_entropy_loss(svc, x, y_rfc)

def objectiveFunctionMLP(x, y):
    mlp = MLPClassifier(max_iter=1000)
    mlp.fit(x,y)
    return score_cross_entropy_loss(mlp, x, y)

full_data = loadFullData()
y = full_data["Stage of Life"]
#print(full_data)


values = np.sort(np.unique(full_data["Sex"]))

x = np.concatenate((getOneHotEncoding(full_data["Sex"], values), full_data.values[:,1:-1]), axis=1)
print(x.shape)
#print(x_full_replaced)

y_replaced = y.replace("Adult",0).replace("Adole",1).replace("Child",2)

pca = PCA(2)
pca = pca.fit(x)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

x_pca = pca.transform(x)
print(x_pca[:,0].max())
print(x_pca[:,1].max())
print(x_pca[:,0].min())
print(x_pca[:,1].min())

x_acquired = x_pca
y_acquired_rfc = objectiveFunctionRFC(x_acquired, y_replaced)
#print(y_acquired_rfc)
y_acquired_svc = objectiveFunctionSVC(x_acquired, y_replaced)
#print(y_acquired_svc)
y_acquired_mlp = objectiveFunctionMLP(x_acquired, y)
#print(y_acquired_mlp)

metrics_dict = {'gamma': 50, 'metric': 'rbf'}
gpr = GaussianProcessRegression(metrics_dict=metrics_dict)
gpr.fit(x_acquired, y_acquired_mlp)

x1_new = np.linspace(-1, 1.5, 150)
x_mesh,y_mesh = np.meshgrid(x1_new, x1_new)

x_cand = np.stack((x_mesh, y_mesh), axis=2).reshape(-1,2)
print(x_cand.shape)
means, stds = gpr.predict(x_cand, True)
scores = acquisition_ei(means, stds, max(y_acquired_mlp))
print(scores)
print(scores.mean())
print(scores.std())
print(scores.min())
print(scores.max())

nextidx = np.argsort(scores)
print("scores")
print(scores[nextidx[:282]])
print("cands")

x_new_acquired = x_cand[nextidx[:282]]
used_dict = {"used_x": x_new_acquired[:,0], "used_y": x_new_acquired[:,1]}
print(x_new_acquired)
used_df = pd.DataFrame(x_new_acquired)
#used_df.to_csv("used_cands.csv")
print("inv pca")
new_experiments = pca.inverse_transform(x_new_acquired)
print(new_experiments[:,:3])
print(new_experiments[:,:3].shape)
print(reverse_one_hot(new_experiments[:,:3], values))
print(reverse_one_hot(new_experiments[:,:3], values).shape)

print(new_experiments[:,3:])
print(new_experiments[:,3:].shape)
new_experiments = np.concatenate((reverse_one_hot_index(np.argmax(new_experiments[:,:3], axis=1), values), new_experiments[:,3:]), axis=1)
print(new_experiments.shape)
print(new_experiments)
new_expirments_df = pd.DataFrame(data=new_experiments, columns=full_data.columns[:-1])
print(new_expirments_df)
new_expirments_df.to_csv("SecondBatch.csv")
print()




(298, 10)
[0.53232334 0.24866738]
[11.12025736  7.60040082]
0.9225504911256364
1.078407404814835
-0.8330277335425136
-0.4116406831446476




(22500, 2)
[0.60224686 0.60224686 0.60224686 ... 0.60224686 0.60224686 0.60224686]
0.6317717119777465
0.4024299767344086
8.464606315375856e-73
6.029615630044231
scores
[8.46460632e-73 4.63409606e-68 4.94816537e-66 1.13235145e-65
 3.50213773e-65 1.00803398e-58 2.98848635e-58 3.45956615e-56
 2.04336271e-55 4.87375512e-55 4.31610265e-53 2.17708171e-52
 4.93567109e-51 3.54690783e-49 4.13765673e-48 6.14290096e-47
 9.87940239e-47 1.36014414e-43 6.57194116e-43 2.21724527e-42
 3.53386336e-41 6.36228634e-41 6.67757877e-41 1.25979534e-40
 3.23848164e-39 8.63045890e-39 1.20758807e-38 4.88664458e-38
 3.37938857e-37 1.37441969e-36 4.15180562e-35 4.19451401e-35
 6.01066353e-35 7.27232695e-33 1.15319324e-32 1.10093216e-31
 1.56805216e-31 1.72457228e-31 9.08557482e-31 1.62564447e-30
 1.69927613e-30 2.09795259e-30 6.67102539e-30 2.16467597e-29
 5.33455323e-29 7.10698889e-29 1.29548001e-28 4.28228054e-28
 6.10577181e-28 2.59209638e-27 3.29153113e-27 7.84126404e-27
 8.67062547e-27 1.21215297e-25 2.155054