In [None]:
%load_ext autoreload
%autoreload 2


import json
import os
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
import random
from math import floor

from ScheduleDistributionSampler import ScheduleDistributionSampler, activity_map, start_times, KLdivergence


### Original data from AMT

In [None]:
# Load the per-individual histograms: individual id -> activity -> per-slot values.
with open('data/personaBasedSchedules/corrected_histograms.json') as f:
    corrected_histograms = json.load(f)

# Unique activity labels without the `None` placeholder. Sorting gives a stable
# order across kernel restarts (the iteration order of a set of strings is not),
# and filtering avoids the ValueError `list.remove(None)` raises when None is absent.
activities = sorted({act for act in activity_map.values() if act is not None})

# One row per individual, one column per activity. squeeze=False keeps the axes
# grid two-dimensional even with a single individual or a single activity.
fig, ind_plot = plt.subplots(len(corrected_histograms), len(activities),
                             sharex=True, sharey=True, squeeze=False)
fig.set_size_inches(80, 50)

# NOTE(review): columns follow the key order of each individual's histogram dict,
# not the order of `activities` - assumes every individual lists the same
# activities in the same order; confirm against the data file.
for i, (indiv_id, act_hists) in enumerate(corrected_histograms.items()):
    for j, (activity, data) in enumerate(act_hists.items()):
        ax = ind_plot[i][j]
        ax.bar(start_times, data)
        ax.set_yticks([])
        ax.set_xticks([])
        if i == 0:
            # Column titles only on the first row; break long names over lines.
            ax.set_title(activity.replace('_', '\n'))
    # Label each row with the individual's id (`indiv_id` avoids shadowing builtin `id`).
    ind_plot[i][0].set_ylabel(indiv_id, rotation=0, labelpad=100)

In [None]:
def plot_together(list_of_freqs, ax):
    """Draw several frequency lists on one axes, one horizontal row per list.

    For the list at position ``row`` a '.' marker is plotted at every start time
    whose value equals 1, at height ``row``. A list that sums to zero also gets
    a thin light-grey line across all start times so the empty row stays visible.

    Args:
        list_of_freqs: iterable of per-slot value lists aligned with ``start_times``.
        ax: object exposing a matplotlib-style ``plot`` method.
    """
    for row, freqs in enumerate(list_of_freqs):
        slots = np.array(start_times)
        active = slots[np.array(freqs) == 1]
        ax.plot(active, active * 0 + row, '.')
        if sum(freqs) == 0:
            ax.plot(slots, slots * 0 + row, linewidth=0.3, color=[0.8, 0.8, 0.8])

### Clustering individuals

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

# Per-activity, per-individual feature vectors used for clustering:
# [morning count, afternoon count, evening count, larger GMM mean, smaller GMM mean].
individual_features = {}

# 0/1 masks over the histogram slots, built from hours 6..23.
# NOTE(review): assumes every histogram has 18 slots matching hours 6..23 -
# confirm against `start_times`.
morning = np.array([1 if i<12 else 0 for i in range(6,24)])
afternoon = np.array([1 if i>=12 and i<18 else 0 for i in range(6,24)])
evening = np.array([1 if i>=18 else 0 for i in range(6,24)])

for act in activities:
    individual_features[act] = {}
    for indiv, act_hist in corrected_histograms.items():
        this_act = np.array(act_hist[act])
        # Indices of the slots in which this individual has a non-zero entry.
        seq = np.flatnonzero(this_act)
        if len(seq) > 1:
            # Two-component mixture over the active slot indices. A fixed
            # random_state makes the features (and the clusters derived from
            # them) reproducible, matching the seeded KMeans used afterwards.
            gm = GaussianMixture(n_components=2, random_state=0).fit(seq.reshape(-1,1))
            # Component means in descending order.
            gm_means = np.sort(gm.means_.reshape(-1))[::-1]
        elif len(seq) == 1:
            # A single active slot: both "means" are that slot index.
            gm_means = np.full(2, float(seq[0]))
        else:
            # No active slot: sentinel placed well outside the valid index range.
            gm_means = np.full(2, -24.0)
        feat = [np.sum(morning*this_act), np.sum(afternoon*this_act), np.sum(evening*this_act)]
        feat += list(gm_means)
        individual_features[act][indiv] = np.array(feat)

In [None]:
cluster_histograms = {}
n_clusters = 4
# Clusters with fewer members than this are discarded (equivalent to the former `> 3` check).
min_cluster_size = 4
histogram_dir = 'data/personaBasedSchedules/histograms/'

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(histogram_dir, exist_ok=True)

for activity in activities:
    individual_names = list(individual_features[activity].keys())
    individual_features_array = np.array(list(individual_features[activity].values()))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(individual_features_array)

    # Group individual names by their k-means label.
    clusters = [
        [name for name, label in zip(individual_names, kmeans.labels_) if label == n]
        for n in range(n_clusters)
    ]

    # Keep only sufficiently populated clusters and average their members'
    # histograms slot by slot. Averaging only the kept clusters avoids taking
    # the mean of an empty array for clusters that received no members.
    clusters_used = [cluster for cluster in clusters if len(cluster) >= min_cluster_size]
    cluster_histograms[activity] = [
        list(np.array([corrected_histograms[ind][activity] for ind in cluster]).mean(axis=0))
        for cluster in clusters_used
    ]

    if not clusters_used:
        # plt.subplots rejects a zero-column grid; there is nothing to draw.
        # NOTE(review): an activity left with no clusters also leaves later
        # persona sampling with no options for it - confirm this cannot happen
        # with the real data.
        continue

    # Top row: members' raw entries; bottom row: the cluster average.
    fig, axs = plt.subplots(2, len(clusters_used), sharex=True, squeeze=False)
    fig.set_size_inches(20, 5)
    fig.suptitle(f'{activity}')
    for i, (avg, cluster) in enumerate(zip(cluster_histograms[activity], clusters_used)):
        axs[1][i].plot(start_times, avg, '-.k')
        axs[1][i].set_ylim([0, 1])
        plot_together([corrected_histograms[ind][activity] for ind in cluster], axs[0][i])
        # Cluster size over the number of individuals (previously a hard-coded 21).
        axs[0][i].set_title(f'{len(cluster)}/{len(individual_names)}')
    fig.tight_layout()
    fig.savefig(histogram_dir + activity + '.jpg')


with open('data/personaBasedSchedules/cluster_histograms.json','w') as f:
    json.dump(cluster_histograms, f, indent=4)


### Optimize for distinct personas

In [None]:
# Reload the saved cluster histograms so the optimization cells can run from
# the JSON file alone; the activity list is taken from its top-level keys.
with open('data/personaBasedSchedules/cluster_histograms.json') as f:
    cluster_histograms = json.load(f)
activities = [*cluster_histograms]

In [None]:
# Genetic-search sizes:
#   P - personas per candidate,
#   N - candidates in the initial pool,
#   K - default number of best candidates kept by get_best_k.
P, N, K = 5, 20, 10

In [None]:
def get_histogram(persona):
    """Return ``{activity: histogram}`` for the clusters selected by ``persona``.

    ``persona`` holds one cluster index per activity, paired positionally with
    ``activities``; pairing stops at the shorter of the two sequences.
    """
    histogram = {}
    for act, choice in zip(activities, persona):
        histogram[act] = cluster_histograms[act][choice]
    return histogram
    
def get_random_persona():
    """Draw a random persona: one uniformly chosen cluster index per activity.

    The result is ordered like ``activities``; each entry indexes into
    ``cluster_histograms[activity]``.
    """
    return [
        random.choice(np.arange(len(cluster_histograms[act])))
        for act in activities
    ]

def get_candidate():
    """Build one candidate: a list of ``P`` independently drawn random personas."""
    personas = []
    for _ in range(P):
        personas.append(get_random_persona())
    return personas

def get_initial_pool():
    """Build the starting population: a list of ``N`` freshly generated candidates."""
    pool = []
    for _ in range(N):
        pool.append(get_candidate())
    return pool

def _last_slot_below_half(histogram):
    """Return the last index whose cumulative sum is below 0.5, or -1 if none is."""
    below = np.flatnonzero(np.cumsum(histogram) < 0.5)
    return int(below[-1]) if below.size else -1


def valid(persona):
    """Check that a persona's 'leave_home' and 'come_home' choices are consistent.

    The persona is valid when it either never leaves and never comes home
    (both selected histograms sum to zero), or does both with the come-home
    histogram accumulating mass later than the leave-home one. "Later" is
    measured by the last slot whose cumulative histogram sum is still below 0.5.

    A histogram whose very first slot already reaches 0.5 has no such slot; it
    is treated as index -1 (earlier than every real slot) instead of raising.

    NOTE(review): the histograms are not normalized here, so the 0.5 threshold
    is an absolute cumulative value rather than a median - confirm this is the
    intended measure.

    Args:
        persona: sequence of cluster indices aligned with ``activities``.

    Returns:
        1.0 if the persona is valid, 0.0 otherwise.

    Raises:
        ValueError: if 'leave_home' or 'come_home' is not in ``activities``.
    """
    leave_hist = cluster_histograms['leave_home'][persona[activities.index('leave_home')]]
    come_hist = cluster_histograms['come_home'][persona[activities.index('come_home')]]

    never_leaves = np.sum(leave_hist) == 0
    never_comes = np.sum(come_hist) == 0
    if never_leaves and never_comes:
        return 1.0
    if never_leaves or never_comes:
        # Leaving without returning (or the reverse) is inconsistent.
        return 0.0

    return float(_last_slot_below_half(come_hist) > _last_slot_below_half(leave_hist))

def fittness_matrix(candidate):
    """Return the pairwise divergence matrix between the personas of a candidate.

    Entry ``[r, c]`` is ``KLdivergence`` of persona ``r``'s histogram against
    persona ``c``'s histogram; the diagonal is left at zero.
    """
    n_personas = len(candidate)
    divergences = np.zeros((n_personas, n_personas))
    for row in range(n_personas):
        for col in range(n_personas):
            if row != col:
                divergences[row][col] = KLdivergence(
                    get_histogram(candidate[row]), get_histogram(candidate[col])
                )
    return divergences

def fitness(candidate):
    """Score a candidate: mean pairwise divergence, zeroed if any persona is invalid.

    The mean is taken over the full matrix from ``fittness_matrix`` (zero
    diagonal included) and multiplied by the product of ``valid`` over all
    personas of the candidate.
    """
    mean_divergence = np.mean(fittness_matrix(candidate))
    validity = 1
    for persona in candidate:
        validity = validity * valid(persona)
    return mean_divergence * validity

def get_pool_fitness(pool):
    """Return an array with the ``fitness`` of every candidate, in pool order."""
    scores = []
    for candidate in pool:
        scores.append(fitness(candidate))
    return np.array(scores)

def get_best_k(pool, k=K):
    """Select the ``k`` fittest candidates of ``pool``.

    Returns:
        A pair ``(candidates, fitnesses)``: the chosen candidates as a list and
        their fitness values as an array, both ordered from best to worst.
    """
    scores = get_pool_fitness(pool)
    # Ascending argsort reversed gives descending fitness order.
    ranking = np.argsort(scores)[::-1]
    winners = ranking[:k]
    survivors = [pool[idx] for idx in winners]
    return survivors, scores[winners]

def mate(parents):
    """Produce one child per parent by single-point crossover.

    The parents are shuffled twice: the first ordering donates the leading
    ``P // 2`` personas of each child, the second ordering donates the rest.
    Because the two shuffles are independent, a child may take both parts from
    the same parent.

    The shuffling is done on a private copy, so the caller's ``parents`` list
    keeps its order. Children are new lists but share their persona objects
    with the parents.

    Args:
        parents: list of candidates (each a list of personas).

    Returns:
        A list of ``len(parents)`` child candidates.
    """
    shuffled = list(parents)  # copy: do not reorder the caller's list
    cut = P // 2

    random.shuffle(shuffled)
    heads = [candidate[:cut] for candidate in shuffled]

    random.shuffle(shuffled)
    return [head + candidate[cut:] for head, candidate in zip(heads, shuffled)]

def mutate(pool, mutation_rate=0.05):
    """Randomly replace one persona in some candidates of ``pool``, in place.

    Each candidate is mutated independently with probability ``mutation_rate``;
    a mutated candidate has one uniformly chosen persona replaced by a fresh
    random persona.

    Args:
        pool: list of candidates; the candidates are modified in place.
        mutation_rate: per-candidate mutation probability (default 0.05, the
            previously hard-coded value).

    Returns:
        The same ``pool`` object, for call chaining.
    """
    for candidate in pool:
        # Draw first so every candidate consumes one random number; the
        # emptiness guard only protects randrange from an empty candidate.
        if random.random() < mutation_rate and candidate:
            # Index by the candidate's own length rather than the global P.
            candidate[random.randrange(len(candidate))] = get_random_persona()
    return pool
    

In [None]:
# Genetic search, restarted several times; the best candidate over all
# restarts is kept in `candidate`.
n_restarts = 5
n_iterations = 1000
log_every = 100

best_finds = []
best_fitnesses = []
for rst in range(n_restarts):
    # A distinct but fixed seed per restart keeps the search reproducible
    # under Restart & Run All while the restarts still explore differently.
    random.seed(rst)
    avg_fitness = []
    pool = get_initial_pool()
    # `iteration` rather than `iter`, which would shadow the builtin for the
    # rest of the kernel session.
    for iteration in range(n_iterations):
        parents, fitnesses = get_best_k(pool)
        avg_fitness.append(np.mean(fitnesses))
        if iteration % log_every == 0:
            print(f'Iteration {iteration} : {avg_fitness[-1]}')
        children = mate(parents)
        children = mutate(children)
        # Next generation: the surviving parents plus their offspring.
        pool = parents + children

    plt.plot(avg_fitness, label=f'run {rst}')

    best_candidate, best_fittness = get_best_k(pool, 1)
    best_candidate = best_candidate[0]
    # Plain ints so the result can be written to JSON.
    result = [[int(idx) for idx in cand] for cand in best_candidate]
    best_finds.append(result)
    # Store the scalar fitness, not the 1-element array returned by get_best_k.
    best_fitnesses.append(float(best_fittness[0]))

plt.legend()
best_rst = int(np.argmax(best_fitnesses))
candidate = best_finds[best_rst]
fittness_matrix(candidate)

In [None]:
# Save the best candidate across ALL restarts. `candidate` is the one selected
# via the highest final fitness, whereas `result` only holds the best
# candidate of the last restart.
persona = {
    f'persona{i}': dict(zip(activities, cand))
    for i, cand in enumerate(candidate)
}

with open('data/personaBasedSchedules/optimized_persona.json', 'w') as f:
    json.dump(persona, f, indent=4)

In [None]:
# Plot every optimized persona. Looping over P keeps this cell in sync with
# the number of personas per candidate instead of hard-coding five calls.
for persona_idx in range(P):
    ScheduleDistributionSampler(type=f'persona{persona_idx}').plot()