In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Simulating the Markov Chain for a population of people (Data Generation)

def datagen(starting_populations, transition_matrix):

    time_steps = transition_matrix.shape[0]
    num_states = starting_populations.shape[1]
    num_days = starting_populations.shape[0]

    states = np.arange(num_states)

    populations = []
  
    for d in range(num_days):
        day_d = []
        day_d.append(starting_populations[d])
        prev_state_pop = starting_populations[d]
        
        for i in range(time_steps):
            states_i = np.zeros((num_states,))
            for j in range(transition_matrix[i].shape[0]):
                outcomes = np.random.choice(states, size=int(prev_state_pop[j]), p = transition_matrix[i, j])
                unique_values, counts = np.unique(outcomes, return_counts=True)
                
                for x, k in enumerate(unique_values):
                    states_i[k] += counts[x]
                
            # print("Populations at ", i+1, " o' clock, on day ", d, ": ", states_i)
            day_d.append(states_i)
            prev_state_pop = states_i
        populations.append(day_d)
        
    return np.array(populations)

In [3]:
# Creating the starting populations

x = 10  # Number of Datapoints
n = 3   # Number of States
k = 3000    # Average Total population

data_points = np.zeros((x, n), dtype=int)

for i in range(x):
    err = np.round(np.random.normal(0, 20)) # Adding Gaussian Noise
    k_new = k + err
    
    random_values = np.random.randint(0, k_new, n - 1)
    random_values = np.concatenate(([0], random_values, [k_new]))
    random_values.sort()

    data_points[i] = np.diff(random_values)

total_sum = np.sum(data_points, axis=1)

print("Generated data points:\n", data_points)
print("Sum of features in each data point:", total_sum)

Generated data points:
 [[1473  425 1089]
 [2020  115  901]
 [1265  599 1159]
 [1382   11 1578]
 [  95   29 2901]
 [2502  266  210]
 [ 611  942 1471]
 [1530 1027  435]
 [1589  154 1297]
 [ 215 2111  673]]
Sum of features in each data point: [2987 3036 3023 2971 3025 2978 3024 2992 3040 2999]


In [4]:
# data -> (day, time (2), state)

starting_populations = data_points
print(starting_populations.shape)

# Defining the transition matrices at the different times

transition_matrix_1 = np.array([[
    [0.3, 0.1, 0.6],
    [0.1, 0.3, 0.6],
    [0.1, 0.1, 0.8]
]])

transition_matrix_2 = np.array([[
    [0.1, 0.4, 0.5],
    [0.1, 0.5, 0.4],
    [0.1, 0.3, 0.6]
]])

transition_matrix_3 = np.array([[
    [0.15, 0.05, 0.8],
    [0.05, 0.15, 0.8],
    [0.05, 0.05, 0.9]
]])

# print(transition_matrix_1.shape)

data_1 = datagen(starting_populations, transition_matrix_1)
data_2 = datagen(starting_populations, transition_matrix_2)
data_3 = datagen(starting_populations, transition_matrix_3)

np.save('data_1.npy', data_1)
np.save('data_2.npy', data_2)
np.save('data_3.npy', data_3)

print(data_1.shape)


(10, 3)
(10, 2, 3)
