In [8]:
import numpy as np

In [148]:
class DataGenerator:
    """Generate a synthetic data set of e-mail timings and write it as CSV.

    Every generated row holds:

    * ``X1``, ``X2`` -- one draw from a 2-D normal distribution (``mean``/``cov``),
      rounded to two decimals;
    * ``X3`` -- an integer drawn uniformly from 0..3;
    * ``M``  -- the number of e-mails in the row, uniform in
      1..``max_num_of_emails``;
    * ``TSnn``/``TOnn`` -- ``M`` pairs of ``HH:MM`` strings: a base time
      (presumably the send time) and the generated open time. Unused pairs are
      written as empty fields so every row has the same number of columns.

    For each e-mail a hidden class ``c`` in 0..3 is sampled from a distribution
    selected by ``(X1, X2, X3)``; ``c`` picks the entry of ``open_dist`` that
    drives the open time.

    Parameters
    ----------
    size : int
        Number of data rows to generate.
    path_to_file : str or path-like, optional
        Destination of the CSV file. Must be set before the instance is called.
    confusion_rate : float, optional
        Total probability mass assigned to the classes that are *not* typical
        for a feature group.
    seed : int, optional
        When given, a private ``numpy.random.RandomState(seed)`` is used so the
        output is reproducible. When omitted, the global ``numpy.random`` state
        is used, exactly as before this parameter existed.
    """

    def __init__(self, size, path_to_file=None, confusion_rate=0.07, seed=None):
        self.path_to_file = path_to_file
        self.size = size
        self.confusion_rate = confusion_rate
        # Historical misspelling, kept as an alias for code that still reads it.
        self.confustion_rate = confusion_rate
        self.mean = [.0, .0]
        self.cov = [[1., .0], [.0, 2.0]]
        # Not populated by any method of this class; kept for compatibility.
        self.datapoints = []
        # Private generator only when a seed was requested (see __random).
        self._rng = np.random.RandomState(seed) if seed is not None else None
        # One class distribution per feature group; the inner lists name the
        # classes that are typical (high probability) for that group.
        self.cls_distributions = [
            self.__build_c_dist(likely) for likely in [[0], [1, 3], [1, 2], [2, 3]]
        ]
        # Per class: [kind, exponential scale, normal mean, normal std].
        # kind == 0 -> base time plus an exponential delay, otherwise a normal
        # draw that does not depend on the base time.
        self.open_dist = [[0, 0.7, 0, 0], [1, 0, 9, 1], [1, 0, 13, 3], [1, 0, 21, 2]]
        self.max_num_of_emails = 15

    def __random(self):
        """Return the private generator if seeded, else the global ``np.random``."""
        return np.random if self._rng is None else self._rng

    def __get_c_dist_index(self, x, y, z):
        """Map the features ``(x, y, z)`` to an index into ``cls_distributions``."""
        if x < 0:
            return 0 if z in (0, 3) else 1
        return 2 if y > 0 else 3

    def __build_c_dist(self, classes):
        """Build a 4-class probability vector.

        The classes listed in ``classes`` share ``1 - confusion_rate`` equally;
        the remaining classes share ``confusion_rate`` equally.
        """
        eps_max = (1. - self.confusion_rate) / len(classes)
        eps_min = self.confusion_rate / (4 - len(classes))
        p = [eps_min] * 4
        for i in classes:
            p[i] = eps_max
        return p

    def __gen_open_time(self, t, c):
        """Draw an open time (hours, in ``[0, 24)``) for base time ``t`` and class ``c``."""
        dist = self.open_dist[c]
        rng = self.__random()
        if dist[0] == 0:
            topen = t + rng.exponential(dist[1])
        else:
            topen = rng.normal(dist[2], dist[3])
        # Wrap onto the 24h clock. Unlike the former ``if topen > 24`` test this
        # also handles exactly 24, negative draws and values of 48 or more.
        return topen % 24

    def __time_to_s(self, t):
        """Format a time given in fractional hours as a zero-padded ``HH:MM`` string."""
        t = t % 24  # guard against out-of-range input producing e.g. "00:-30"
        h = int(t)
        m = int((t - h) * 60)
        # ``h % 24`` covers the float corner case where ``t % 24`` rounds to 24.0.
        return "%02d:%02d" % (h % 24, m)

    def __call__(self):
        """Generate ``size`` rows and write them, with a header line, to ``path_to_file``.

        Raises
        ------
        TypeError
            If ``path_to_file`` was never set (it defaults to ``None``).
        """
        if self.path_to_file is None:
            # Same exception type ``open(None, 'w')`` raised before, but raised
            # up front with an actionable message.
            raise TypeError("path_to_file must be set before generating data")

        rng = self.__random()
        # The order of the random draws is kept unchanged so that runs relying
        # on the global numpy seed see the same random stream as before.
        x, y = rng.multivariate_normal(self.mean, self.cov, self.size).T
        z = rng.choice(4, self.size)
        m = rng.choice(self.max_num_of_emails, self.size) + 1

        head = ["X1", "X2", "X3", "M"]
        for k in range(self.max_num_of_emails):
            head += ["TS%02d" % k, "TO%02d" % k]

        with open(self.path_to_file, 'w') as f:
            f.write(",".join(head) + "\n")
            for i in range(self.size):
                p = self.cls_distributions[self.__get_c_dist_index(x[i], y[i], z[i])]
                n_mails = int(m[i])  # plain int: safe for range() and list repetition
                dp = [round(x[i], 2), round(y[i], 2), z[i], n_mails]
                for _ in range(n_mails):
                    c = rng.choice(4, p=p)
                    t = rng.choice(14) + 7  # whole hour between 07:00 and 20:00
                    dp.append(self.__time_to_s(t))
                    dp.append(self.__time_to_s(self.__gen_open_time(t, c)))
                # Pad with empty fields so every row matches the header width.
                dp += [''] * (2 * (self.max_num_of_emails - n_mails))
                f.write(",".join(str(v) for v in dp) + "\n")

    
    
    

In [154]:
# Configure a generator for 100000 rows whose CSV output goes to "data.csv".
gen = DataGenerator(size=100000, path_to_file="data.csv")

In [155]:
# Calling the generator runs DataGenerator.__call__, which writes the header
# and all generated rows to the configured path ("data.csv").
gen()

In [156]:
import pandas

In [157]:
# Load the CSV file written by the generator back into a DataFrame for inspection.
df = pandas.read_csv("data.csv")

In [158]:
# Bare expression: the notebook renders the frame (the output below is
# truncated to the first and last rows, with "..." for the elided part).
df

Unnamed: 0,X1,X2,X3,M,TS00,TO00,TS01,TO01,TS02,TO02,...,TS10,TO10,TS11,TO11,TS12,TO12,TS13,TO13,TS14,TO14
0,-1.25,-0.88,2,14,14:00,22:27,18:00,14:37,13:00,09:39,...,10:00,21:29,10:00,08:47,15:00,10:06,10:00,09:14,,
1,-0.14,-3.21,0,1,17:00,17:11,,,,,...,,,,,,,,,,
2,1.23,0.08,3,1,19:00,09:22,,,,,...,,,,,,,,,,
3,-0.38,-1.30,3,2,12:00,12:38,08:00,09:04,,,...,,,,,,,,,,
4,-1.50,0.22,2,6,14:00,21:59,11:00,17:01,15:00,23:47,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.53,-3.02,0,3,08:00,08:37,11:00,11:00,08:00,08:15,...,,,,,,,,,,
99996,-1.37,0.71,3,7,07:00,07:16,20:00,20:05,11:00,12:42,...,,,,,,,,,,
99997,-1.27,-0.70,3,11,10:00,10:01,16:00,16:24,12:00,12:49,...,09:00,09:38,,,,,,,,
99998,-0.62,2.95,1,4,07:00,09:07,08:00,21:46,09:00,09:33,...,,,,,,,,,,


In [159]:
# (Stray "?" IPython help query removed: interactive leftover, not part of the analysis.)
