In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

## Load in data

In [2]:
# Read in the filtered training/testing sets (pickle files in the working directory).
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load files
# that come from a trusted source.
# The `with` blocks make sure each file handle is closed, even if unpickling fails.
with open('train_set_filtered', 'rb') as file:
    train_set = pickle.load(file)

with open('test_set_filtered', 'rb') as file:
    test_set = pickle.load(file)

# Combine training/testing data into one dataframe (training rows first)
data = pd.concat([train_set, test_set])

In [3]:
# Take the extremes of the 'date' column rather than the first/last row, so the
# result does not depend on the concatenated frame happening to be sorted by date.
start_date = data['date'].min() # Get oldest date
end_date = data['date'].max() # Get most recent date
elapsed_time = (end_date-start_date).days # Get elapsed time in whole days

## Get emission probabilities

In [17]:
# Function gets the share of each drug among one vendor's listings
def get_Drugs4Vendor(vendor_name, data_df=data, fv='vendor_name', fd='drug_prediction'):
    """Return the proportion of each drug among the rows belonging to one vendor.

    Args:
        vendor_name: value to match in column ``fv``.
        data_df: dataframe to search (defaults to the notebook-level ``data``
            as it was when this function was defined).
        fv: name of the vendor column.
        fd: name of the drug column.

    Returns:
        Series indexed by drug label; each value is that drug's count divided
        by the vendor's total count. Empty if the vendor has no rows.
    """
    is_vendor = data_df[fv] == vendor_name
    drugs_sold = data_df.loc[is_vendor, fd]
    # Group the drug column by its own values to count occurrences per drug
    counts = drugs_sold.groupby(drugs_sold).count()
    total = np.sum(counts)
    return counts / total

# Create emission probability table
def build_EmsProbTable(rows, cols, row_name='Vendor_Name'):
    """Build the vendor-by-drug emission probability table.

    Args:
        rows: iterable of vendor names; the table gets one row per entry.
        cols: iterable of drug labels; the table gets one column per entry,
            in the order given.
        row_name: name given to the table's index.

    Returns:
        DataFrame of floats indexed by vendor name (sorted, index named
        ``row_name``) with one column per entry of ``cols``. Each cell is the
        proportion returned by ``get_Drugs4Vendor`` for that vendor/drug;
        drugs a vendor never sold are 0.
    """
    rows = list(rows)
    cols = list(cols)

    # One proportion Series per vendor. As a dict of Series the frame comes out
    # drugs x vendors with the drug labels aligned by name, so transpose it.
    per_vendor = {vendor: get_Drugs4Vendor(vendor) for vendor in rows}
    ems_prob_table = pd.DataFrame(per_vendor).T

    # Align explicitly on labels. This avoids `iloc[:, 1:] = <DataFrame>`, whose
    # label-vs-position behaviour differs between pandas versions and can put
    # probabilities under the wrong drug column.
    ems_prob_table = ems_prob_table.reindex(index=rows, columns=cols)

    # Fill NA values (drug never sold by the vendor) with 0 and sort by vendor name
    ems_prob_table = ems_prob_table.fillna(0).astype(float).sort_index()

    # Name the index after `row_name` (previously hard-coded to 'Vendor_Name',
    # which raised KeyError for any other row_name).
    ems_prob_table.index.name = row_name
    ems_prob_table.columns.name = None
    return ems_prob_table

# Shift the ems_prob_table (dataframe) to array format
def convert_EmsProbTable(ems_prob_table):
    """Convert the emission probability table to a plain float array.

    Args:
        ems_prob_table: emission probability table (dataframe) whose cells are
            all numeric; object-dtype cells holding numbers are accepted.

    Returns:
        New 2-D ``numpy`` array of dtype float64 with the same shape and
        row/column order as the table.
    """
    # `np.float` was removed in NumPy 1.24; use the explicit float64 dtype.
    # np.array copies, so the result is independent of the dataframe.
    ems_prob = np.array(ems_prob_table, dtype=np.float64)
    return ems_prob

In [18]:
# Distinct vendors and drugs found in the data
vendor_list = pd.unique(data['vendor_name'])
drug_list = pd.unique(data['drug_prediction'])

# Emission probabilities: labelled table first, then the same values as a float array
ems_prob_table = build_EmsProbTable(rows=vendor_list, cols=drug_list, row_name='Vendor_Name')
ems_prob = convert_EmsProbTable(ems_prob_table)
ems_prob.shape

(190, 15)

## Compute lambda values for each vendor, for exponential distribution

In [4]:
# Rate parameter (lambda) of each vendor's exponential distribution: 1/E(x),
# i.e. the vendor's number of transactions divided by the elapsed days
vendor_col = data['vendor_name']
lambda_vals = vendor_col.groupby(vendor_col).count() / elapsed_time

## Functions to generate random sequence

In [194]:
import datetime

def get_exp_rv(lam):
    """Generate exponential random variables using inverse-transform sampling.

    Args:
        lam: rate parameter(s) of the distribution -- a scalar, an array or a
            pandas Series. Expected to be positive (not checked here).

    Returns:
        One sample per rate, shaped like ``lam``. When ``lam`` is a Series the
        result is a Series carrying the same index.
    """
    # np.shape also works for plain scalars, which have no `.shape` attribute
    U = np.random.uniform(low = 0, high = 1, size = np.shape(lam))
    # uniform() draws from [0, 1), so U may be exactly 0 and log(U) would be -inf.
    # 1 - U lies in (0, 1] and has the same distribution, so log1p(-U) is always finite.
    X = -np.log1p(-U) / lam
    return(X)

# Might need to generate different sequence for each month (distribution not constant over time)
# Function to generate artificial vendor sequence
def gen_rand_vendor_seq(n_samples = data.shape[0], start_date = datetime.datetime(2018,4,1)):
    """Simulate an artificial sequence of (time stamp, vendor, drug) records.

    At every step one exponential waiting time is drawn per vendor from
    ``lambda_vals``; the vendor with the shortest wait is recorded and that
    wait (interpreted as days) is added to the running time stamp. A drug is
    then drawn from that vendor's row of ``ems_prob_table``.

    Args:
        n_samples: number of records to generate.
        start_date: time stamp the sequence starts counting from.

    Returns:
        DataFrame with columns ``time_stamp``, ``vendor_name`` and ``drug``,
        one row per generated record, in generation order.
    """
    drug_labels = list(ems_prob_table.columns)  # drug categories, in table column order

    seq_vendors = []
    seq_stamps = []
    seq_drugs = []
    clock = start_date  # running time stamp

    for step in range(n_samples):
        # Progress message every 1000 records
        if not step % 1000:
            print('Finished generating', step, 'samples.')

        # One waiting time per vendor; the smallest one decides who sells next
        waits = get_exp_rv(lambda_vals)
        winner = waits.idxmin()
        seq_vendors.append(winner)

        # Advance the clock by the winning wait (timedelta's first argument is days)
        clock = clock + datetime.timedelta(days=waits.min())
        seq_stamps.append(clock)

        # Single multinomial trial over the winner's drug probabilities
        one_hot = np.random.multinomial(1, ems_prob_table.loc[winner])
        seq_drugs.append(drug_labels[one_hot.argmax()])

    return pd.DataFrame({'time_stamp' : seq_stamps, 'vendor_name': seq_vendors, 'drug' : seq_drugs})

## Generate the sequence

In [195]:
import time

# Seed NumPy's global RNG so the artificial sequence (and the files exported
# from it below) can be regenerated exactly on a fresh kernel.
np.random.seed(0)

# time2 - time1 is the wall-clock duration of the generation, in seconds
time1 = time.time()
df = gen_rand_vendor_seq(36000)
time2 = time.time()

Finished generating 0 samples.
Finished generating 1000 samples.
Finished generating 2000 samples.
Finished generating 3000 samples.
Finished generating 4000 samples.
Finished generating 5000 samples.
Finished generating 6000 samples.
Finished generating 7000 samples.
Finished generating 8000 samples.
Finished generating 9000 samples.
Finished generating 10000 samples.
Finished generating 11000 samples.
Finished generating 12000 samples.
Finished generating 13000 samples.
Finished generating 14000 samples.
Finished generating 15000 samples.
Finished generating 16000 samples.
Finished generating 17000 samples.
Finished generating 18000 samples.
Finished generating 19000 samples.
Finished generating 20000 samples.
Finished generating 21000 samples.
Finished generating 22000 samples.
Finished generating 23000 samples.
Finished generating 24000 samples.
Finished generating 25000 samples.
Finished generating 26000 samples.
Finished generating 27000 samples.
Finished generating 28000 samples

## Split into training and testing set

In [205]:
# Sizes: total number of generated rows, and how many go to training (75%)
n_samples = len(df)
n_train = round(0.75 * n_samples)

# Split by position: the first n_train rows are training, the remainder testing
train, test = df.iloc[:n_train], df.iloc[n_train:]

## Export to file

In [211]:
# Export to file
import pickle

# Full artificial sequence. The `with` blocks close each file even if
# pickle.dump raises part-way through.
with open('art_data.pkl', 'wb') as file:
    pickle.dump(df, file)

# Training portion
with open('art_train.pkl', 'wb') as file:
    pickle.dump(train, file)

# Testing portion
with open('art_test.pkl', 'wb') as file:
    pickle.dump(test, file)