In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

import drugLookup

In [4]:
# Read in data (from pickle files)
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
# The `with` blocks make sure each file handle is closed once it has been read.
with open('train_set_filtered','rb') as file:
    train_set = pickle.load(file)

with open('test_set_filtered','rb') as file:
    test_set = pickle.load(file)

In [5]:
# Pull the drug-prediction and vendor-name columns out of each split as plain lists
d_train, v_train = (list(train_set[col]) for col in ('drug_prediction', 'vendor_name'))
d_test, v_test = (list(test_set[col]) for col in ('drug_prediction', 'vendor_name'))

## Compute $P(v~|~t)$

In [6]:
from datetime import datetime, date, time, timedelta

start = time()  # midnight (00:00:00): start time of the first interval
interval = 30   # length of each interval, in minutes

def addTimes(start_time, duration):
    """Return the time of day reached `duration` minutes after `start_time`.

    datetime.time objects do not support arithmetic, so the time is attached
    to a throwaway date (today), shifted by a timedelta, and only the
    time-of-day part of the result is returned. A result past midnight
    therefore wraps around to the start of the day.
    """
    anchored = datetime.combine(date.today(), start_time) # the date part is discarded below
    shifted = anchored + timedelta(minutes = duration)
    return shifted.time()

#  Function to check if time is in a time interval
def timeInInterval(start_time, check_time, duration):
    """Return True if `check_time` lies in [start_time, start_time + duration).

    Args:
        start_time: datetime.time at which the interval begins (inclusive).
        check_time: datetime.time to test.
        duration: length of the interval in minutes.

    Returns:
        bool. The end of the interval is exclusive. Intervals that run past
        midnight are handled: e.g. a 30 minute interval starting at 23:30
        contains 23:45, and a 60 minute interval starting at 23:30 also
        contains 00:15. A non-positive duration describes an empty interval
        (always False); a duration of 24 hours or more covers every time of
        day (always True).
    """
    if duration <= 0:
        return(False)
    if duration >= 24 * 60:
        return(True)

    end_time = addTimes(start_time, duration)
    if end_time < start_time:
        # addTimes wrapped past midnight, so the interval is the union of
        # [start_time, 24:00) and [00:00, end_time).
        return(start_time <= check_time or check_time < end_time)
    return(start_time <= check_time and check_time < end_time)

# Function to classify given time into one of given bins
def binTime(time1, start_times, duration = interval):
    """Return the start of the first bin in `start_times` that contains `time1`.

    `time1` must provide a .time() method; the time of day it returns is what
    gets binned. Each bin spans `duration` minutes from its start time. If no
    bin contains the time, the last entry of `start_times` is returned.
    """
    for candidate in start_times:
        if timeInInterval(candidate, time1.time(), duration):
            return(candidate)
    # No bin matched: fall back to the final bin
    return(start_times[-1])
    
# Create list of time intervals
# Get list of interval start times
slide_amount = interval # amount to slide window (if no overlap desired, set equal to interval)

# A new window starts every `slide_amount` minutes, so the number of windows in a
# 24 hour period is determined by the slide, not by the window length. (With
# slide_amount == interval this is the same count as 24*60/interval.)
n_intervals = int(24*60 // slide_amount)

# Get start times of all intervals in 24 hour period
interval_starts = [addTimes(start_time = start, duration = slide_amount * n) for n in range(n_intervals)]

In [7]:
# Label every training row with the start time of the bin its 'date' value falls in
train_set['time_bin'] = train_set['date'].apply(lambda stamp: binTime(stamp, start_times = interval_starts))

In [8]:
# Get probability for each vendor given a specific time
def getVendorsForTime(time_bin):
    """Return each vendor's share of the training rows in one time bin.

    Args:
        time_bin: value compared against train_set['time_bin'].

    Returns:
        Series indexed by vendor name. Each value is that vendor's row count
        within the bin divided by the bin's total row count. Vendors with no
        rows in the bin are absent from the result.
    """
    in_bin = train_set['time_bin'] == time_bin
    vendors = train_set.loc[in_bin, 'vendor_name']
    counts = vendors.groupby(vendors).count()
    return(counts/np.sum(counts))

In [9]:
# Get sorted list of vendors seen in the training data
_ , vendor_list = pd.factorize(v_train, sort = True)

# Build the table of vendor probabilities per time bin: one row per bin
# (indexed by 'time_bin'), one column per vendor.
# Each row is the Series returned by getVendorsForTime. The rows are aligned to
# vendor_list by *label* (reindex) rather than by column position, so the
# result does not depend on the order in which pandas happens to assemble the
# per-bin Series, and the frame ends up with a numeric dtype.
pvt = pd.DataFrame(
    [getVendorsForTime(time_bin) for time_bin in interval_starts],
    index = pd.Index(interval_starts, name = 'time_bin'),
)
# Vendors never seen in a bin (and bins with no samples at all) get probability 0
pvt = pvt.reindex(columns = vendor_list).fillna(0)

In [10]:
# For time values with no samples, set probability equal to probability of vendor overall (?)
# sum(pvt.iloc[7,1:])
# pvt.head()
# pvt['time_bin']

## Make Bayes predictions

In [11]:
# Get prior probabilities (i.e., share of training rows belonging to each vendor),
# ordered by vendor name
vendor_probs = (
    train_set['vendor_name']
    .value_counts()
    .pipe(lambda counts: counts/np.sum(counts))
    .sort_index()
)

In [12]:
# Map each time bin to its 10 highest-probability vendors, most probable first
pvt_dict = dict(
    (time_bin, list(pvt.loc[time_bin].sort_values(ascending = False)[:10].index))
    for time_bin in pvt.index
)

In [13]:
# Function to look up the stored prediction for each observation in a sequence
def bayesPredict(obs_seq):
    """Return the pvt_dict entry for every observation in `obs_seq`.

    Each observation is used as a key into the module-level `pvt_dict`; an
    observation that is not a key raises KeyError. The result is a list in
    the same order as `obs_seq`.
    """
    preds = []
    for obs in obs_seq:
        preds.append(pvt_dict[obs])
    return(preds)

# Bin the test rows by their 'date' value, using the same bins as the training data
test_set['time_bin'] = test_set['date'].apply(lambda stamp: binTime(stamp, start_times = interval_starts))
t_test = list(test_set['time_bin'])

# Attach the prediction looked up for each row's time bin
test_set['vendor_pred'] = bayesPredict(t_test)

In [14]:
# Compute 'top n' accuracy
def get_top_n(df, n):
    """Return the fraction of rows whose true vendor is among the first `n` predictions.

    Args:
        df: mapping/DataFrame with 'vendor_name' (true label per row) and
            'vendor_pred' (ordered sequence of predicted labels per row).
        n: how many leading predictions to consider.

    Returns:
        The hit rate, rounded to 4 decimal places.
    """
    truths, candidates = df['vendor_name'], df['vendor_pred']
    if n == 1:
        # Top-1: compare against the single best prediction
        hits = [truth == ranked[0] for truth, ranked in zip(truths, candidates)]
    else:
        hits = [truth in ranked[:n] for truth, ranked in zip(truths, candidates)]
    return(round(np.sum(hits)/len(hits), 4))

In [15]:
# Report the bin width, then one line per top-n accuracy
print('Time interval for bin:', interval, 'minutes.')
print('\n'.join(
    label + ' ' + str(get_top_n(test_set, top_n))
    for label, top_n in (
        ('Top 1 accuracy :', 1),
        ('Top 2 accuracy :', 2),
        ('Top 3 accuracy :', 3),
        ('Top 5 accuracy :', 5),
        ('Top 10 accuracy:', 10),
    )
))

Time interval for bin: 30 minutes.
Top 1 accuracy : 0.0816
Top 2 accuracy : 0.1444
Top 3 accuracy : 0.2023
Top 5 accuracy : 0.3025
Top 10 accuracy: 0.5261


## Perfect prediction

In [6]:
# Factorize the training 'date' column; only the distinct values are used afterwards
_, uniques = train_set['date'].factorize()

In [7]:
# Number of distinct 'date' values in the training set
len(uniques)

17570

In [8]:
# Total number of rows in the training set, for comparison with the distinct-date count
len(train_set)

24553

In [9]:
# 24 hours x 60 minutes (presumably the number of one-minute slots in a day)
24*60

1440