Predict vendor from artificially-generated sequence using Bayes' method.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

In [2]:
# Import data
# Each pickle is read inside a `with` block so the file handle is closed even
# if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load these .pkl files if they come from a trusted source.
import pickle

# Full artificial data set
with open('art_data.pkl', 'rb') as file:
    data = pickle.load(file)

# Training split (read by the model-fitting code)
with open('art_train.pkl', 'rb') as file:
    train = pickle.load(file)

# Test split (read by the evaluation code)
with open('art_test.pkl', 'rb') as file:
    test = pickle.load(file)

## "Train" Bayes' model

In [170]:
# Empirical distribution of one column conditioned on a value of another
def get_cond_probDist(category, var = 'vendor_name', given_var = 'drug'):
    """Return the empirical distribution of `var` given `given_var == category`.

    Reads the module-level `train` DataFrame.

    Args:
        category: the value of `given_var` to condition on.
        var: name of the column whose distribution is computed.
        given_var: name of the column that is held fixed.

    Returns:
        Series indexed by the values of `var` that occur together with
        `category`; each entry is that value's share of the matching rows.
    """
    # Keep only the training rows where the conditioning column matches
    matching_rows = train[train[given_var] == category]

    # Count the matching rows per value of `var`
    counts = matching_rows[given_var].groupby(matching_rows[var]).count()

    # Normalise the counts into proportions
    total = np.sum(counts)
    return counts / total

# Create dictionary to predict using Bayes' Rule (i.e. using conditional probability)
def trainBayes(var = 'vendor_name', given_var = 'drug', df = None):
    """Build a lookup of ranked `var` outcomes for every `given_var` outcome.

    For each distinct value of `given_var`, the empirical conditional
    distribution P(var | given_var) is computed from the training rows and
    sorted from most to least probable.

    Args:
        var: name of the column to predict.
        given_var: name of the column that is observed.
        df: DataFrame to train on. When None, the module-level `train`
            DataFrame is used.

    Returns:
        dict mapping each outcome of `given_var` to a dict with two keys:
            'vendor': outcomes of `var`, sorted by descending probability
            'prob':   the probability of each entry in 'vendor'
        Outcomes of `var` never seen with a given key appear with
        probability 0.
    """
    if df is None:
        # Looked up at call time so a re-loaded `train` is picked up
        df = train

    # Distinct outcomes in order of first appearance (missing values skipped)
    given_outcomes = df[given_var].dropna().unique()
    var_outcomes = df[var].dropna().unique()

    # Row-normalised contingency table: one row per `given_var` outcome,
    # one column per `var` outcome, each row summing to 1
    cond_prob_df = pd.crosstab(df[given_var], df[var], normalize='index')

    # Align by label (never by position) so each probability stays attached
    # to its own outcome; combinations with no rows get probability 0
    cond_prob_df = cond_prob_df.reindex(index=given_outcomes,
                                        columns=var_outcomes,
                                        fill_value=0.0)

    # Rank the outcomes of `var` for every row; each row is sorted once and
    # a stable sort keeps tied outcomes in a deterministic order
    bayes_dict = {}
    for outcome, probs in cond_prob_df.iterrows():
        ranked = probs.sort_values(ascending=False, kind='mergesort')
        bayes_dict[outcome] = {'vendor': ranked.index.tolist(),
                               'prob': ranked.tolist()}

    return bayes_dict

## Evaluate model

In [171]:
# Fit the model once with the default columns; bayesPredict reads this
# module-level name when making predictions.
bayes_model = trainBayes()

# Look up the ranked vendor list for every observation in a sequence
def bayesPredict(obs_seq):
    """Return the ranked vendor list for each observation in `obs_seq`.

    Reads the module-level `bayes_model` dictionary.

    Args:
        obs_seq: iterable of observations; each must be a key of
            `bayes_model`.

    Returns:
        list with one entry per observation, each entry being the value
        stored under bayes_model[observation]['vendor'].

    Raises:
        KeyError: if an observation is not a key of `bayes_model`.
    """
    ranked_vendors = []
    for observation in obs_seq:
        ranked_vendors.append(bayes_model[observation]['vendor'])
    return ranked_vendors

# Collect the 'drug' column of the test set as a plain list of observations
d_test = list(test['drug'])

# Make predictions: one list of vendors per test row, stored in a new
# 'vendor_pred' column (this adds the column to `test` in place)
test['vendor_pred'] = bayesPredict(d_test)

In [172]:
# Compute 'top n' accuracy
def get_top_n(n, df=None):
    """Return the share of rows whose true vendor is among the top `n` predictions.

    Args:
        n: number of leading entries of each prediction list to consider.
        df: table with a 'vendor_name' column (true label) and a
            'vendor_pred' column (ranked list of predicted labels). When
            None, the module-level `test` DataFrame is used.

    Returns:
        Accuracy rounded to 4 decimal places, or NaN when `df` has no rows.
    """
    if df is None:
        # Looked up at call time, so a re-loaded `test` is always the one scored
        df = test

    # A row is a hit when its true label appears in the first n predictions;
    # slicing also copes with prediction lists shorter than n (even empty)
    hits = [truth in ranked[:n]
            for truth, ranked in zip(df['vendor_name'], df['vendor_pred'])]

    if not hits:
        # No rows to score: accuracy is undefined
        return float('nan')

    return round(sum(hits) / len(hits), 4)

In [173]:
# Report accuracy at several cut-offs: a row counts as correct when its true
# vendor is among the first n entries of its 'vendor_pred' list.
n_vals = [1,5,10]
for n in n_vals:
    print('Top',n,'accuracy:',get_top_n(n))

Top 1 accuracy: 0.1212
Top 5 accuracy: 0.4303
Top 10 accuracy: 0.6517
