In [1]:
%matplotlib inline
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import utils
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from time import time

In [2]:
visitors, events, devices, url_categories = utils.load_data(event_categories=True)
#categorized_events = utils.categorize_events(events, url_categories)

  if self.run_code(code, result):
  if self.run_code(code, result):


In [3]:
def define_use_case(event):
    if event.site_category == 'Publisher':
        return 'Publication Research'
    elif event.site_category in ['Reference Tool', 'Medical Education', 'Multi-Channel Marketing']:
        return 'Education/Tools'
    elif event.site_category in ['Pharma', 'Med Device']:
        if event.site_sub_category.strip() in ['Branded HCP', 'Branded Consumer']:
            return 'Branded Pharma'
        elif event.site_sub_category.strip() in ['Unbranded HCP', 'Unbranded Consumer']:
            return 'Unbranded Pharma'
    elif event.site_category in ['Professional Social', 'Medical Association', 'Recruiter']:
        return 'Professional/Social Media'
    else:
        return 'Other'

In [4]:
events['use_case'] = events.apply(lambda x: define_use_case(x), axis=1)
# this is very slow lol

In [5]:
use_cases = {
    'publication_research' : ['Publisher'],
    'education_tools' : ['Reference Tool', 'Medical Education', 'Multi-Channel Marketing'],
    'pharma' : ['Pharma', 'Med Device'],
    'social_professional' : ['Professional Social', 'Medical Association', 'Recruiter']
}

for u in use_cases:
    events.loc[events.site_category.isin(use_cases[u]), 'use_case'] = u
events.loc[events.use_case=='', 'use_case'] = 'other'

In [6]:
# 3 dimensions likely more useful for cluster analysis

publisher_tools = ['Publisher', 'Reference Tool', 'Medical Education', 'Multi-Channel Marketing']
pharma = ['Pharma', 'Med Device']
social = ['Professional Social', 'Medical Association', 'Recruiter']

events['use_case'] = ''
events.loc[events.site_category.isin(publisher_tools), 'use_case'] = 'publications_ed_tools'
events.loc[events.site_category.isin(pharma), 'use_case'] = 'pharma_device'
events.loc[events.site_category.isin(social), 'use_case'] = 'professional_social_media'
events.loc[events.use_case=='', 'use_case'] = 'other'

In [7]:
sessions = events.groupby('session_id').event_id.count().reset_index()
sessions = sessions.rename(columns={'event_id':'page_views'})
event_sessions = events.drop_duplicates('session_id')
event_sessions = pd.merge(event_sessions, sessions, on='session_id')
event_sessions = pd.merge(visitors, event_sessions, on='dg_id')

# export to tableau (if needed)
#tableau_sessions = event_sessions[['timestamp', 'dg_id', 'npi_number', 'primary_specialty', 'site_category', 'site_sub_category', 'disease_category', 'disease', 'pharma_firm', 'use_case', 'page_views']]
#tableau_sessions.to_csv('../data/use_case_sessions.csv')

### PERSONAS

In [8]:
personas = event_sessions.groupby('dg_id').use_case.apply(list).reset_index()
personas = pd.merge(personas, visitors[['dg_id', 'primary_specialty']], on='dg_id')
personas['total_sessions'] = personas.use_case.apply(lambda x: len(x))
personas['use_case_counts'] = personas.use_case.apply(lambda x: Counter(x))

In [9]:
personas['pharma'] = personas.use_case_counts.apply(lambda x: x['pharma_device'])
personas['publications'] = personas.use_case_counts.apply(lambda x: x['publications_ed_tools'])
personas['social'] = personas.use_case_counts.apply(lambda x: x['professional_social_media'])

In [10]:
personas['total_3d'] = personas.apply(lambda x: x.pharma + x.publications + x.social, axis=1)
# exclude folks with only 'other' category (will address eventually)
personas = personas[personas.total_3d > 0]

### Cluster without specialty

In [11]:
# normalize per user
personas['pharma_pct'] = personas.apply(lambda x: x.pharma/x.total_3d, axis=1)
personas['pubs_pct'] = personas.apply(lambda x: x.publications/x.total_3d, axis=1)
personas['social_pct'] = personas.apply(lambda x: x.social/x.total_3d, axis=1)

In [12]:
personas.head()

Unnamed: 0,dg_id,use_case,primary_specialty,total_sessions,use_case_counts,pharma,publications,social,total_3d,pharma_pct,pubs_pct,social_pct
1,R0000000026,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0
2,R0000000098,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0
3,R0000000358,[professional_social_media],,1,{'professional_social_media': 1},0,0,1,1,0.0,0.0,1.0
4,R0000000651,"[publications_ed_tools, publications_ed_tools,...",,20,{'publications_ed_tools': 20},0,20,0,20,0.0,1.0,0.0
5,R0000000720,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0


In [55]:
# rotate plot
theta = np.arctan(np.sqrt(2))
personas['pharma_pct_rot'] = personas.apply(lambda x: -np.sqrt(2)/2*(x.pharma_pct-x.pubs_pct), axis=1)
personas['pubs_pct_rot'] = personas.apply(lambda x: (-np.sqrt(2)/2*np.cos(theta)*(x.pharma_pct+x.pubs_pct)+x.social_pct*np.sin(theta)+.408248), axis=1)
personas['social_pct_rot'] = personas.apply(lambda x: np.sqrt(2)/2*(x.pharma_pct*np.sin(theta)+x.pubs_pct*np.sin(theta))+x.social_pct*np.cos(theta)-0.57735, axis=1)


In [56]:
personas.head(30)

Unnamed: 0,dg_id,use_case,primary_specialty,total_sessions,use_case_counts,pharma,publications,social,total_3d,pharma_pct,pubs_pct,social_pct,pharma_pct_rot,pubs_pct_rot,social_pct_rot
1,R0000000026,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
2,R0000000098,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
3,R0000000358,[professional_social_media],,1,{'professional_social_media': 1},0,0,1,1,0.0,0.0,1.0,-0.0,1.224745,2.691896e-07
4,R0000000651,"[publications_ed_tools, publications_ed_tools,...",,20,{'publications_ed_tools': 20},0,20,0,20,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
5,R0000000720,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
6,R0000000823,[publications_ed_tools],,1,{'publications_ed_tools': 1},0,1,0,1,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
7,R0000000996,"[publications_ed_tools, publications_ed_tools,...",,17,{'publications_ed_tools': 17},0,17,0,17,0.0,1.0,0.0,0.707107,-2.904639e-07,2.691896e-07
8,R0000002204,"[professional_social_media, professional_socia...",,3,{'professional_social_media': 3},0,0,3,3,0.0,0.0,1.0,-0.0,1.224745,2.691896e-07
9,R0000002268,"[pharma_device, publications_ed_tools]",,2,"{'pharma_device': 1, 'publications_ed_tools': 1}",1,1,0,2,0.5,0.5,0.0,-0.0,-2.904639e-07,2.691896e-07
10,R0000002822,"[professional_social_media, professional_socia...",,2,{'professional_social_media': 2},0,0,2,2,0.0,0.0,1.0,-0.0,1.224745,2.691896e-07


In [59]:
tableau_personas = personas[['primary_specialty', 'pharma', 'publications', 'social', 'pharma_pct_rot', 'pubs_pct_rot', 'social_pct_rot']]

In [60]:
tableau_personas.to_csv('../../data/personas_rot.csv')

In [9]:
df = personas[['pharma_pct', 'pubs_pct', 'social_pct']]
#dfx = personas[personas.total_3d>5][['pharma_pct', 'pubs_pct', 'social_pct']]

In [13]:
scatter = dict(
    mode = "markers",
    name = "y",
    type = "scatter3d",    
    x = dfx['pharma_pct'], y = dfx['pubs_pct'], z = dfx['social_pct'],
    marker = dict( size=2, color="rgb(23, 190, 207)" )
)

layout = dict(
    title = '3d point clustering',
    scene = dict(
        xaxis = dict(title = "pharma"),
        yaxis = dict(title = "publications"),
        zaxis = dict(title = "social"),
    )
)
fig = dict( data=[scatter], layout=layout )
# Use py.iplot() for IPython notebook
plotly.plotly.iplot(fig, filename='Use Cases')

### Specialties included

In [19]:
specialty_map = {
    'Internal Medicine (IM)': 'pcp',
    'Family Medicine (FM)': 'pcp',
    'Family Medicine': 'pcp',
    'Psychiatry (P)': 'specialist',
    'Pediatrics (PD)': 'pcp',
    'Cardiovascular Disease (CD)': 'specialist',
    'Obstetrics & Gynecology (OBG)': 'pcp',
    'Hematology/Oncology (HO)': 'oncology',
    'Dermatology (D)': 'specialist',
    'Medical Oncology (ON)': 'oncology',
    'Endocrinology, Diabetes, Metabolism (END)': 'specialist',
    'Neurology (N)':'specialist',
    'Emergency Medicine (EM)': 'pcp',
    'Anesthesiology (AN)': 'specialist',
    'General Surgery (GS)': 'specialist',
    'General Practice (GP)': 'pcp'
}
personas['specialty_group'] = personas.primary_specialty.apply(lambda x: specialty_map[x] if x in specialty_map else '')
dfs = personas[personas.specialty_group<>''][['specialty_group', 'pharma_pct', 'pubs_pct', 'social_pct']]

In [20]:
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api

import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd

data = []
clusters = []
colors = ['rgb(228,26,28)','rgb(55,126,184)','rgb(77,175,74)']

for i in range(len(dfs['specialty_group'].unique())):
    name = dfs['specialty_group'].unique()[i]
    color = colors[i]
    x = dfs[ dfs['specialty_group'] == name ]['pharma_pct']
    y = dfs[ dfs['specialty_group'] == name ]['pubs_pct']
    z = dfs[ dfs['specialty_group'] == name ]['social_pct']
    
    trace = dict(
        name = name,
        x = x, y = y, z = z,
        type = "scatter3d",    
        mode = 'markers',
        marker = dict( size=3, color=color, line=dict(width=0) ) )
    data.append( trace )

layout = dict(
    width=800,
    height=550,
    autosize=False,
    title='Sessions by Specialty',
    scene=dict(
        xaxis=dict(
            gridcolor='rgb(255, 255, 255)',
            zerolinecolor='rgb(255, 255, 255)',
            showbackground=True,
            backgroundcolor='rgb(230, 230,230)',
            title='pharma'
        ),
        yaxis=dict(
            gridcolor='rgb(255, 255, 255)',
            zerolinecolor='rgb(255, 255, 255)',
            showbackground=True,
            backgroundcolor='rgb(230, 230,230)',
            title='pubs'
        ),
        zaxis=dict(
            gridcolor='rgb(255, 255, 255)',
            zerolinecolor='rgb(255, 255, 255)',
            showbackground=True,
            backgroundcolor='rgb(230, 230,230)',
            title='social'
        ),
        aspectratio = dict( x=1, y=1, z=0.7 ),
        aspectmode = 'manual'        
    ),
)


fig = dict(data=data, layout=layout)

# IPython notebook
py.iplot(fig, filename='pandas-3d-iris', validate=False)

#url = py.plot(fig, filename='pandas-3d-iris', validate=False)

The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow

