In [186]:
# Select matplotlib's interactive Tk backend for the figures created below.
%matplotlib tk

In [187]:
import pandas 
import matplotlib.pyplot as plt
import datetime as dt
from typing import Set


In [188]:
# Load the processed transactions table; the first CSV column becomes the row index.
df = pandas.read_csv('../../data/processed/all_transactions.csv', index_col=0)

In [189]:
# Keep every row whose 'src' is not 'endocrinologists'; copy so that later
# column assignments act on an independent frame rather than a slice.
df = df.loc[df['src'].ne('endocrinologists')].copy()

In [190]:
# Parse the 'MM/DD/YYYY' strings in 'date_of_payment' into timestamps with a
# single vectorised call; missing dates become NaT. The result has the same
# length and index as df, so no dropna()/re-alignment by index label is needed.
df['payment_ts'] = pandas.to_datetime(df['date_of_payment'], format='%m/%d/%Y')

In [191]:
# Map each paying company's id to its name.
# Rows without an id are ignored; for every remaining id the name on the first
# row carrying that id is used. Done in a single pass over the frame instead of
# re-filtering the whole frame once per id.
ids_to_names = (
    df
    .dropna(subset=['applicable_manufacturer_or_applicable_gpo_making_payment_id'])
    .drop_duplicates(
        subset=['applicable_manufacturer_or_applicable_gpo_making_payment_id'],
        keep='first'
    )
    .set_index('applicable_manufacturer_or_applicable_gpo_making_payment_id')
    ['applicable_manufacturer_or_applicable_gpo_making_payment_name']
    .to_dict()
)

In [192]:
# Earliest payment timestamp for every (company, doctor) pair, i.e. the date
# on which each company *first* paid each doctor. Rows with no physician id
# are left out. Later cells turn this into per-date counts.
doctor_counts = (
    df
    .dropna(subset=['physician_profile_id'])
    .groupby([
        'applicable_manufacturer_or_applicable_gpo_making_payment_id',
        'physician_profile_id',
    ])
    .agg({'payment_ts': 'min'})
)

In [194]:
# Each (company, doctor) row stands for exactly one doctor.
doctor_counts = doctor_counts.assign(physician_count=1)

In [195]:
# Total number of doctors first contacted by each company on each date:
# sum the per-doctor 1s within every (company, first-payment date) group.
doctor_counts = (
    doctor_counts
    .groupby([
        'applicable_manufacturer_or_applicable_gpo_making_payment_id',
        'payment_ts',
    ])
    .sum()
)

In [207]:
# Baseline timestamp for the curves: one day before 1 January 2018.
start_date = pandas.Timestamp(dt.datetime(2018, 1, 1) - dt.timedelta(days=1))

In [197]:
# Give every company a zero row dated `start_date` so that each cumulative
# curve starts from 0.
# get_level_values(0) returns one label per row, so take the distinct company
# ids first; otherwise the same assignment is repeated for every
# (company, date) row of the frame.
payment_ids = doctor_counts.index.get_level_values(0).unique()
for i in payment_ids:
    doctor_counts.loc[(i, start_date), :] = 0

In [198]:
# Order rows by (company, date) so the appended baseline rows come first
# within each company.
doctor_counts = doctor_counts.sort_index()

In [199]:
# Running total, per company and in date order, of doctors contacted so far.
# Using the built-in grouped cumsum on the single 'physician_count' column
# returns a Series with the same index as doctor_counts, so the assignment
# aligns row by row. It does not depend on how groupby.apply labels its result
# (newer pandas versions prepend the group key to the index), and the cell can
# be re-run after 'physician_count_cumulative' already exists.
doctor_counts['physician_count_cumulative'] = (
    doctor_counts
    .sort_index()
    .groupby(level='applicable_manufacturer_or_applicable_gpo_making_payment_id')
    ['physician_count']
    .cumsum()
)


In [200]:
# Figure 1: for every company, how many doctors it has not yet paid, over time.
fig1, ax1 = plt.subplots()
fig1.set_size_inches(12, 8)

# Size of the starting pool: the number of distinct physician ids plus one
# for every row that has no physician id.
initial_doctors = df['physician_profile_id'].nunique() \
        + df['physician_profile_id'].isna().sum()

# Look the palette up once. The colour index is wrapped so that companies
# beyond the palette's size do not all end up with the same (last) colour.
palette = plt.get_cmap('tab20')

for i, (company_id, company_rows) in \
    enumerate(doctor_counts.groupby('applicable_manufacturer_or_applicable_gpo_making_payment_id')):

    company_name = ids_to_names[company_id]

    # Select this company's rows, leaving a frame indexed by date only.
    company_curve: pandas.DataFrame = company_rows.loc[company_id]
    ax1.step(
        company_curve.index,
        initial_doctors - company_curve['physician_count_cumulative'],
        # 'post': a value applies from its own date until the next one, so a
        # drop is drawn on the day it happens. The default ('pre') would draw
        # each value over the interval *before* its date.
        where='post',
        label=company_name,
        color=palette(i % palette.N)
    )

ax1.set_ylim([0, None])
ax1.set_xlabel('Date')
ax1.set_ylabel('Doctors not yet paid')
ax1.set_title('Unpaid doctors over time')
ax1.legend()
#fig1.savefig("images/doctor_count_curve.png", dpi=1000)

<matplotlib.legend.Legend at 0x1283fb400>

In [201]:
# Working copy of df with a placeholder 'payment_count' column; the column
# exists only so that its rows can be counted per group in the next step.
_df_tmp = df.assign(payment_count=0)

In [224]:
# Per (company, payment date): number of payment rows and total amount paid.
# Rows without a physician id are excluded.
payment_count_and_amt_grouped = (
    _df_tmp
    .rename(columns={'total_amount_of_payment_usdollars': 'payment_amount'})
    .dropna(subset=['physician_profile_id'])
    .groupby([
        'applicable_manufacturer_or_applicable_gpo_making_payment_id',
        'payment_ts',
    ])
    .agg({'payment_count': 'count', 'payment_amount': 'sum'})
)

In [225]:
# Give every company a zero row dated `start_date` (0 payments, 0 dollars) so
# that each cumulative curve starts from 0.
# get_level_values(0) returns one label per row, so take the distinct company
# ids first; otherwise the same assignment is repeated for every
# (company, date) row of the frame.
payment_ids = payment_count_and_amt_grouped.index.get_level_values(0).unique()
for i in payment_ids:
    payment_count_and_amt_grouped.loc[(i, start_date), :] = [0, 0]

In [226]:
# Order rows by (company, date) so the appended baseline rows come first
# within each company.
payment_count_and_amt_grouped = payment_count_and_amt_grouped.sort_index()

In [228]:

# Turn the per-date figures into running totals within each company.
# The built-in grouped cumsum keeps the frame's existing (company, date) index
# and row order, so the plotting cells can keep grouping by the company level.
# It does not depend on how groupby.apply labels its result (newer pandas
# versions prepend the group key to the index).
# NOTE: this overwrites its own input, so running the cell twice accumulates twice.
payment_count_and_amt_grouped = (
    payment_count_and_amt_grouped
    .groupby(level='applicable_manufacturer_or_applicable_gpo_making_payment_id')
    .cumsum()
)

In [231]:
# Figure 2: cumulative number of payments made by each company over time.
fig2, ax2 = plt.subplots()
fig2.set_size_inches(12, 8)

# Look the palette up once. The colour index is wrapped so that companies
# beyond the palette's size do not all end up with the same (last) colour.
palette = plt.get_cmap('tab20')

for i, (company_id, company_rows) in \
    enumerate(payment_count_and_amt_grouped.groupby('applicable_manufacturer_or_applicable_gpo_making_payment_id')):

    company_name = ids_to_names[company_id]

    # Select this company's rows, leaving a frame indexed by date only.
    company_curve: pandas.DataFrame = company_rows.loc[company_id]
    ax2.step(
        company_curve.index,
        company_curve['payment_count'],
        # 'post': a running total applies from its own date until the next
        # one. The default ('pre') would draw each total over the interval
        # *before* its date, i.e. one step too early.
        where='post',
        label=company_name,
        color=palette(i % palette.N)
    )

ax2.legend()
ax2.set_xlabel('Date')
ax2.set_ylabel('Cumulative number of payments')
ax2.set_title('Cumulative no. of payments over time')

#fig2.savefig("images/payment_count_curve.png", dpi=1000)

Text(0.5, 1.0, 'Cumulative no. of payments over time')

In [232]:
# Figure 3: cumulative amount (USD) paid out by each company over time.
fig3, ax3 = plt.subplots()
fig3.set_size_inches(12, 8)

# Look the palette up once. The colour index is wrapped so that companies
# beyond the palette's size do not all end up with the same (last) colour.
palette = plt.get_cmap('tab20')

for i, (company_id, company_rows) in \
    enumerate(payment_count_and_amt_grouped.groupby('applicable_manufacturer_or_applicable_gpo_making_payment_id')):

    company_name = ids_to_names[company_id]

    # Select this company's rows, leaving a frame indexed by date only.
    company_curve: pandas.DataFrame = company_rows.loc[company_id]
    ax3.step(
        company_curve.index,
        company_curve['payment_amount'],
        # 'post': a running total applies from its own date until the next
        # one. The default ('pre') would draw each total over the interval
        # *before* its date, i.e. one step too early.
        where='post',
        label=company_name,
        color=palette(i % palette.N)
    )

ax3.legend()
ax3.set_xlabel('Date')
ax3.set_ylabel('Cumulative payout (USD)')
ax3.set_title('Cumulative payout (USD) over time')
# Show full dollar amounts on the y axis instead of scientific notation.
ax3.ticklabel_format(axis='y', style='plain')

#fig3.savefig("images/dollars_curve.png", dpi=1000)