In [1]:
%matplotlib tk

In [2]:
import geopandas as gpd
import pandas
import us
from numpy import nan
import matplotlib.pyplot as plt




In [3]:
# 2020 Census apportionment table 02, read straight from census.gov.
# header=3: the column names are taken from the fourth row of the sheet
# (rows are counted from zero).
pop = pandas.read_excel(
    io='https://www2.census.gov/programs-surveys/decennial/2020/data/apportionment/apportionment-2020-table02.xlsx',
    header=3,
)

In [4]:
# Translate each AREA label into a FIPS code via the `us` package. When the
# lookup yields an object without a `fips` attribute (e.g. None), the
# sentinel -1 is stored instead.
pop['state_code'] = pop['AREA'].map(
    lambda area: getattr(us.states.lookup(area), 'fips', -1)
)

In [5]:
# Tidy the population table:
#   * discard rows whose AREA could not be resolved (state_code == -1),
#   * remove the spreadsheet's filler column when it is present,
#   * shorten the column names used downstream,
#   * renumber the rows from zero.
pop = (
    pop
    .loc[lambda frame: frame['state_code'] != -1]
    .drop(columns='This cell is intentionally blank.', errors='ignore')
    .rename(columns={'AREA': 'state', 'RESIDENT POPULATION (APRIL 1, 2020)': 'pop'})
    .reset_index(drop=True)
)

In [6]:
# State boundary shapes: Census cartographic boundary file (2018, 1:500k),
# downloaded as a zipped shapefile.
states = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip')

In [7]:
# Reproject the shapes to EPSG:2163 for plotting.
# NOTE(review): EPSG:2163 is presumably the US National Atlas Equal Area
# projection — confirm this is the intended map projection.
states = states.to_crs(epsg=2163)

In [8]:
def crop(
    gdf: gpd.GeoDataFrame,
    lon_min: float = -130,
    lon_max: float = -68,
    lat_max: float = 55,
) -> gpd.GeoDataFrame:
    """Keep only the features that fall inside a longitude/latitude window.

    The frame is reprojected to EPSG:4269 (geographic lon/lat) so the window
    can be expressed in degrees, filtered with the ``.cx`` coordinate
    indexer, and then projected back to the CRS it arrived in. ``.cx``
    selects whole features that intersect the window; the geometries
    themselves are not clipped.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame to filter. It must have a CRS set, since it is reprojected.
    lon_min, lon_max : float
        Western and eastern edges of the window, in degrees of longitude.
    lat_max : float
        Northern edge of the window, in degrees of latitude. There is no
        southern limit.

    Returns
    -------
    geopandas.GeoDataFrame
        The selected rows, in the same CRS as ``gdf``.

    Notes
    -----
    The defaults (-130..-68 longitude, up to 55 latitude) crop to roughly
    the 48 contiguous states.
    """
    original_crs = gdf.crs
    in_degrees = gdf.to_crs(4269)
    return in_degrees.cx[lon_min:lon_max, :lat_max].to_crs(original_crs)

In [9]:
# Restrict the map to the window defined in crop() (roughly the 48 states).
states = crop(states)

In [10]:
# Attach the population figures to the state shapes by FIPS code. This is
# an inner join (merge's default), so a shape with no matching population
# row is dropped.
states = states.merge(
    pop,
    how='inner',
    left_on='STATEFP',
    right_on='state_code',
)

In [11]:
# Payment records for the sampled doctors.
# NOTE(review): this is an absolute path on one machine; the notebook will
# not run elsewhere until it is made relative or configurable.
paid_doctors = pandas.read_csv(
    filepath_or_buffer='/Users/eab06/Desktop/WJB/PythonProjects/HT_Data/data/processed/aggregated.csv',
)

In [12]:
# FIPS code for each record's state; -1 marks values the `us` lookup did
# not resolve to an object with a `fips` attribute.
paid_doctors['state_code'] = paid_doctors['state'].map(
    lambda state_name: getattr(us.states.lookup(state_name), 'fips', -1)
)

In [13]:
# Leave out every record whose `src` is 'endocrinologists'.
paid_doctors = paid_doctors.loc[paid_doctors['src'].ne('endocrinologists')]

In [14]:
# Sum of total_money per physician. The physician id index is thrown away,
# so the result carries a plain 0..n-1 index in group order; the later
# `dollars` assignment relies on that positional index.
_paid_doctors_money = (
    paid_doctors
    .groupby(by='physician_profile_id')['total_money']
    .sum()
    .reset_index(drop=True)
)
_paid_doctors_money

0        36.81
1       164.98
2        25.86
3      9245.01
4        56.64
        ...   
548      28.12
549     209.79
550     124.65
551      39.45
552      25.86
Name: total_money, Length: 553, dtype: float64

In [15]:

# Collapse to one row per physician: for each column, first() keeps the
# first entry found in that physician's group. The id then becomes an
# ordinary column again and the rows are numbered from zero.
paid_doctors = (
    paid_doctors
    .groupby(by='physician_profile_id')
    .first()
    .reset_index()
)

In [16]:
# Attach each physician's summed payments. The assignment aligns on the
# 0..n-1 row index: both paid_doctors and _paid_doctors_money were built by
# grouping the same frame on physician_profile_id and then resetting the
# index, so row i in each is expected to describe the same physician.
# NOTE(review): this pairing is positional only; re-running the two
# preceding cells out of order would silently misattribute the totals.
paid_doctors['dollars'] = _paid_doctors_money

In [17]:
# Per-state payment total and per-state number of paid doctors, joined onto
# the state shapes by FIPS code. Left joins keep states with no paid
# doctors; their two new columns are NaN.
states = (
    states
    .merge(
        paid_doctors.groupby('state_code')['dollars'].sum().rename('total_payment'),
        how='left',
        left_on='STATEFP',
        right_index=True,
    )
    .merge(
        paid_doctors.groupby('state_code')['src'].count().rename('count'),
        how='left',
        left_on='STATEFP',
        right_index=True,
    )
)

In [18]:
# Normalise by population (per million residents). States that had no paid
# doctors are NaN after the left joins and are set to 0 here.
states['total_payment_per_million'] = states['total_payment'].div(states['pop']).mul(1E6).fillna(0)
states['paid_doctors_per_million'] = states['count'].div(states['pop']).mul(1E6).fillna(0)

In [19]:
# Full doctor sample (duplicates included); the first CSV column is used as
# the row index.
# NOTE(review): this is an absolute path on one machine; the notebook will
# not run elsewhere until it is made relative or configurable.
all_sample = pandas.read_csv(
    filepath_or_buffer='/Users/eab06/Desktop/WJB/PythonProjects/HT_Data/data/processed/all_with_duplicates.csv',
    index_col=0,
)

In [20]:
# Same exclusion as for the payment records: drop rows whose `src` is
# 'endocrinologists'.
all_sample = all_sample.loc[all_sample['src'].ne('endocrinologists')]

In [21]:
# Number of sampled doctors per state. A doctor is identified by
# (first_name, last_name, state); repeats of that triple are counted once.
# The result is a Series named 'count', indexed by the `state` value, and
# holds the number of non-null `src` entries in each state.
all_sample = (
    all_sample
    .drop_duplicates(subset=['first_name', 'last_name', 'state'])
    .reset_index(drop=True)
    .groupby('state')['src']
    .count()
    .rename('count')
)

In [22]:
# Re-key the per-state counts by FIPS code so they can be joined on
# STATEFP; labels that do not resolve to an object with `fips` become -1.
all_sample.index = all_sample.index.map(
    lambda state_label: getattr(us.states.lookup(state_label), 'fips', -1)
)

In [23]:
# Join the sample-wide doctor counts. `states` already has a 'count' column
# (paid doctors), so the incoming Series gets the '_all_doctors' suffix and
# lands as 'count_all_doctors'; the existing column keeps its name.
states = states.merge(
    all_sample,
    how='left',
    left_on='STATEFP',
    right_index=True,
    suffixes=(None, '_all_doctors'),
)

In [24]:
# Sampled doctors per million residents. There is no fillna here, so states
# without sampled doctors stay NaN.
states['total_doctors_per_million'] = states['count_all_doctors'].div(states['pop']).mul(1E6)


In [25]:
# Doctors in the sample who do not appear among the paid doctors, per
# million residents (total rate minus paid rate).
states['unpaid_doctors_per_million'] = states['total_doctors_per_million'].sub(
    states['paid_doctors_per_million']
)

In [26]:
# Largest value across the three per-million columns, so the doctor-count
# maps can share one colour scale. pandas.concat replaces the chained
# Series.append calls: Series.append was removed in pandas 2.0 and raises
# AttributeError there.
vmax = pandas.concat([
    states['paid_doctors_per_million'],
    states['total_doctors_per_million'],
    states['unpaid_doctors_per_million'],
]).max()
# The computed maximum is currently not used: the colour scale is pinned to
# a fixed upper bound instead.
vmax = 5 # override

In [27]:
# States with no sampled doctors (count missing or zero) carry no usable
# rate, so all four derived columns are set to NaN for them. The plots
# below draw NaN states with their `missing_kwds` colour.
states.loc[
    states['count_all_doctors'].isna() | states['count_all_doctors'].eq(0),
    [
        'total_doctors_per_million',
        'unpaid_doctors_per_million',
        'paid_doctors_per_million',
        'total_payment_per_million',
    ],
] = nan

In [28]:
# Map 1: paid doctors per million residents. Colour scale runs from 0 to
# `vmax`; states whose value is NaN are filled gray.
fig1, ax1 = plt.subplots()
ax1.set_title('Horizon-influenced doctors in sample per million by state')
ax1.get_xaxis().set_visible(False)
ax1.get_yaxis().set_visible(False)
states.plot(
    ax=ax1,
    column='paid_doctors_per_million',
    cmap='coolwarm',
    vmin=0,
    vmax=vmax,
    legend=True,
    ec='black',
    lw=0.1,
    missing_kwds={'color': 'gray'},
)
fig1.savefig("images/corrupt_doctors_by_state.png", dpi=1000)


In [29]:
# Map 2: total payments per million residents. Only the lower bound of the
# colour scale is fixed (vmax=None leaves the upper bound to the plot
# call); states whose value is NaN are filled gray.
fig2, ax2 = plt.subplots()
ax2.set_title('Total Horizon payout for sampled doctors per million by state, 2019-2020 (USD)')
ax2.get_xaxis().set_visible(False)
ax2.get_yaxis().set_visible(False)
states.plot(
    ax=ax2,
    column='total_payment_per_million',
    cmap='Greens',
    vmin=0,
    vmax=None,
    legend=True,
    ec='black',
    lw=0.1,
    missing_kwds={'color': 'gray'},
)
fig2.savefig("images/dollars_by_state.png", dpi=1000)

In [30]:
# Map 3: all sampled doctors per million residents, on the same 0..`vmax`
# colour scale as the paid-doctor map; NaN states are filled gray.
fig3, ax3 = plt.subplots()
ax3.set_title('Total sampled doctors per million by state')
ax3.get_xaxis().set_visible(False)
ax3.get_yaxis().set_visible(False)
states.plot(
    ax=ax3,
    column='total_doctors_per_million',
    cmap='coolwarm',
    vmin=0,
    vmax=vmax,
    legend=True,
    ec='black',
    lw=0.1,
    missing_kwds={'color': 'gray'},
)

fig3.savefig("images/doctors_by_state.png", dpi=1000)

In [31]:
# Map 4: sampled doctors without recorded payments per million residents,
# on the same 0..`vmax` colour scale; NaN states are filled gray.
fig4, ax4 = plt.subplots()
ax4.set_title('Total sampled doctors not taking money per million by state')
ax4.get_xaxis().set_visible(False)
ax4.get_yaxis().set_visible(False)
states.plot(
    ax=ax4,
    column='unpaid_doctors_per_million',
    cmap='coolwarm',
    vmin=0,
    vmax=vmax,
    legend=True,
    ec='black',
    lw=0.1,
    missing_kwds={'color': 'gray'},
)

fig4.savefig("images/not_corrupt_doctors_by_state.png", dpi=1000)

In [32]:
# Side-by-side check of the raw counts behind the maps: paid doctors
# ('count') versus all sampled doctors ('count_all_doctors') per state.
states.loc[:, ['count', 'state', 'count_all_doctors']]

Unnamed: 0,count,state,count_all_doctors
0,1.0,Mississippi,5.0
1,14.0,North Carolina,29.0
2,2.0,Oklahoma,6.0
3,6.0,Virginia,10.0
4,4.0,West Virginia,8.0
5,3.0,Louisiana,11.0
6,23.0,Michigan,38.0
7,11.0,Massachusetts,21.0
8,2.0,Idaho,5.0
9,62.0,Florida,102.0


In [33]:
# Overall share of sampled doctors that appear among the paid doctors,
# summed over the states left on the map (NaN counts are skipped by sum()).
states['count'].sum() / states['count_all_doctors'].sum()

0.5228136882129277