In [24]:
# Download and extract the dataset from CMS

import requests # to download the dataset
import zipfile # to extract from archive
import shutil # to write the dataset to file
import os # rename file to something more type-able

url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Information-on-Prescription-Drugs/Downloads/Part_D_All_Drugs_2015.zip'

# open() does not create missing parent directories, so make sure the
# output directory exists before anything is written into it.
os.makedirs('data', exist_ok=True)

# Stream the archive to disk.
# - raise_for_status() stops here on an HTTP error instead of saving an
#   error page as dataset.zip and failing later with a confusing BadZipFile.
# - iter_content() yields the body with any transfer compression already
#   decoded, which a plain copy of response.raw would not do.
# - the timeout keeps a stalled connection from hanging the kernel forever.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open('data/dataset.zip', 'wb') as ds_zipout:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            ds_zipout.write(chunk)

# Extract the first archive member into data/. The archive is opened in a
# `with` block so the file handle is released, and it is deliberately not
# bound to the name `zip`, which would shadow the builtin for every later cell.
with zipfile.ZipFile('data/dataset.zip', 'r') as ds_archive:
    ds_filename = ds_archive.namelist()[0]
    ds_path = ds_archive.extract(ds_filename, path="data/")

# Display where the extracted file landed.
ds_path

'data/Medicare_Drug_Spending_PartD_All_Drugs_YTD_2015_12_06_2016.xlsx'

In [25]:
# Read in the raw data file and focus only on the sheet with the data in it
from openpyxl import load_workbook
# The archive member was extracted under data/, so prefix its name with that
# directory to get the workbook's location on disk.
workbook_path = "data/" + ds_filename
workbook = load_workbook(workbook_path)

# Only the worksheet named 'Data' is used from here on.
data_sheet = workbook['Data']

FileNotFoundError: [Errno 2] No such file or directory: 'Medicare_Drug_Spending_PartD_All_Drugs_YTD_2015_12_06_2016.xlsx'

In [None]:
# Extract XLSX data into a pandas DataFrame
import pandas as pd
import numpy as np
# Load every worksheet row into a frame first; the row at position 3 carries
# the column headers, and it plus the rows before it are not data.
HEADER_ROW = 3
sheet_rows = pd.DataFrame(data_sheet.values)
header_labels = list(sheet_rows.iloc[HEADER_ROW].values)

# Keep only the rows below the header and label the columns from the header.
df = sheet_rows.drop(sheet_rows.index[0:HEADER_ROW + 1])
df.columns = header_labels

# Renumber the remaining rows starting from 1.
df.index = np.arange(1, len(df) + 1)

df

In [None]:
# Keep just the leading drug-name columns (needed again later) and give
# them short, explicit labels.
drugname_labels = ['drugname_brand', 'drugname_generic']
df_drugnames = df.iloc[:, :len(drugname_labels)]
df_drugnames.columns = drugname_labels

In [None]:
# Serialize drug names to feather file for use in both Python and R
import feather
# Write the drug-name frame to a feather file under data/.
drugnames_path = 'data/drugnames.feather'
feather.write_dataframe(df_drugnames, drugnames_path)

In [None]:
# Separate column groups by year.
# 'start' and 'end' are positional column offsets into df ('end' is
# exclusive, as in a slice). The 2015 range is one column wider than the
# other years.
cols_by_year = [
    { 'year': 2011, 'start': 2, 'end': 12 },
    { 'year': 2012, 'start': 12, 'end': 22 },
    { 'year': 2013, 'start': 22, 'end': 32 },
    { 'year': 2014, 'start': 32, 'end': 42 },
    { 'year': 2015, 'start': 42, 'end': 53 },
]

# Map each year to its own frame. The .copy() matters: a bare iloc slice may
# share data with df, and the per-year frames are relabelled and assigned into
# later on, which is what makes pandas emit SettingWithCopyWarning. Independent
# copies keep those later edits from ever touching (or warning about) df.
df_years = {
    cols['year']: df.iloc[:, cols['start']:cols['end']].copy()
    for cols in cols_by_year
}

In [None]:
# Remove 2015's extra column for "Annual Change in Average Cost Per Unit" (we can calculate it, anyhow).
#
# Trim by position to the width of the first year's column range rather than
# dropping "the last column" by label:
# - re-running this cell is harmless (dropping the last column each time would
#   eat one more real column per run);
# - a label-based drop would remove every column sharing that label.
year_width = cols_by_year[0]['end'] - cols_by_year[0]['start']
df_years[2015] = df_years[2015].iloc[:, :year_width]

In [None]:
# Year-independent, easy-to-type column labels, listed in the positional
# order of each year's columns.
generic_columns = [
    "claim_count",
    "total_spending",
    "user_count",
    "total_spending_per_user",
    "unit_count",
    "unit_cost_wavg",
    "user_count_non_lowincome",
    "out_of_pocket_avg_non_lowincome",
    "user_count_lowincome",
    "out_of_pocket_avg_lowincome",
]

# Apply the same labels to every year's frame.
for year_frame in df_years.values():
    year_frame.columns = generic_columns

In [None]:
# Cast all column data to appropriate numeric types.
#
# Each year's frame is rebuilt with DataFrame.apply(pd.to_numeric) and rebound
# in the dict, instead of assigning converted columns back through
# `.loc[:, col] = ...`. Reasons:
# - recent pandas versions try to write a `.loc[:, col]` assignment into the
#   existing column in place, which can leave an object column as object and
#   silently defeat the cast; building a new frame always yields the new dtypes;
# - no assignment into a possibly-sliced frame happens, so there is no
#   SettingWithCopyWarning to silence and the global
#   pd.options.mode.chained_assignment setting is left alone (previously it
#   stayed disabled if a conversion raised part-way through).
for year in df_years:
    df_years[year] = df_years[year].apply(pd.to_numeric)

In [None]:
# Write one feather file per year (data/spending-<year>.feather) so the
# data can be used from both Python and R.
for year, year_frame in df_years.items():
    spending_path = 'data/spending-{}.feather'.format(year)
    feather.write_dataframe(year_frame, spending_path)