In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the 2014 and 2015 datasets; low_memory=False makes pandas read each
# file in one pass instead of in chunks.
data2014, data2015 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("R/2014dataset.csv", "R/2015dataset.csv")
)
# The 2016 file ("Output/2016dataset.csv") doesn't exist yet, so it is not loaded.

In [4]:
# Distinct values of the "STATE" column, in order of first appearance.
# Series.as_matrix() was removed in pandas 1.0; .values returns the same
# array and is available in both old and current pandas releases.
states = data2014["STATE"].drop_duplicates().values
print(states)

['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID' 'IL'
 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC' 'ND' 'NE'
 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD' 'TN' 'TX' 'UT'
 'VA' 'WA' 'WI' 'WV' 'WY']


In [19]:
# Weighted statistics helpers: every observation is weighted by its member months.
# group:  "i" for the individual market or "sg" for the small-group market
# absval: 1 to use the absolute value of a column, 0 to use its signed value

# Functions:
# getmm(group) - returns the name of the member-months column for the group
# xbarybar(dataset, column_name, state, group, absval) - returns weighted avg of a column
# sigmaxsigmay(dataset, column_name, state, group, absval) - returns weighted std of a column
# covxy(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted cov of two columns
# covspecial(dataset, column_name1, yi, ybar, state, group, absval1) - returns weighted cov of one column and one state measure (not defined in this cell)
# r(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r of two columns
# rspecial(dataset, column_name1, yi, ybar, sigmay, state, group, absval1) - returns weighted r of one column and one state measure (not defined in this cell)
# rsquared(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r^2 of two columns

def getmm(group):
    """Return the name of the member-months column for a market group.

    Args:
        group: "i" for the individual market or "sg" for the small-group
            market.

    Returns:
        The matching column name, or an empty string for any other value.
    """
    known_columns = (
        ("i", "7.4Membermonths2HealthInsuranceINDIVIDUALTotalasof3/31/15"),
        ("sg", "7.4Membermonths7HealthInsuranceSMALLGROUPTotalasof3/31/15"),
    )
    for code, column in known_columns:
        if group == code:
            return column
    # Unrecognised group: callers get an empty column name back.
    return ""

def xbarybar(dataset, column_name, state, group, absval):
    """Return the member-month-weighted mean of ``column_name``.

    Args:
        dataset: DataFrame containing ``column_name``, a "STATE" column and
            the member-months column named by ``getmm(group)``.
        column_name: Column to average.
        state: Value of "STATE" to restrict the rows to, or "ALL" to use
            every row.
        group: Market group passed to ``getmm`` to select the weight column.
        absval: 1 to average the absolute values of the column; any other
            value averages the signed values.

    Returns:
        The weighted mean; ``None`` when the member months of the selected
        rows sum to zero; ``0`` when no row has both a value and a weight.
    """
    if state == "ALL":
        temp = dataset
    else:
        # Boolean indexing keeps exactly the rows of the requested state.
        temp = dataset.loc[dataset["STATE"] == state]

    # pd.to_numeric replaces Series.convert_objects, which was removed in
    # pandas 1.0. Cells that cannot be parsed become NaN and are skipped.
    weights = pd.to_numeric(temp[getmm(group)], errors="coerce")
    sum_mm = weights.sum()  # NaN weights are ignored by Series.sum
    if sum_mm == 0:
        return None

    values = temp[column_name]
    # Only rows where both the value and its weight are present contribute.
    usable = (values.notnull() & weights.notnull()).values
    ratio = weights.values[usable] / sum_mm
    observed = np.asarray(values.values[usable], dtype=float)
    if absval == 1:
        observed = np.abs(observed)

    denominator = ratio.sum()
    if denominator == 0:
        return 0
    # Dividing by the summed ratios renormalises the weights over the rows
    # that were actually used.
    return np.sum(observed * ratio) / denominator

def sigmaxsigmay(dataset, column_name, state, group, absval):
    """Return the member-month-weighted standard deviation of ``column_name``.

    Args:
        dataset: DataFrame containing ``column_name``, a "STATE" column and
            the member-months column named by ``getmm(group)``.
        column_name: Column whose spread is measured.
        state: Value of "STATE" to restrict the rows to, or "ALL" to use
            every row.
        group: Market group passed to ``getmm`` to select the weight column.
        absval: 1 to work on the absolute values of the column; any other
            value uses the signed values.

    Returns:
        The weighted standard deviation; NaN when the member months of the
        selected rows sum to zero (previously the string "-1", which broke
        the arithmetic in ``r``); ``0`` when no row has both a value and a
        weight.
    """
    if state == "ALL":
        temp = dataset
    else:
        # Boolean indexing keeps exactly the rows of the requested state.
        temp = dataset.loc[dataset["STATE"] == state]

    # pd.to_numeric replaces Series.convert_objects, which was removed in
    # pandas 1.0. Cells that cannot be parsed become NaN and are skipped.
    weights = pd.to_numeric(temp[getmm(group)], errors="coerce")
    sum_mm = weights.sum()  # NaN weights are ignored by Series.sum
    if sum_mm == 0:
        # No weight at all: the deviation is undefined. NaN propagates
        # through later arithmetic instead of raising a TypeError.
        return float("nan")

    values = temp[column_name]
    # Only rows where both the value and its weight are present contribute.
    usable = (values.notnull() & weights.notnull()).values
    ratio = weights.values[usable] / sum_mm
    denominator = ratio.sum()
    if denominator == 0:
        return 0

    xbar = xbarybar(dataset, column_name, state, group, absval)
    observed = np.asarray(values.values[usable], dtype=float)
    if absval == 1:
        observed = np.abs(observed)
        center = abs(xbar)
    else:
        center = xbar

    numerator = np.sum((observed - center) ** 2 * ratio)
    return np.sqrt(numerator / denominator)

def covxy(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Return the member-month-weighted covariance of two columns.

    Args:
        dataset: DataFrame containing both columns, a "STATE" column and the
            member-months column named by ``getmm(group)``.
        column_name1: First column.
        column_name2: Second column.
        state: Value of "STATE" to restrict the rows to, or "ALL" to use
            every row.
        group: Market group passed to ``getmm`` to select the weight column.
        absval1: 1 to use absolute values of the first column, else signed.
        absval2: 1 to use absolute values of the second column, else signed.

    Returns:
        The weighted covariance; ``-1`` when the member months of the
        selected rows sum to zero; ``0`` when no row has both values and a
        weight.

    Note:
        ``-1`` is also a possible covariance, so the no-data case cannot be
        told apart from the return value alone. Each mean comes from
        ``xbarybar``, which only requires its own column to be present,
        while the products below require both columns to be present.
    """
    if state == "ALL":
        temp = dataset
    else:
        # Boolean indexing keeps exactly the rows of the requested state.
        temp = dataset.loc[dataset["STATE"] == state]

    # pd.to_numeric replaces Series.convert_objects, which was removed in
    # pandas 1.0. Cells that cannot be parsed become NaN and are skipped.
    weights = pd.to_numeric(temp[getmm(group)], errors="coerce")
    sum_mm = weights.sum()  # NaN weights are ignored by Series.sum
    if sum_mm == 0:
        return -1

    xbar = xbarybar(dataset, column_name1, state, group, absval1)
    ybar = xbarybar(dataset, column_name2, state, group, absval2)

    first = temp[column_name1]
    second = temp[column_name2]
    # A row contributes only when both values and the weight are present.
    usable = (first.notnull() & second.notnull() & weights.notnull()).values
    ratio = weights.values[usable] / sum_mm
    denominator = ratio.sum()
    if denominator == 0:
        return 0

    xs = np.asarray(first.values[usable], dtype=float)
    ys = np.asarray(second.values[usable], dtype=float)
    if absval1 == 1:
        xs = np.abs(xs)
    if absval2 == 1:
        ys = np.abs(ys)

    numerator = np.sum((xs - xbar) * (ys - ybar) * ratio)
    return numerator / denominator

def r(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Return the weighted correlation coefficient of two columns.

    The value is ``covxy`` divided by the product of the two
    ``sigmaxsigmay`` results for the same state, group and absval flags.

    Args:
        dataset: DataFrame handed through to ``covxy`` and ``sigmaxsigmay``.
        column_name1: First column.
        column_name2: Second column.
        state: State code, or "ALL" for every row.
        group: Market group ("i" or "sg").
        absval1: 1 to use absolute values of the first column, else signed.
        absval2: 1 to use absolute values of the second column, else signed.

    Returns:
        The correlation, or ``0`` when either standard deviation equals 0.
    """
    covariance = covxy(dataset, column_name1, column_name2, state, group, absval1, absval2)
    std_first = sigmaxsigmay(dataset, column_name1, state, group, absval1)
    std_second = sigmaxsigmay(dataset, column_name2, state, group, absval2)

    # A column with no spread would make the ratio a division by zero.
    if std_first == 0 or std_second == 0:
        return 0
    return covariance / (std_first * std_second)

def rsquared(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Return the square of the weighted correlation ``r`` of two columns.

    All arguments are passed to ``r`` unchanged.
    """
    correlation = r(dataset, column_name1, column_name2, state, group, absval1, absval2)
    return correlation ** 2

In [20]:
def play(dataset):
    """Print per-state weighted correlations for one dataset file.

    Reads the CSV at ``dataset``, derives per-member-month columns for the
    individual market, and prints one CSV-style line per entry of the
    module-level ``states`` with four ``r`` values: premium vs transfers,
    costs vs transfers, costs vs premium, and (premium - costs) vs transfers.

    Args:
        dataset: Path of the CSV file to analyse; also echoed in the header.

    Returns:
        None. The results are written to stdout.
    """
    frame = pd.read_csv(dataset, low_memory=False)
    member_months_column = "7.4Membermonths2HealthInsuranceINDIVIDUALTotalasof3/31/15"

    def per_member_month(source_column):
        # Scale a total column by individual-market member months.
        return frame[source_column] / frame[member_months_column]

    frame["IndPremiumPMM"] = per_member_month(
        "1.1Directpremiumwritten2HealthInsuranceINDIVIDUALTotalasof3/31/15"
    )
    frame["IndCostsPMM"] = per_member_month(
        "2.16Totalincurredclaims2HealthInsuranceINDIVIDUALTotalasof3/31/15"
    )
    frame["IndRTPMM"] = per_member_month(
        "HHS RISK ADJUSTMENT TRANSFER AMOUNT (INDIVIDUAL MARKET, INCLUDING CATASTROPHIC)"
    )
    frame["IndPremiumMinusCostsPMM"] = frame["IndPremiumPMM"] - frame["IndCostsPMM"]
    frame["IndCostsMinusPremiumsPMM"] = frame["IndCostsPMM"] - frame["IndPremiumPMM"]

    # Column pairs, in the same order as the printed header.
    column_pairs = (
        ("IndPremiumPMM", "IndRTPMM"),
        ("IndCostsPMM", "IndRTPMM"),
        ("IndCostsPMM", "IndPremiumPMM"),
        ("IndPremiumMinusCostsPMM", "IndRTPMM"),
    )

    print("Statistics for " + dataset + "\n")
    print("State,PremiumvTransfers,CostvTransfers,PremiumvCosts,ProfitvTransfers")
    for state in states:
        correlations = [
            r(frame, first, second, state, "i", 0, 0)
            for first, second in column_pairs
        ]
        print(",".join([str(state)] + [str(value) for value in correlations]))

In [21]:
# Print the per-state correlation table for each year's dataset.
for dataset_path in ("R/2014dataset.csv", "R/2015dataset.csv"):
    play(dataset_path)

Statistics for R/2014dataset.csv

State,PremiumvTransfers,CostvTransfers,PremiumvCosts,ProfitvTransfers
AK,0.509318600792,0.321670267945,0.93933013798,-0.169046673688
AL,0.546133453455,0.556175505253,0.380714077291,-0.271500818803
AR,-0.633463828589,-0.352009112589,0.789956542213,-0.661041471126
AZ,0.323429040553,0.237610820762,0.592001861328,-0.154570667134
CA,0.43044220963,0.883771319081,0.274407351155,-0.770458764132
CO,0.225654421823,0.55876016671,0.901936836622,-0.751022182157
CT,-0.191444787769,0.00653347185629,0.854177111482,-0.367676414989
DC,0.218425905672,0.0152327611838,-0.287300224696,-0.00511161285349
DE,-0.743201704422,-0.206272698827,0.585838940677,-0.234649599583
FL,0.478991433212,0.652460080036,0.90869073734,-0.706676323752
GA,0.194772482065,0.47581235883,0.632964423169,-0.46257973571
HI,1.0,1.0,1.0,-1.0
IA,0.893613921466,0.631609466434,0.835346042396,-0.48133372104
ID,0.0904917116848,0.493286637835,0.877815142121,-0.699785537833
IL,-0.337686592836,-0.104733501779,0.76