In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the prepared per-year datasets from the Output/ directory.
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
data2014 = pd.read_csv("Output/2014dataset.csv",low_memory=False)
data2015 = pd.read_csv("Output/2015dataset.csv",low_memory=False)
# The 2016 dataset does not exist yet; enable this line once it has been generated.
# data2016 = pd.read_csv("Output/2016dataset.csv",low_memory=False)

In [3]:
# Unique state codes in order of first appearance, as a NumPy array.
# `.values` replaces `Series.as_matrix()`, which pandas deprecated (the
# FutureWarning is silenced by the filter in the first cell) and later removed.
states = data2014["STATE"].drop_duplicates().values
print(states)

['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID' 'IL'
 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC' 'ND' 'NE'
 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD' 'TN' 'TX' 'UT'
 'VA' 'WA' 'WI' 'WV' 'WY']


In [11]:
# weighted rigorous statistics (every row is weighted by its member months)
# group can be "i" for individual or "sg" for small group
# state can be "ALL" to use every row, or a value of the STATE column (e.g. "CA")
# absval can be 1 (use the absolute value of the column's entries) or 0 (use the entries as they are)

# functions:
# getmm(group) - returns the name of the member months column for the group
# xbarybar(dataset, column_name, state, group, absval) - returns weighted avg of a column
# sigmaxsigmay(dataset, column_name, state, group, absval) - returns weighted std of a column
# covxy(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted cov of two columns
# covspecial(dataset, column_name1, yi, ybar, state, group, absval1) - returns weighted cov of one column and one state measure (listed here but not defined in this cell)
# r(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r of two columns
# rspecial(dataset, column_name1, yi, ybar, sigmay, state, group, absval1) - returns weighted r of one column and one state measure (listed here but not defined in this cell)
# rsquared(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r^2 of two columns

def getmm(group):
    """Return the name of the member-months column for a market group.

    ``group`` is "i" for the individual market or "sg" for the small-group
    market. Any other value yields the empty string.
    """
    individual_column = "7.4Membermonths2HealthInsuranceINDIVIDUALTotalasof3/31/15"
    small_group_column = "7.4Membermonths7HealthInsuranceSMALLGROUPTotalasof3/31/15"
    if group == "i":
        return individual_column
    return small_group_column if group == "sg" else ""

def xbarybar(dataset, column_name, state, group, absval):
    """Return the member-month-weighted mean of one column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``column_name``, the member-months column named by
        ``getmm(group)`` and, unless ``state`` is "ALL", a "STATE" column.
    column_name : str
        Column to average.
    state : str
        "ALL" to use every row, otherwise only rows whose "STATE" equals it.
    group : str
        Market group passed to ``getmm`` ("i" or "sg").
    absval : int
        1 to average the absolute values of the column; any other value
        averages the entries as they are.

    Returns
    -------
    float or None
        sum(w * x) / sum(w) over the rows where both the value and the weight
        are present. None when the member months sum to zero or when no row
        has both a value and a weight.

    Side effects
    ------------
    Prints "<column_name> <mean>" whenever a mean is returned.
    """
    if state == "ALL":
        temp = dataset
    else:
        # Boolean indexing selects the same rows as where(...).dropna(how='all')
        # without first blanking every other row to NaN (which upcasts dtypes).
        temp = dataset[dataset["STATE"] == state]
    mm = getmm(group)
    # pd.to_numeric(errors="coerce") replaces Series.convert_objects(
    # convert_numeric=True), which has been removed from pandas; entries that
    # cannot be parsed as numbers become NaN and are treated as missing.
    weights = pd.to_numeric(temp[mm], errors="coerce")
    if weights.dropna().values.sum() == 0:
        return None
    values = temp[column_name].astype(float)
    if absval == 1:
        values = values.abs()
    usable = values.notnull() & weights.notnull()
    weight_total = weights[usable].values.sum()
    if weight_total == 0:
        # No row carries both a value and a weight: the mean is undefined
        # (the row-by-row version divided zero by zero here).
        return None
    mean = (values[usable] * weights[usable]).values.sum() / weight_total
    print(str(column_name) + " " + str(mean))
    return mean

def sigmaxsigmay(dataset, column_name, state, group, absval):
    """Return the member-month-weighted standard deviation of one column.

    Computes sqrt(sum(w * (x - xbar)**2) / sum(w)) over the rows where both
    the value and the member-months weight are present, where ``xbar`` is the
    weighted mean returned by ``xbarybar`` for the same arguments.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``column_name``, the member-months column named by
        ``getmm(group)`` and, unless ``state`` is "ALL", a "STATE" column.
    column_name : str
        Column to measure.
    state : str
        "ALL" to use every row, otherwise only rows whose "STATE" equals it.
    group : str
        Market group passed to ``getmm`` ("i" or "sg").
    absval : int
        1 to use the absolute values of the column; any other value uses the
        entries as they are.

    Returns
    -------
    float
        The weighted standard deviation, or NaN when the member months sum to
        zero or no row has both a value and a weight. (This used to be the
        string "-1", which broke any arithmetic done on the result.)
    """
    if state == "ALL":
        temp = dataset
    else:
        # Same rows as where(...).dropna(how='all'), without the NaN round trip.
        temp = dataset[dataset["STATE"] == state]
    mm = getmm(group)
    # Replaces the removed Series.convert_objects(convert_numeric=True):
    # unparseable weights become NaN and are treated as missing.
    weights = pd.to_numeric(temp[mm], errors="coerce")
    if weights.dropna().values.sum() == 0:
        return float("nan")
    values = temp[column_name].astype(float)
    if absval == 1:
        values = values.abs()
    usable = values.notnull() & weights.notnull()
    weight_total = weights[usable].values.sum()
    if weight_total == 0:
        # Checked before calling xbarybar so an empty selection never reaches
        # a zero-by-zero division.
        return float("nan")
    xbar = xbarybar(dataset, column_name, state, group, absval)
    # With absval the deviations are measured from |xbar|, as before.
    center = abs(xbar) if absval == 1 else xbar
    squared_deviation = (values[usable] - center) ** 2
    variance = (squared_deviation * weights[usable]).values.sum() / weight_total
    return np.sqrt(variance)

def covxy(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Return the member-month-weighted covariance of two columns.

    Computes sum(w * (x - xbar) * (y - ybar)) / sum(w) over the rows where
    both values and the member-months weight are present.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain both columns, the member-months column named by
        ``getmm(group)`` and, unless ``state`` is "ALL", a "STATE" column.
    column_name1, column_name2 : str
        The two columns to compare.
    state : str
        "ALL" to use every row, otherwise only rows whose "STATE" equals it.
    group : str
        Market group passed to ``getmm`` ("i" or "sg").
    absval1, absval2 : int
        1 to use the absolute values of the respective column; any other
        value uses the entries as they are.

    Returns
    -------
    float
        The weighted covariance, or NaN when the member months sum to zero or
        no row has both values and a weight. (The no-data result used to be
        -1, which cannot be told apart from a genuine covariance of -1.)

    NOTE(review): ``xbar`` and ``ybar`` come from ``xbarybar``, which averages
    each column over the rows where *that* column is present, while the sum
    here runs over rows where *both* columns are present. When the columns
    have different missing rows the means and the sum use different row sets;
    kept as-is so existing results do not change.
    """
    if state == "ALL":
        temp = dataset
    else:
        # Same rows as where(...).dropna(how='all'), without the NaN round trip.
        temp = dataset[dataset["STATE"] == state]
    mm = getmm(group)
    # Replaces the removed Series.convert_objects(convert_numeric=True):
    # unparseable weights become NaN and are treated as missing.
    weights = pd.to_numeric(temp[mm], errors="coerce")
    if weights.dropna().values.sum() == 0:
        return float("nan")
    xs = temp[column_name1].astype(float)
    ys = temp[column_name2].astype(float)
    if absval1 == 1:
        xs = xs.abs()
    if absval2 == 1:
        ys = ys.abs()
    usable = xs.notnull() & ys.notnull() & weights.notnull()
    weight_total = weights[usable].values.sum()
    if weight_total == 0:
        # Checked before calling xbarybar so an empty selection never reaches
        # a zero-by-zero division.
        return float("nan")
    xbar = xbarybar(dataset, column_name1, state, group, absval1)
    ybar = xbarybar(dataset, column_name2, state, group, absval2)
    weighted_products = (xs[usable] - xbar) * (ys[usable] - ybar) * weights[usable]
    return weighted_products.values.sum() / weight_total

def r(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Print and return the member-month-weighted correlation of two columns.

    The coefficient is covxy(...) / (sigmaxsigmay(col1) * sigmaxsigmay(col2)).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data passed through to ``covxy`` and ``sigmaxsigmay``.
    column_name1, column_name2 : str
        The two columns to correlate.
    state : str
        "ALL" or a value of the "STATE" column.
    group : str
        Market group ("i" or "sg").
    absval1, absval2 : int
        1 to use absolute values of the respective column, otherwise 0.

    Returns
    -------
    float
        The correlation coefficient, or NaN when it is undefined: either
        standard deviation is zero (constant column), NaN, or a "no data"
        sentinel (None, a string such as "-1", or a negative number).

    Side effects
    ------------
    Prints a header line, the covariance, both standard deviations and the
    resulting coefficient.
    """
    def is_usable(sigma):
        # A usable standard deviation is a strictly positive number. The
        # string/None checks come first because comparing them with 0 raises.
        if sigma is None or isinstance(sigma, str):
            return False
        return bool(sigma > 0)

    print(str(column_name1) + " vs " + str(column_name2) + " in " + str(state))
    cov = covxy(dataset, column_name1, column_name2, state, group, absval1, absval2)
    sigx = sigmaxsigmay(dataset, column_name1, state, group, absval1)
    sigy = sigmaxsigmay(dataset, column_name2, state, group, absval2)
    print("Covariance: " + str(cov))
    print("Std Column 1: " + str(sigx))
    print("Std Column 2: " + str(sigy))
    if is_usable(sigx) and is_usable(sigy):
        corr = cov / (sigx * sigy)
    else:
        # Avoids a TypeError on sentinel strings and a division by zero for
        # constant columns; also stops a sentinel being reported as r = -1.
        corr = float("nan")
    print("r: " + str(corr) + "\n")
    return corr

def rsquared(dataset, column_name1, column_name2, state, group, absval1, absval2):
    """Return the square of the weighted correlation computed by ``r``.

    All arguments are forwarded unchanged to ``r``, so the same lines are
    printed as for a direct call to ``r``.
    """
    correlation = r(
        dataset, column_name1, column_name2, state, group, absval1, absval2
    )
    return correlation ** 2

In [14]:
def play(dataset):
    """Read a dataset CSV, derive per-member-month columns and print correlations.

    ``dataset`` is the path of the CSV file. Premiums, incurred claims and the
    HHS risk adjustment transfer amount of the individual market are each
    divided by the individual-market member months; the two differences
    between the premium and cost columns are added as well. A fixed list of
    column pairs is then passed to ``r``, which prints the statistics.
    Nothing is returned.
    """
    member_months = "7.4Membermonths2HealthInsuranceINDIVIDUALTotalasof3/31/15"
    frame = pd.read_csv(dataset, low_memory=False)

    # (derived column, source column): each source is divided by member months.
    per_member_month = (
        ("IndPremiumPMM", "1.1Directpremiumwritten2HealthInsuranceINDIVIDUALTotalasof3/31/15"),
        ("IndCostsPMM", "2.16Totalincurredclaims2HealthInsuranceINDIVIDUALTotalasof3/31/15"),
        ("IndRTPMM", "HHS RISK ADJUSTMENT TRANSFER AMOUNT (INDIVIDUAL MARKET, INCLUDING CATASTROPHIC)"),
    )
    for derived_name, source_name in per_member_month:
        frame[derived_name] = frame[source_name] / frame[member_months]
    frame["IndPremiumMinusCostsPMM"] = frame["IndPremiumPMM"] - frame["IndCostsPMM"]
    frame["IndCostsMinusPremiumsPMM"] = frame["IndCostsPMM"] - frame["IndPremiumPMM"]

    print("Statistics for " + dataset + "\n")

    # (first column, second column, state) for every correlation to report.
    comparisons = (
        ("IndPremiumPMM", "IndRTPMM", "ALL"),
        ("IndCostsPMM", "IndRTPMM", "ALL"),
        ("IndCostsPMM", "IndPremiumPMM", "ALL"),
        ("IndPremiumMinusCostsPMM", "IndRTPMM", "ALL"),
        ("IndCostsMinusPremiumsPMM", "IndRTPMM", "ALL"),
        ("IndPremiumPMM", "IndRTPMM", "CA"),
        ("IndPremiumPMM", "IndRTPMM", "WY"),
    )
    for first_column, second_column, state in comparisons:
        r(frame, first_column, second_column, state, "i", 0, 0)

In [15]:
# Print the correlation report produced by play() for the 2014 dataset,
# then for the 2015 dataset.
play("Output/2014dataset.csv")
play("Output/2015dataset.csv")

Statistics for Output/2014dataset.csv

IndPremiumPMM vs IndRTPMM in ALL
IndPremiumPMM 309.194208796
IndRTPMM -0.043825530441
IndPremiumPMM 309.194208796
IndRTPMM -0.043825530441
Covariance: 443.286522849
Std Column 1: 65.8300167158
Std Column 2: 36.3064963259
r: 0.185471089025

IndCostsPMM vs IndRTPMM in ALL
IndCostsPMM 323.35040521
IndRTPMM -0.043825530441
IndCostsPMM 323.35040521
IndRTPMM -0.043825530441
Covariance: 1791.1468794
Std Column 1: 107.082466313
Std Column 2: 36.3064963259
r: 0.460710908228

IndCostsPMM vs IndPremiumPMM in ALL
IndCostsPMM 323.35040521
IndPremiumPMM 309.194208796
IndCostsPMM 323.35040521
IndPremiumPMM 309.194208796
Covariance: 3864.76967543
Std Column 1: 107.082466313
Std Column 2: 65.8300167158
r: 0.548253340126

IndPremiumMinusCostsPMM vs IndRTPMM in ALL
IndPremiumMinusCostsPMM -14.0976853903
IndRTPMM -0.043825530441
IndPremiumMinusCostsPMM -14.0976853903
IndRTPMM -0.043825530441
Covariance: -1344.47481847
Std Column 1: 89.7407198802
Std Column 2: 36.3064