In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# One frame per filing year. low_memory=False makes pandas read each file in a
# single pass instead of chunk by chunk when inferring column dtypes.
data2014 = pd.read_csv(filepath_or_buffer="Input/2014dataset.csv", low_memory=False)
data2015 = pd.read_csv(filepath_or_buffer="Input/2015dataset.csv", low_memory=False)
data2016 = pd.read_csv(filepath_or_buffer="Input/2016dataset.csv", low_memory=False)

In [3]:
# Distinct values of the STATE column, in order of first appearance, as a
# NumPy array. Series.as_matrix() was deprecated in pandas 0.23 and removed in
# pandas 1.0; .values returns the same ndarray on both old and new releases.
states = data2014["STATE"].drop_duplicates().values
print(states)

['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'HI' 'IA' 'ID' 'IL'
 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC' 'ND' 'NE'
 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'RI' 'SC' 'SD' 'TN' 'TX' 'UT'
 'VA' 'WA' 'WI' 'WV' 'WY']


In [1]:
# weighted rigorous statistics
# group can be "i" for individual or "sg" for small group
# absval can be 1 for yes and 0 for no

# functions:
# getmm(group) - returns the name of the member-months column for the group ("" if the group is not recognized)
# xbarybar(dataset, column_name, state, group, absval) - returns weighted avg of a column
# sigmaxsigmay(dataset, column_name, state, group, absval) - returns weighted std of a column
# covxy(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted cov of two columns
# covspecial(dataset, column_name1, yi, ybar, state, group, absval1) - returns weighted cov of one column and one state measure
# r(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r of two columns
# rspecial(dataset, column_name1, yi, ybar, sigmay, state, group, absval1) - returns weighted r of one column and one state measure
# rsquared(dataset, column_name1, column_name2, state, group, absval1, absval2) - returns weighted r^2 of two columns

def getmm(group):
    """Return the name of the member-months column for a market segment.

    Parameters
    ----------
    group : str
        "i" for the individual market or "sg" for the small-group market.

    Returns
    -------
    str
        The matching column name, or an empty string when ``group`` is
        neither "i" nor "sg".
    """
    member_month_columns = (
        ("i", "7.4Membermonths2HealthInsuranceINDIVIDUALTotalasof3/31/15"),
        ("sg", "7.4Membermonths7HealthInsuranceSMALLGROUPTotalasof3/31/15"),
    )
    # Scan with == instead of using a dict lookup so that any input, hashable
    # or not, falls through to the empty-string default.
    for code, column in member_month_columns:
        if group == code:
            return column
    return ""

def unweightedr(dataset, column_name1, column_name2):
    """Print and return the unweighted Pearson correlation of two columns.

    Rows in which either column is missing are ignored. The two column means
    and the resulting r (together with the number of rows used) are printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both columns.
    column_name1, column_name2 : label
        Columns to correlate; they may be the same column.

    Returns
    -------
    float
        The correlation coefficient, or nan when no complete rows remain or
        when either column has zero variance.
    """
    # Select each column on its own (not dataset[[c1, c2]]) so that passing the
    # same name twice does not create a frame with duplicated column labels.
    col_a = dataset[column_name1]
    col_b = dataset[column_name2]
    complete = col_a.notna() & col_b.notna()
    a = np.asarray(col_a[complete], dtype=float)
    b = np.asarray(col_b[complete], dtype=float)
    n = int(a.size)

    # Guard the empty case explicitly: there is no mean to report and the
    # correlation is undefined.
    nan = float("nan")
    avg_a = a.mean() if n else nan
    avg_b = b.mean() if n else nan
    print("Average a: " + str(avg_a))
    print("Average b: " + str(avg_b))

    dev_a = a - avg_a
    dev_b = b - avg_b
    num = np.sum(dev_a * dev_b)
    denom = np.sqrt(np.sum(dev_a ** 2)) * np.sqrt(np.sum(dev_b ** 2))

    # A zero denominator means no rows or a constant column; report nan
    # instead of dividing by zero.
    result = num / denom if n and denom != 0 else nan
    print("r for " + str(column_name1) + " vs " + str(column_name2) + " = " + str(result) + " with n = " + str(n))
    return result

In [2]:
def play(dataset):
    """Print unweighted correlations between individual-market per-member-month measures.

    Reads the CSV at ``dataset``, derives per-member-month (PMM) columns for
    direct premium written, total incurred claims and the HHS risk adjustment
    transfer amount of the individual market, and prints the correlations
    computed by ``unweightedr`` for several pairs of those columns.

    Parameters
    ----------
    dataset : str
        Path of the CSV file to read; it is also echoed in the printed header.

    Returns
    -------
    None
    """
    playground1 = pd.read_csv(dataset, low_memory=False)

    # Look the denominator column up once through getmm so this function and
    # the helper agree on which column holds individual member months.
    raw_member_months = playground1[getmm("i")]
    # Zero member months would turn the ratios below into +/-inf, which the
    # dropna() inside unweightedr does not remove and which makes r come out
    # as nan. Mask zeros to NaN so those rows are dropped instead.
    member_months = raw_member_months.where(raw_member_months != 0)

    playground1["IndPremiumPMM"] = playground1["1.1Directpremiumwritten2HealthInsuranceINDIVIDUALTotalasof3/31/15"] / member_months
    playground1["IndCostsPMM"] = playground1["2.16Totalincurredclaims2HealthInsuranceINDIVIDUALTotalasof3/31/15"] / member_months
    playground1["IndRTPMM"] = playground1["HHS RISK ADJUSTMENT TRANSFER AMOUNT (INDIVIDUAL MARKET, INCLUDING CATASTROPHIC)"] / member_months
    playground1["IndPremiumMinusCostsPMM"] = playground1["IndPremiumPMM"] - playground1["IndCostsPMM"]
    playground1["IndCostsMinusPremiumsPMM"] = playground1["IndCostsPMM"] - playground1["IndPremiumPMM"]

    print("Statistics for " + dataset)
    # Column pairs to correlate, in the order they are reported. The last pair
    # correlates a column with itself.
    column_pairs = (
        ("IndPremiumPMM", "IndRTPMM"),
        ("IndCostsPMM", "IndRTPMM"),
        ("IndCostsPMM", "IndPremiumPMM"),
        ("IndPremiumMinusCostsPMM", "IndRTPMM"),
        ("IndPremiumMinusCostsPMM", "IndPremiumMinusCostsPMM"),
    )
    for first_column, second_column in column_pairs:
        unweightedr(playground1, first_column, second_column)
    print("\n")

In [3]:
# Report the same statistics for each filing year, oldest first.
for dataset_path in (
    "Input/2014dataset.csv",
    "Input/2015dataset.csv",
    "Input/2016dataset.csv",
):
    play(dataset_path)

NameError: name 'pd' is not defined