In [1]:
import pandas as pd
import numpy as np
from itertools import repeat
from statistics import median
from scipy.stats import kurtosis
from scipy.stats import skew

In [2]:
good_state_ids = ['NY','FL','CA','TX']

In [3]:
df_initial = pd.read_csv('census_data.csv')

In [4]:
df = df_initial.copy()
df['geo_id'] = df['geo_id'].astype(str)
geo_map = {'36':'NY',
           '12':'FL',
           '6':'CA',
           '48':'TX'
          }
df['geo'] = df['geo_id'].apply(lambda x: geo_map[x] if x in geo_map else x)

In [5]:
#mean by state
state_mean_df = df[['geo','income_per_capita']]
state_mean_df = state_mean_df[state_mean_df['geo'].isin(good_state_ids)]
state_mean_dict = dict(zip(state_mean_df.geo, state_mean_df.income_per_capita))

In [6]:
#mean entire country
total_mean = sum(df['total_pop'] * df['income_per_capita']) / df['total_pop'].sum()

In [7]:
keep_cols = ['geo','geo_id','total_pop']
income_cols = []
for i in df.columns:
    if 'income' in i:
        keep_cols.append(i)
        if '00' in i:
            income_cols.append(i)

In [8]:
df = df[keep_cols]

In [9]:
all_incomes = []
all_geos = []
by_state_dict = {}

#get a list of all incomes in data set (all_incomes)  and get that by state (by_state_dict)
for temp_id in df['geo'].unique():
    state_vals = []
    temp = df.copy()
    temp = temp[temp['geo']==temp_id]
    for income_col in income_cols:
        val = income_col.split('_')[-1]
        if val == 'more':
            val = 250000
        val = int(val)
        
        count = temp[income_col].min()
        all_incomes.extend(repeat(val, count))
        state_vals.extend(repeat(val, count))
        all_geos.extend(repeat(temp_id,count))
    by_state_dict[temp_id] = state_vals
#         income_lev_df = pd.DataFrame({'geo':[temp_id],
#                                       'income':[val]
#                                      })
        
#         income_lev_df = income_lev_df.append([income_lev_df]*(count-1),ignore_index=True)
#         final_df = final_df.append([income_lev_df])

In [10]:
# all_df = pd.DataFrame({'geo':all_geos,
#                        'income':all_incomes,
#                       })    

In [11]:
def get_median_skew(income_list):
    #get median from list
    med_val = int(median(income_list))
    
    #get # items in list
    num_vals = len(income_list)
    
    #get median index
    median_loc = (num_vals + 1)/2
    
    #get number items below median val
    num_below_med = len([i for i in income_list if i < med_val])
    
    #get number of values = to median
    num_at_med = len([i for i in income_list if i == med_val])
    
    #calculate how many items into 
    num_into_med = (median_loc - num_below_med) / num_at_med
    
    #calculate % into median range
    range_start = int(str(med_val).replace('999','') + '000')
    final_med = range_start + ((med_val-range_start)*num_into_med)
    
    #get skew
    skew_val = skew(income_list)
    
    return {'Skew':skew_val,
            'Median':final_med
           }

In [12]:
#country median/skew
total_med_skew_mean = get_median_skew(all_incomes)
total_med_skew_mean['mean'] = total_mean

In [13]:
total_med_skew_mean

{'Skew': 1.0459226727308006,
 'Median': 59977.40914478211,
 'mean': 32411.262103872632}

In [14]:
#get median/skew for all states individually
state_med_skew_mean_dict = {}
for state,incomes in by_state_dict.items():
    if state in good_state_ids:
        print(state)
        #print(len(incomes))
        state_med_skew_mean_dict[state] = get_median_skew(incomes)

TX
FL
NY
CA


In [15]:
for state,mean_val in state_mean_dict.items():
    state_med_skew_mean_dict[state]['Mean'] = mean_val
    
state_med_skew_mean_dict['US'] = total_med_skew_mean

In [16]:
state_med_skew_mean_dict

{'TX': {'Skew': 1.0549667228103075, 'Median': 59961.3843633655, 'Mean': 30143},
 'FL': {'Skew': 1.262708304934096, 'Median': 59354.80037009153, 'Mean': 30197},
 'NY': {'Skew': 0.8521439351930625, 'Median': 74386.1299736343, 'Mean': 37470},
 'CA': {'Skew': 0.7608806099964737,
  'Median': 74756.55312725568,
  'Mean': 35021},
 'US': {'Skew': 1.0459226727308006,
  'Median': 59977.40914478211,
  'mean': 32411.262103872632}}

In [17]:
print('test print')

test print


Ignore Below Here

In [40]:
# med_val = median(all_incomes)
# med_val

In [41]:
# num_vals = len(all_incomes)
# num_vals

In [42]:
# median_loc = (num_vals + 1)/2
# median_loc

In [43]:
# num_below_med = len([i for i in all_incomes if i < med_val])
# num_below_med

In [44]:
# (median_loc - num_below_med)

In [45]:
# num_at_med = len([i for i in all_incomes if i == med_val])
# num_at_med

In [46]:
# num_into_med = (median_loc - num_below_med) / num_at_med
# num_into_med

In [47]:
# 55000 + ((59999-55000)*num_into_med)

In [48]:
# pd.Series(all_incomes).value_counts()

In [49]:
# set(all_incomes)

In [50]:
# skew(all_incomes)