# Compare different model outputs against empirical metrics of US MSA income distributions


Descriptive Characteristics of Empirical Distributions, against which to compare models:
- general shape
- decile scaling
- growth of the tail in relative distributions (better described by the next item, the moments)
- moments vs. population (especially 3rd)
    - NOTE: What about the 1st moment? Effectively, what about the actual scaling behavior across city size, which won't show up here?



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import sklearn.metrics as sk
import scipy.stats
from scipy.optimize import curve_fit
#from RegscorePy import *
from IPython.core.debugger import set_trace
import seaborn as sns
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Input from Data & Model Output

# Decile Scaling

In [None]:
# Split every selected MSA's tracts into population deciles ordered by income.
#
# `deciles` maps a decile index (1-10) to a pair of parallel lists:
#   ([population-weighted income sum of that decile, one entry per MSA],
#    [city population of the same MSA]).
# NOTE(review): the original comment called the first list "income_pp"; the
# value stored is sum(income * population) rescaled to exactly one tenth of
# the MSA population -- confirm which quantity is intended.
deciles = {i: ([], []) for i in range(1, 11)}

for msa in income_lists:
    if str(msa) in metro_ids:
        income_list = income_lists[msa]
        pop_list = fullpops[msa]
        city_pop = city_pops[msa]

        # Cumulative-population thresholds at which each decile is closed.
        sum_pop = sum(pop_list)
        pop_cutoffs = [sum_pop*float(i)/10. for i in range(1,11)]
        cumulative_pop = 0.
        current_decile = 1
        # summed population and income for current decile
        decile_pop = 0.
        decile_inc = 0.
        # sort by income (population travels with its income value)
        income_list,pop_list = (list(t) for t in zip(*sorted(zip(income_list, pop_list))))
        for ctpop, ctinc in zip(pop_list,income_list):
            # All ten deciles are stored; without this guard any remaining
            # tract would index past the end of pop_cutoffs.
            if current_decile > 10:
                break
            cumulative_pop += ctpop
            decile_pop += ctpop
            decile_inc += ctinc*ctpop
            if cumulative_pop >= pop_cutoffs[current_decile - 1]:
                # Tracts do not end exactly on the cutoff, so rescale the
                # accumulated income to a decile of exactly sum_pop/10 people.
                # NOTE(review): a single tract that crosses several cutoffs
                # still closes only one decile here -- confirm this is acceptable.
                correction = (sum_pop/10.)/(decile_pop)
                decile_inc *= correction

                #store in deciles
                deciles[current_decile][0].append(decile_inc)
                deciles[current_decile][1].append(city_pop)
                current_decile += 1
                #reset
                decile_pop = 0
                decile_inc = 0

        # Total population-weighted income of the MSA. The original referenced
        # an undefined name `pop` here; the sorted `pop_list` matches
        # `income_list` element for element.
        sum_inc = np.sum(np.array(income_list)*np.array(pop_list))

        
def standardModelFunc(x,y0,delta):
    """Return the power-law model y0 * x**(1 + delta).

    x     -- value(s) to evaluate at; anything supporting ``**`` and ``*``
    y0    -- multiplicative prefactor
    delta -- deviation of the exponent from 1
    """
    exponent = 1+delta
    powered = x**exponent
    return y0*powered
def standardLogResidual(params, x, true_data):
    """Return log(true_data) - log(model) for the power-law model.

    params    -- mapping indexed by 'y0' and 'delta'
    x         -- value(s) at which standardModelFunc is evaluated
    true_data -- observed value(s) compared against the model
    """
    y0 = params['y0']
    delta = params['delta']
    predicted = standardModelFunc(x, y0, delta)
    return np.log(true_data) - np.log(predicted)
        
def standardLogMod(x,logy0,delta):
    """Return the straight line logy0 + x*(1 + delta).

    This is the log-space form of the power-law model: x is the log of the
    independent variable, logy0 the intercept, and 1 + delta the slope.
    """
    slope = 1+delta
    return logy0 + x*slope
    
# Scatter each decile's values against city population and overlay a
# power-law fit obtained from a straight-line fit in log-log space.
# Per-decile lmfit objects are kept in the three dicts below, keyed 1-10.
# NOTE(review): lmfit is not imported in the visible import cell -- confirm
# it is imported elsewhere in the notebook.
plt.figure()
ax = plt.gca()
decileFits = {}
decileModels = {}
decileParams = {}
for i in range(1,11):
    # Convert once to arrays so `**` and `*` in the model functions are
    # elementwise regardless of the type of the other operand.
    x = np.asarray(deciles[i][1], dtype=float)
    y = np.asarray(deciles[i][0], dtype=float)
    #print("decile:",i,"x,y = ",x,y)
    plt.scatter(x,y,s=2)

    # lmfit parameters as (name, value, vary, min, max): y0 starts at
    # 8*y[0]/x[0] and is bounded to [0, 1]; delta starts at 0, unbounded.
    decileParams[i] = lmfit.Parameters()
    decileParams[i].add_many(('y0',8*y[0]/x[0],True,0,1),('delta',0.,True))

    decileModels[i] = lmfit.Model(standardModelFunc)
    decileFits[i] = lmfit.minimize(standardLogResidual, decileParams[i], args=(x, y))
    decileParams[i] = decileFits[i].params

    #using curve_fit: coeff[0] is the fitted log(y0), coeff[1] the fitted delta
    logx = np.log(x)
    logy = np.log(y)
    coeff, var = curve_fit(standardLogMod, logx, logy)

    lbl = "decile "+str(i)+", Beta = "+str(round((1.+coeff[1]),3))
    print(coeff)
    # np.longdouble is the portable name for extended precision; np.float128
    # is not defined on every platform.
    fity = np.exp(np.array(coeff[0],dtype=np.longdouble))*x**(1+coeff[1])
    #print(x,fity)
    ax.plot(x, fity,'--',label=lbl,markersize=1)
    #ax.plot(x,decileModels[i].eval(decileParams[i],x=x),'--',label=lbl)


ax.set_yscale("log")
ax.set_xscale("log")
plt.legend()


In [None]:
#