# Sum of Squared Errors in Demographic Inertia Analysis


In [8]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

## Load data from models 



In [2]:
# All four summary files carry the same timestamp suffix. Keep it in one
# place so that pointing the notebook at a different export is a single edit.
RUN_STAMP = '02:41PM on August 01, 2019'

# One summary CSV per model; each is read into its own DataFrame.
no_growth = pd.read_csv('no_growth_summary' + RUN_STAMP + '.csv')
linear_growth = pd.read_csv('linear_growth_summary' + RUN_STAMP + '.csv')
threeyr = pd.read_csv('model_3yr_growth_summary' + RUN_STAMP + '.csv')
fouryr = pd.read_csv('model_4yr_growth_summary' + RUN_STAMP + '.csv')



## Collect and Standardize Variables

The variables of interest are `fpct`, `deptn`, `f`, and `m`. So let's find and standardize those. 

1. I have to get these fields from the data
2. Get the standard deviation and mean for each variable
3. Apply the standardization
4. Calculate the sum of squared errors for original values. 

After that I need to apply the same approach on the first differences. 

1. Calculate first differences for the variables. 
2. Get the standard deviation and mean for each variable
3. Apply the standardization
4. Calculate the sum of squared errors for the first-differenced values. 

### Create nice dataframe to hold data

In [3]:
# Build one wide frame: the observed ("act_") series first, then the averaged
# series of each model side by side.
#
# Model columns are prefixed with add_prefix, which returns a new frame, so the
# loaded summaries keep their original column names and this cell gives the
# same result no matter how many times it is re-run.
value_cols = ['fpct_avg', 'deptn_avg', 'f_avg', 'm_avg']
actual_cols = ['act_fpct', 'act_deptn', 'act_f', 'act_m']
model_names = ['lg_', 'three_', 'four_']

# The observed values and the no-growth averages are both taken from the
# no_growth summary; the no-growth averages keep their unprefixed names.
frames = [no_growth[actual_cols], no_growth[value_cols]]
for prefix, model in zip(model_names, [linear_growth, threeyr, fouryr]):
    frames.append(model[value_cols].add_prefix(prefix))

# Single column-wise concat instead of growing the frame inside the loop.
# Resulting column order: act_*, unprefixed (no growth), lg_*, three_*, four_*.
df = pd.concat(frames, axis=1, sort=False)
df

Unnamed: 0,act_fpct,act_deptn,act_f,act_m,fpct_avg,deptn_avg,f_avg,m_avg,lg_fpct_avg,lg_deptn_avg,lg_f_avg,lg_m_avg,three_fpct_avg,three_deptn_avg,three_f_avg,three_m_avg,four_fpct_avg,four_deptn_avg,four_f_avg,four_m_avg
0,0.108,74.0,8.0,66.0,0.108,74.0,8.0,66.0,0.108,74.0,8.0,66.0,0.108,74.0,8.0,66.0,0.108,74.0,8.0,66.0
1,0.099,71.0,7.0,64.0,0.119,73.74,8.79,64.95,0.118,75.22,8.88,66.34,0.116,74.51,8.63,65.88,0.12,75.69,9.09,66.6
2,0.115,78.0,9.0,69.0,0.127,73.51,9.33,64.18,0.127,76.05,9.66,66.39,0.126,75.59,9.51,66.08,0.129,76.86,9.95,66.91
3,0.128,78.0,10.0,68.0,0.133,73.04,9.72,63.32,0.135,77.05,10.44,66.61,0.135,75.97,10.31,65.66,0.14,77.55,10.82,66.73
4,0.143,84.0,12.0,72.0,0.143,73.45,10.54,62.91,0.143,77.66,11.14,66.52,0.146,76.67,11.22,65.45,0.146,78.74,11.53,67.21
5,0.153,85.0,13.0,72.0,0.149,73.29,10.97,62.32,0.151,77.97,11.82,66.15,0.155,76.66,11.91,64.75,0.155,79.15,12.25,66.9
6,0.146,89.0,13.0,76.0,0.157,73.29,11.54,61.75,0.158,78.11,12.4,65.71,0.16,76.95,12.31,64.64,0.162,79.72,12.88,66.84
7,0.14,86.0,12.0,74.0,0.163,73.21,11.98,61.23,0.169,78.47,13.27,65.2,0.167,76.94,12.84,64.1,0.169,80.11,13.51,66.6
8,0.136,81.0,11.0,70.0,0.167,73.14,12.2,60.94,0.176,78.99,13.95,65.04,0.173,76.9,13.34,63.56,0.178,80.64,14.36,66.28
9,0.169,83.0,14.0,69.0,0.174,73.55,12.77,60.78,0.18,78.6,14.17,64.43,0.18,76.82,13.81,63.01,0.183,80.91,14.78,66.13


### Normalize data

In [25]:
# Split the combined frame into one frame per source, selecting columns by
# their name prefix. A prefix test (startswith) is used instead of a
# substring/regex match so that a column which merely contains e.g. "act"
# somewhere else in its name is never picked up by accident.
df_actual = df.loc[:, df.columns.str.startswith('act_')]
# The no-growth columns carry no prefix, so they are listed explicitly.
df_ng = df[['fpct_avg', 'deptn_avg', 'f_avg', 'm_avg']]
df_lg = df.loc[:, df.columns.str.startswith('lg_')]
df_three = df.loc[:, df.columns.str.startswith('three_')]
df_four = df.loc[:, df.columns.str.startswith('four_')]

In [35]:
# Fit a standard scaler (per-column mean and standard deviation) on the first
# differences of the observed series.
#
# diff() always leaves the first row as NaN because it has no predecessor.
# Drop that structural row explicitly rather than relying on the scaler's
# NaN handling, which differs between scikit-learn versions. Only the first
# row is removed, so any other missing values are treated exactly as before.
actual_diff = df_actual.diff().iloc[1:]

scaler = preprocessing.StandardScaler()
scaler.fit(actual_diff)
print(scaler.mean_)   # per-column mean of the first differences
print(scaler.scale_)  # per-column scale (standard deviation) used for scaling

[0.00875    1.25       0.91666667 0.33333333]
[0.01677362 3.91843932 1.552328   3.29983165]


In [34]:
# First differences of the observed series (each row minus the row before it);
# the first row has no predecessor and is therefore NaN.
df_actual.diff(periods=1)

Unnamed: 0,act_fpct,act_deptn,act_f,act_m
0,,,,
1,-0.009,-3.0,-1.0,-2.0
2,0.016,7.0,2.0,5.0
3,0.013,0.0,1.0,-1.0
4,0.015,6.0,2.0,4.0
5,0.01,1.0,1.0,0.0
6,-0.007,4.0,0.0,4.0
7,-0.006,-3.0,-1.0,-2.0
8,-0.004,-5.0,-1.0,-4.0
9,0.033,2.0,3.0,-1.0


In [None]:
# Display the combined frame of observed ("act_") and model-average columns.
df