#### This Notebook will <li> import the 2016 population estimates <li> import the state table to join the abbreviations to state name <li> calculate the percentage of the US population in each state and DC <li> read in the stored data of grouped submissions by state <li> merge the datasets and calculate the difference from the percentage of submissions by that state

To do the EDA of the source of comments we import the latest estimated census data, from 2016 

In [1]:
## import packages
import pandas as pd
import numpy as np

Calculate % of US Population in each state & DC from 2016 Census Estimates.

In [2]:
#read in the 2016 population estimates
pop_df = pd.read_csv('sub-est2016_all.csv', encoding='latin-1')

In [3]:
#explore
pop_df.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016
0,40,1,0,0,0,0,0,A,Alabama,Alabama,4779736,4780131,4785492,4799918,4815960,4829479,4843214,4853875,4863300
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,2688,2688,2683,2685,2647,2631,2619,2616,2603
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,4522,4522,4517,4495,4472,4447,4428,4395,4360
3,162,1,0,484,0,0,0,A,Addison town,Alabama,758,756,754,753,748,748,747,740,738
4,162,1,0,676,0,0,0,A,Akron town,Alabama,356,356,355,345,345,342,337,337,334


In [4]:
#select only the columns we need
pop_df = pop_df[['POPESTIMATE2016',"NAME","STNAME"]]

In [5]:
#trim down rows so that we have excluded county population estimates
pop_df = pop_df[pop_df.STNAME==pop_df.NAME]

In [6]:
#explore trimed rows - ! notice DC is both a county and state and is duplicated
pop_df.head(10)

Unnamed: 0,POPESTIMATE2016,NAME,STNAME
0,4863300,Alabama,Alabama
1105,741894,Alaska,Alaska
1452,6931071,Arizona,Arizona
1673,2988248,Arkansas,Arkansas
2848,39250017,California,California
3925,5540545,Colorado,Colorado
4616,3576452,Connecticut,Connecticut
4901,952065,Delaware,Delaware
5025,681170,District of Columbia,District of Columbia
5027,681170,District of Columbia,District of Columbia


In [7]:
# drop duplicate rows of DC
pop = pop_df.drop_duplicates(subset=['NAME','STNAME'])

In [31]:
#explore
pop.head(9)

Unnamed: 0,POPESTIMATE2016,NAME,STNAME,perc_pop
0,4863300,Alabama,Alabama,1.51
1105,741894,Alaska,Alaska,0.23
1452,6931071,Arizona,Arizona,2.14
1673,2988248,Arkansas,Arkansas,0.92
2848,39250017,California,California,12.15
3925,5540545,Colorado,Colorado,1.71
4616,3576452,Connecticut,Connecticut,1.11
4901,952065,Delaware,Delaware,0.29
5025,681170,District of Columbia,District of Columbia,0.21


Read in table of State abbreviations.

In [10]:
#read in state abbreviation names
states_df = pd.read_csv('state_table.csv')

In [48]:
#explore
states_df

Unnamed: 0,name,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA
5,Colorado,CO
6,Connecticut,CT
7,Delaware,DE
8,Florida,FL
9,Georgia,GA


In [12]:
# reduce to state name and abbreviation
states_df = states_df[['name','abbreviation']]

Do the calculations.

In [13]:
# calculate % of pop in eact state and round to nearest 2 decimal places
pop['perc_pop'] = ((pop['POPESTIMATE2016']/pop['POPESTIMATE2016'].sum())*100).round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [53]:
pop.head(9)

Unnamed: 0,POPESTIMATE2016,NAME,STNAME,perc_pop
0,4863300,Alabama,Alabama,1.51
1105,741894,Alaska,Alaska,0.23
1452,6931071,Arizona,Arizona,2.14
1673,2988248,Arkansas,Arkansas,0.92
2848,39250017,California,California,12.15
3925,5540545,Colorado,Colorado,1.71
4616,3576452,Connecticut,Connecticut,1.11
4901,952065,Delaware,Delaware,0.29
5025,681170,Washington DC,District of Columbia,0.21


Replace District of Columbia with Washington DC as NAME, STNAME.

In [54]:
#replacing the contents of a cell across a column
pop['STNAME'] = pop['STNAME'].replace({"District of Columbia":"Washington DC" })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [55]:
#merge 2 dataframes on state name
pop_perc = pd.merge(pop, states_df, left_on='NAME',right_on='name', how='left')

In [57]:
pop_perc.head(9)

Unnamed: 0,POPESTIMATE2016,NAME,STNAME,perc_pop,name,abbreviation
0,4863300,Alabama,Alabama,1.51,Alabama,AL
1,741894,Alaska,Alaska,0.23,Alaska,AK
2,6931071,Arizona,Arizona,2.14,Arizona,AZ
3,2988248,Arkansas,Arkansas,0.92,Arkansas,AR
4,39250017,California,California,12.15,California,CA
5,5540545,Colorado,Colorado,1.71,Colorado,CO
6,3576452,Connecticut,Connecticut,1.11,Connecticut,CT
7,952065,Delaware,Delaware,0.29,Delaware,DE
8,681170,Washington DC,Washington DC,0.21,Washington DC,DC


In [58]:
# delete unwanted columns
del pop_perc['NAME']
del pop_perc['STNAME']

In [59]:
#save as CSV file
pop_perc.to_csv('FCC_pop_perc.csv',index=False)

Calculate percentage of comments from each state and DC

In [21]:
# >> Stored 'data' (str)
%store -r df

In [22]:
df.head()

Unnamed: 0,id,date_submission,contact_email,text_data,filers,address_line_1,city,express_comment,state,intl_address,combined_zip,flagged_zip,Place_Name,State_Abbreviation
0,59f8b2bb50b733300f069dc9,2017-07-11T15:53:55.361Z,YaroslavKalinina@pornhub.com,Allowing broadband providers to throttle their...,"[{""name"":""Yaroslav Kalinina""}]",,,,,"{""addresstext"":""3575 Eagle Lane\n55792,Virgini...",55792,1,Virginia,MN
1,59f8b2bb50b733300f069dca,2017-07-11T15:53:55.390Z,GalinaMedvedeva@pornhub.com,We need net neutralityto continue. A free and ...,"[{""name"":""Galina Medvedeva""}]",,,,,"{""addresstext"":""2245 Lodgeville Road\n55415,Mi...",55415,1,Minneapolis,MN
2,59f8b2bb50b733300f069dcb,2017-07-11T15:53:55.449Z,PhilemonButusov@pornhub.com,make sure net neutrality does not dissapear. I...,"[{""name"":""Philemon Butusov""}]",,,,,"{""addresstext"":""4702 Ashmor Drive\n55802,Dulut...",55802,1,Duluth,MN
3,59f8b2bb50b733300f069dcc,2017-07-11T15:53:55.473Z,BarbaraUlyanova@pornhub.com,We need net neutralityto continue. A free and ...,"[{""name"":""Barbara Ulyanova""}]",,,,,"{""addresstext"":""2323 Lodgeville Road\n55406,Mi...",55406,1,Minneapolis,MN
4,59f8b2bb50b733300f069dcd,2017-07-11T15:53:55.512Z,MatildaZhdanova@pornhub.com,Please save the internet from the corporations...,"[{""name"":""Matilda Zhdanova""}]",,,,,"{""addresstext"":""2135 Lena Lane\n39401,Hattiesb...",39401,1,Hattiesburg,MS


In [23]:
g1 = df.groupby( [ "State_Abbreviation"] ).count()

In [24]:
g1.reset_index() # note territories and 

Unnamed: 0,State_Abbreviation,id,date_submission,contact_email,text_data,filers,address_line_1,city,express_comment,state,intl_address,combined_zip,flagged_zip,Place_Name
0,AK,4441,4441,2743,4434,4441,4411,4412,40,4413,2948,4441,4441,4441
1,AL,29970,29970,22519,29961,29970,29903,29910,109,29910,22473,29970,29970,29970
2,AP,3,3,2,3,3,3,3,0,3,2,3,3,3
3,AR,15605,15605,12221,15599,15605,15561,15562,64,15562,10785,15605,15605,15605
4,AZ,36573,36573,23569,36537,36573,36298,36321,576,36321,23448,36573,36573,36573
5,CA,216119,216119,129713,215879,216119,213781,214037,2010,214034,137250,216119,216119,216119
6,CO,37914,37914,23635,37872,37914,37578,37619,349,37619,23848,37914,37914,37914
7,CT,24834,24834,10725,24816,24834,24649,24679,204,24679,18250,24834,24834,24834
8,DC,4962,4962,3366,4863,4962,4892,4901,27,4901,2750,4962,4962,4962
9,DE,4723,4723,3208,4719,4723,4686,4691,48,4691,3328,4723,4723,4723


In [26]:
g1.reset_index(level=0, inplace=True)

In [27]:
g2 = g1[["State_Abbreviation","filers"]]

In [28]:
g2.head()

Unnamed: 0,State_Abbreviation,filers
0,AK,4441
1,AL,29970
2,AP,3
3,AR,15605
4,AZ,36573


In [29]:
# calculate percentage proportions
g2['perc_comments'] = ((g2['filers']/g2['filers'].sum())*100).round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [47]:
g2.head(9)

Unnamed: 0,State_Abbreviation,filers,perc_comments
0,AK,4441,0.23
1,AL,29970,1.53
2,AP,3,0.0
3,AR,15605,0.8
4,AZ,36573,1.87
5,CA,216119,11.06
6,CO,37914,1.94
7,CT,24834,1.27
8,DC,4962,0.25


Merge population percentage with FCC comments percentage

In [60]:
#merge 2 dataframes on state name
pop_perc = pd.merge(pop_perc, g2, left_on='abbreviation',right_on='State_Abbreviation', how='left')

In [62]:
pop_perc.head(9)

Unnamed: 0,POPESTIMATE2016,perc_pop,name,abbreviation,State_Abbreviation,filers,perc_comments
0,4863300,1.51,Alabama,AL,AL,29970,1.53
1,741894,0.23,Alaska,AK,AK,4441,0.23
2,6931071,2.14,Arizona,AZ,AZ,36573,1.87
3,2988248,0.92,Arkansas,AR,AR,15605,0.8
4,39250017,12.15,California,CA,CA,216119,11.06
5,5540545,1.71,Colorado,CO,CO,37914,1.94
6,3576452,1.11,Connecticut,CT,CT,24834,1.27
7,952065,0.29,Delaware,DE,DE,4723,0.24
8,681170,0.21,Washington DC,DC,DC,4962,0.25


Calculate Difference between comments and population proportions.

In [63]:
pop_perc.to_csv('FCC_pop_comments_perc.csv')

In [64]:
FCC_pop_comments_perc= pop_perc

In [70]:
FCC_pop_comments_perc['diff_perc_pop_comments'] = FCC_pop_comments_perc['perc_pop']-FCC_pop_comments_perc['perc_comments']

In [71]:
FCC_pop_comments_perc.head(10) 

Unnamed: 0,POPESTIMATE2016,perc_pop,name,abbreviation,State_Abbreviation,filers,perc_comments,diff_prop_pop_prop_com,diff_perc_pop_comments
0,4863300,1.51,Alabama,AL,AL,29970,1.53,-0.02,-0.02
1,741894,0.23,Alaska,AK,AK,4441,0.23,0.0,0.0
2,6931071,2.14,Arizona,AZ,AZ,36573,1.87,0.27,0.27
3,2988248,0.92,Arkansas,AR,AR,15605,0.8,0.12,0.12
4,39250017,12.15,California,CA,CA,216119,11.06,1.09,1.09
5,5540545,1.71,Colorado,CO,CO,37914,1.94,-0.23,-0.23
6,3576452,1.11,Connecticut,CT,CT,24834,1.27,-0.16,-0.16
7,952065,0.29,Delaware,DE,DE,4723,0.24,0.05,0.05
8,681170,0.21,Washington DC,DC,DC,4962,0.25,-0.04,-0.04
9,20612439,6.38,Florida,FL,FL,125912,6.44,-0.06,-0.06


In [72]:
#CHANGED TO PERC2
FCC_pop_comments_perc.to_csv('FCC_pop_comments_perc2.csv',index=False)