In [154]:
%run -i globals.ipynb

<h2>Rapid Python analysis to Google Sheets for reporters</h2>
<h3>Example using Texas Census data</h3>
<p>In this example, we will be doing simple analysis on the fastest growing counties in Texas, according to U.S. Census data. The goal is to find the following:</p>

<ul>
    <li>Find how much the state of Texas has grown over the last decade</li>
    <li>The county that has grown the most in percentage and in raw number</li>
    <li>How many counties have lost population</li>
    <li>The top 10 fastest growing counties in Texas (with a population of 10,000 residents)</li>

</ul>

In [155]:
## Variables

filename1 = 'raw/ACSDT5Y2016.B01003-Data.csv'

filename2 = 'raw/ACSDT5Y2021.B01003-Data.csv'

outputfile = 'output/top_10_county_Texas_change'

## Google Sheet Name

google_sheet_name = "Texas Census analysis"

## Google Sheet Directory ID

google_folderID = '1-UA5NfR9cYuNBDQfIE-6ZRTzKDd7KRgE'



<h3>Analysis of data</h3>
<h4>Merge the data</h4>

In [156]:
df1 = pd.read_csv('raw/ACSDT5Y2016.B01003-Data.csv', skiprows=1, dtype={'Estimate!!Total': int})

df2 = pd.read_csv('raw/ACSDT5Y2021.B01003-Data.csv', skiprows=1, dtype={'Estimate!!Total': int})

In [157]:
df_merge = pd.merge(df1, df2, how="outer", on=["Geography"])

In [158]:
df_merge.head(5)

Unnamed: 0,Geography,Geographic Area Name_x,Estimate!!Total_x,Annotation of Estimate!!Total_x,Margin of Error!!Total_x,Annotation of Margin of Error!!Total_x,Geographic Area Name_y,Estimate!!Total_y,Annotation of Estimate!!Total_y,Margin of Error!!Total_y,Annotation of Margin of Error!!Total_y
0,0500000US48001,"Anderson County, Texas",57772,,*****,*****,"Anderson County, Texas",58133,,*****,*****
1,0500000US48003,"Andrews County, Texas",17215,,*****,*****,"Andrews County, Texas",18184,,*****,*****
2,0500000US48005,"Angelina County, Texas",87657,,*****,*****,"Angelina County, Texas",86584,,*****,*****
3,0500000US48007,"Aransas County, Texas",24729,,*****,*****,"Aransas County, Texas",24149,,*****,*****
4,0500000US48009,"Archer County, Texas",8750,,*****,*****,"Archer County, Texas",8616,,*****,*****


<h4>Find population changes, in raw number and percentages </h4>

In [159]:
df_merge['pop_change'] = df_merge['Estimate!!Total_y'] - df_merge['Estimate!!Total_x']

In [160]:
df_merge['pop_change_pct'] = ((df_merge['Estimate!!Total_y'] - df_merge['Estimate!!Total_x'])/df_merge['Estimate!!Total_x']) * 100



<h4>Take out unnecessary columns, clean up column names and county names</h4>

In [161]:
df_merge_short = df_merge[['Geographic Area Name_x', 'Estimate!!Total_x', 'Estimate!!Total_y', 'pop_change', 'pop_change_pct']]

In [162]:
df_merge_short.head(5)

Unnamed: 0,Geographic Area Name_x,Estimate!!Total_x,Estimate!!Total_y,pop_change,pop_change_pct
0,"Anderson County, Texas",57772,58133,361,0.62487
1,"Andrews County, Texas",17215,18184,969,5.628812
2,"Angelina County, Texas",87657,86584,-1073,-1.224089
3,"Aransas County, Texas",24729,24149,-580,-2.345424
4,"Archer County, Texas",8750,8616,-134,-1.531429


In [163]:
df_merge_short = df_merge[['Geographic Area Name_x', 'Estimate!!Total_x', 'Estimate!!Total_y', 'pop_change', 'pop_change_pct']]
df_merge_short.rename(columns={'Geographic Area Name_x': 'county', 'Estimate!!Total_x': 'pop_2016', 'Estimate!!Total_y': 'pop_2021'}, inplace=True)
df_merge_short['county'] = df_merge_short['county'].str.replace(" County, Texas", "")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


<h4>Take out counties with less than 10,000 residents and find the top 10 fastest growing counties</h4>

In [164]:
no_small_counties = df_merge_short[df_merge_short['pop_2021'] > 10000].sort_values(by=['pop_change_pct'], ascending=False)

In [165]:
no_small_counties

Unnamed: 0,county,pop_2016,pop_2021,pop_change,pop_change_pct
104,Hays,185686,234573,48887,26.327779
45,Comal,124234,156257,32023,25.776358
128,Kaufman,111830,140145,28315,25.319682
10,Bastrop,78286,94887,16601,21.205580
245,Williamson,490619,591759,101140,20.614774
...,...,...,...,...,...
57,Dawson,13317,12371,-946,-7.103702
109,Hockley,23377,21670,-1707,-7.302049
231,Uvalde,27055,24918,-2137,-7.898725
175,Newton,14138,12532,-1606,-11.359457


In [166]:
no_small_counties.head(10).to_csv('output/top_10_county_Texas_change.csv', index=False)

<h4>Total Texas population changes</h4>

In [167]:
TX_population_2016 = df_merge_short['pop_2016'].sum()
TX_population_2021 = df_merge_short['pop_2021'].sum()
TX_population_change = TX_population_2021 - TX_population_2016
TX_pop_pct_change = ((TX_population_2021 - TX_population_2016)/TX_population_2016)*100

In [168]:
TX_population_change

1906146

In [169]:
TX_pop_pct_change

7.07120952752098

<h4>Texas counties with the bigget population changes</h4>

In [170]:
county_most_pct_change = df_merge_short.sort_values(by=['pop_change_pct'], ascending=False).head(1)
county_most_num_change = df_merge_short.sort_values(by=['pop_change'], ascending=False).head(1)

In [171]:
county_most_num_change

Unnamed: 0,county,pop_2016,pop_2021,pop_change,pop_change_pct
100,Harris,4434257,4697957,263700,5.946881


In [172]:
county_most_num_change['county'].iat[0]

'Harris'

<h4>Texas counties that lost population</h4>

In [173]:
lost_population = df_merge_short[df_merge_short['pop_change'] < 0].sort_values(by=['pop_change'], ascending=False)

In [174]:
lost_population

Unnamed: 0,county,pop_2016,pop_2021,pop_change,pop_change_pct
96,Hamilton,8232,8211,-21,-0.255102
237,Ward,11396,11375,-21,-0.184275
86,Glasscock,1253,1221,-32,-2.553871
131,Kent,667,632,-35,-5.247376
134,King,274,229,-45,-16.423358
...,...,...,...,...,...
253,Zavala,12107,9900,-2207,-18.229124
120,Jasper,35640,33369,-2271,-6.372054
124,Jim Wells,41486,39203,-2283,-5.503061
242,Wichita,132148,129419,-2729,-2.065109


In [175]:
number_lost_population = len(lost_population.index)
number_lost_population

142

<h3>Putting aggregate totals in a spreadsheet</h3>
<p>This creates a spreadsheet of all of the topline stats so we have them in one place.</p>

In [176]:
# This is where we put the fields that we have to include
agg_columns = ['texas_growth', 'texas_growth_pct_change', 'county_most_pct_change',
               'county_most_num_change', 'number_lost_population']

In [177]:
agg_data = [comma_format(TX_population_change), pct_format(TX_pop_pct_change),county_most_pct_change['county'].iat[0], county_most_num_change['county'].iat[0], number_lost_population]


In [178]:
data_to_add = {'Criteria': agg_columns, 'Totals': agg_data}

df_agg = pd.DataFrame(data_to_add)

In [179]:
df_agg

Unnamed: 0,Criteria,Totals
0,texas_growth,1906146
1,texas_growth_pct_change,7.07%
2,county_most_pct_change,Hays
3,county_most_num_change,Harris
4,number_lost_population,142
