# Population ETL Notebook
1. Census API
2. clean dataset for ML models
3. collect multiple years of population data 
4. predict future population and population growth rate
5. fill future prediction in dataset dataframe
  - prediction of 6 years of future population data
6. save dataframe with predicted values in a CSV for use by the DS Team 
  - 1 of the features for the DS API (along with crime rate, rental rates and walk score) 

## 1. Census API ETL scripts

In [None]:
import requests
import pandas as pd
from ast import literal_eval

def population_data_api(year_str):
    """Download ACS 5-year population figures for every Census place.

    Requests variable ``B01003_001E`` (stored here as ``POPULATION``) from the
    ``acs/acs5`` endpoint for all places in all states for one survey year.

    Parameters
    ----------
    year_str : str or int
        Survey year used in the API path; converted with ``str()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``NAME``, ``POPULATION``, ``state``, ``place`` with the values
        as delivered by the API, plus an integer ``YEAR`` column.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    ValueError
        If the response body is not valid JSON.
    """
    import os

    # convert to str
    year_str = str(year_str)

    # SECURITY: the key should come from the environment; the literal is kept
    # only as a fallback so existing runs of the notebook keep working.
    census_api_key = os.environ.get(
        'CENSUS_API_KEY', 'ca170bc6585e4b20fe39912a9c403931fa7e8196'
    )

    # build the API call
    calledAPI = 'https://api.census.gov/data/' + year_str + '/acs/acs5?get=NAME,B01003_001E&for=place:*&in=state:*&key='+ census_api_key

    # call the API; a timeout stops a stalled connection from hanging the
    # notebook forever, and raise_for_status surfaces HTTP errors directly
    response = requests.get(calledAPI, timeout=120)
    response.raise_for_status()

    # The body is JSON, so parse it as JSON: ast.literal_eval cannot read JSON
    # tokens such as null/true/false and would fail on them.
    result_list = response.json()

    # the first row is the API's own header; skip it and use our column names
    rows = result_list[1:]
    columns = ['NAME', 'POPULATION', 'state', 'place']

    df = pd.DataFrame(rows, columns=columns)
    df['YEAR'] = int(year_str)
    return df

## 2. Clean dataset for ML models

In [None]:
import pandas as pd

def clean_pop_df(df):
    """Turn raw Census place rows into ``YEAR`` / ``City_State`` / ``POPULATION``.

    The ``NAME`` column ("<place>, <state>") is split into city and state, the
    place-type words (" city", " borough", " town", " village", " CDP") are
    removed from the city part, and both parts are re-joined as the
    ``City_State`` key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NAME``, ``YEAR`` and ``POPULATION`` columns.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``YEAR``, ``City_State``, ``POPULATION`` and the
        same index as ``df``.
    """
    # Split on the LAST comma only. A place name that itself contains a comma
    # would otherwise yield three columns and break the city/state assignment.
    name_parts = df['NAME'].str.rsplit(",", n=1, expand=True)
    city = name_parts[0]

    # clean the leading white space left over from ", "
    state = name_parts[1].str.strip(" ")

    # clean city suffixes and endings
    strip_names = [' city', ' borough', ' town', ' village', ' CDP']
    for suffix in strip_names:
        # regex=False: these are literal strings, and the default for `regex`
        # differs between pandas versions
        city = city.str.replace(suffix, "", regex=False)

    # Build a fresh frame instead of adding helper columns to the caller's df
    # and returning a slice of it.
    pop_df = pd.DataFrame({
        'YEAR': df['YEAR'],
        # joining key
        'City_State': city + ", " + state,
        'POPULATION': df['POPULATION'],
    })

    return pop_df

## 3. collect multiple years of population data

In [None]:
def fill_10_years_pop_df(years=None):
    """Download and clean one population DataFrame per survey year.

    Parameters
    ----------
    years : iterable of int or str, optional
        Survey years to fetch. Defaults to 2010 through 2019 inclusive, which
        is what the function fetched before the parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        One cleaned frame per year (``population_data_api`` followed by
        ``clean_pop_df``), in the order of ``years``.
    """
    if years is None:
        years = range(2010, 2020)

    return [clean_pop_df(population_data_api(str(year))) for year in years]

In [None]:
def concat_dfs(all_df):
    """Stack a list of per-year DataFrames into one long DataFrame.

    Parameters
    ----------
    all_df : sequence of pandas.DataFrame
        Frames to stack, e.g. the list returned by ``fill_10_years_pop_df``.
        Any number of frames is accepted, not just ten.

    Returns
    -------
    pandas.DataFrame
        All rows of all frames in list order. Each frame keeps its own index
        labels, so labels repeat across years.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when ``all_df`` is empty.
    """
    # A single concat handles any list length and avoids re-copying the
    # growing frame once per year.
    return pd.concat(list(all_df))

## 4. predict future population and population growth rate

In [None]:
# Import the appropriate estimator class from Scikit-Learn
from sklearn.linear_model import LinearRegression

def predict_pop_growth(user_city_state, big_df):
    """Predict last year's and this year's population for one city.

    Fits a ``LinearRegression`` of ``POPULATION`` on ``YEAR`` using only the
    rows of ``big_df`` whose ``City_State`` matches, then predicts the current
    calendar year and the year before it.

    Parameters
    ----------
    user_city_state : dict
        Must contain the key ``'City_State'`` (e.g. "San Francisco, California").
    big_df : pandas.DataFrame
        Frame with ``YEAR``, ``City_State`` and ``POPULATION`` columns.

    Returns
    -------
    dict
        ``{'pop_<last year>': ..., 'pop_<this year>': ..., 'percent_pop_growth': ...}``
        with populations rounded to 0 decimals and the growth percentage
        rounded to 2 decimals.

    Raises
    ------
    ValueError
        If ``big_df`` has no rows for the requested city.
    """
    from datetime import datetime

    # get city state from json
    city_state = user_city_state['City_State']

    # filter big_df
    city_df = big_df[big_df['City_State'] == city_state]
    if city_df.empty:
        # Fail with a clear message rather than an opaque estimator error.
        raise ValueError(
            "No population rows found for City_State {!r}".format(city_state)
        )

    # Arrange X features matrix & y target vector, then fit
    features = ['YEAR']
    target = 'POPULATION'
    model = LinearRegression()
    model.fit(city_df[features], city_df[target])

    # Apply the model to last year and this year in one call. A DataFrame with
    # the training column name keeps the feature names consistent with fit().
    this_year = datetime.today().year
    last_year = this_year - 1
    X_new = pd.DataFrame({'YEAR': [last_year, this_year]})
    predictions = model.predict(X_new)

    y_pred_last_year = round(predictions[0], 0)
    y_pred_this_year = round(predictions[1], 0)
    last_label = 'pop_' + str(last_year)
    this_label = 'pop_' + str(this_year)

    # calculate percent_pop_growth
    percent_pop_growth = (y_pred_this_year - y_pred_last_year)/y_pred_last_year * 100
    percent_pop_growth = round(percent_pop_growth, 2)

    return {last_label: y_pred_last_year, this_label: y_pred_this_year, 'percent_pop_growth': percent_pop_growth}

In [None]:
# Import the appropriate estimator class from Scikit-Learn
from sklearn.linear_model import LinearRegression

def predict_pop_growth_gte2020(city_state, year, big_df):
    """Predict one city's population for one year.

    Used to fill the dataframe with predictions. Fits a ``LinearRegression``
    of ``POPULATION`` on ``YEAR`` using only the rows of ``big_df`` whose
    ``City_State`` equals ``city_state``.

    Parameters
    ----------
    city_state : str
        Key to look up in the ``City_State`` column.
    year : int
        Year to predict.
    big_df : pandas.DataFrame
        Frame with ``YEAR``, ``City_State`` and ``POPULATION`` columns.

    Returns
    -------
    float
        Predicted population rounded to 0 decimals.

    Raises
    ------
    ValueError
        If ``big_df`` has no rows for ``city_state``.
    """
    # filter big_df
    city_df = big_df[big_df['City_State'] == city_state]
    if city_df.empty:
        # Fail with a clear message rather than an opaque estimator error.
        raise ValueError(
            "No population rows found for City_State {!r}".format(city_state)
        )

    # Arrange X features matrix & y target vector, then fit
    features = ['YEAR']
    target = 'POPULATION'
    model = LinearRegression()
    model.fit(city_df[features], city_df[target])

    # Predict from a DataFrame carrying the training column name so the
    # feature names stay consistent with fit().
    X_new = pd.DataFrame({'YEAR': [year]})
    y_pred = model.predict(X_new)
    y_pred = round(y_pred[0], 0)

    return y_pred



## Main

In [None]:
  # MAIN
  # Fetch and clean one DataFrame per survey year (calls the Census API).
  all_df = fill_10_years_pop_df() # list of dataframes

  # main
  # Stack the per-year frames into a single long DataFrame.
  big_df = concat_dfs(all_df)


In [None]:
  # main - user input
  # this works only for one city_state; the dict must carry the 'City_State' key
  user_city_state = {'City_State':'San Francisco, California'}

  # returns a dict: last year's and this year's predicted population plus percent growth
  results = predict_pop_growth(user_city_state, big_df)
  results 

{'percent_pop_growth': 1.14, 'pop_2020': 889816.0, 'pop_2021': 899932.0}

In [None]:
results['pop_2020']

889816.0

In [None]:
big_df.head()

Unnamed: 0,YEAR,City_State,POPULATION
0,2010,"El Dorado Hills, California",43179
1,2010,"Eldridge, California",1757
2,2010,"El Granada, California",4669
3,2010,"Elizabeth Lake, California",1609
4,2010,"Elk Creek, California",118


## 2021 predictions for all unique city_states

In [None]:
# Main

list_of_cities = big_df['City_State'].unique()

In [None]:
list_of_cities

array(['El Dorado Hills, California', 'Eldridge, California',
       'El Granada, California', ..., 'Howard City (Boelus), Nebraska',
       'Yorkville, Wisconsin', 'Krupp (Marlin), Washington'], dtype=object)

In [None]:
big_df.columns

Index(['YEAR', 'City_State', 'POPULATION'], dtype='object')

In [None]:
# Target year, and one row per unique City_State tagged with that year.
year = 2021
df_2021 = pd.DataFrame({'City_State': list_of_cities}).assign(YEAR=year)

In [None]:
# One prediction per city for the chosen `year`, in the order of list_of_cities.
predicted_pop = [
    predict_pop_growth_gte2020(city_state, year, big_df)
    for city_state in list_of_cities
]

In [None]:
# NOTE(review): in the visible notebook, population_2021 is only assigned in the
# later "data dictionary" cell (dict(zip(df_2021.City_State, df_2021.POPULATION))).
# Running the cells top-to-bottom reaches this line first and raises NameError.
population_2021

In [None]:
import numpy as np

# Attach the predictions, then put the columns in the same order as big_df.
df_2021 = df_2021.assign(POPULATION=np.array(predicted_pop))
df_2021 = df_2021.loc[:, ['YEAR', 'City_State', 'POPULATION']]
df_2021.head()

Unnamed: 0,YEAR,City_State,POPULATION
0,2021,"El Dorado Hills, California",46192.0
1,2021,"Eldridge, California",1208.0
2,2021,"El Granada, California",6695.0
3,2021,"Elizabeth Lake, California",1941.0
4,2021,"Elk Creek, California",239.0


## 6. Persist DataFrame in CSV for use with the combined DS Team dataset

Save in a csv file

In [None]:
# Persist the 2021 predictions so they can be reloaded without re-running the
# whole notebook; index=False leaves the row index out of the file.
df_2021.to_csv('pop_2021.csv', sep=',', index=False)

In [None]:
# Read the file written above back in and preview it.
df_refill_2021 = pd.read_csv('pop_2021.csv')
df_refill_2021.head()

save in a data dictionary

In [None]:
# https://intellipaat.com/community/20442/python-pandas-dataframe-columns-convert-to-dict-key-and-value

# Lookup table: City_State -> predicted 2021 population.
population_2021 = dict(zip(df_2021.City_State, df_2021.POPULATION))

In [None]:
user_call = "Newark, New Jersey"

In [None]:
population_2021[user_call]

In [None]:
# TODO add more years to the list

## 7. predictions function for multiple years from 2020 to 2025

In [None]:
import numpy as np

def future_pred(year, big_df, cities=None):
    """Predict the population of every city for one year.

    Parameters
    ----------
    year : int
        Year to predict.
    big_df : pandas.DataFrame
        Frame with ``YEAR``, ``City_State`` and ``POPULATION`` columns; passed
        to ``predict_pop_growth_gte2020`` for each city.
    cities : sequence of str, optional
        ``City_State`` keys to predict. Defaults to the unique ``City_State``
        values of ``big_df``, so the function no longer relies on a
        ``list_of_cities`` global having been created by an earlier cell.

    Returns
    -------
    pandas.DataFrame
        Columns ``YEAR``, ``City_State``, ``POPULATION``; one row per city.
    """
    if cities is None:
        cities = big_df['City_State'].unique()

    # One regression fit per city.
    predicted_pop = [
        predict_pop_growth_gte2020(city_state, year, big_df)
        for city_state in cities
    ]

    df = pd.DataFrame(cities, columns=['City_State'])
    df['YEAR'] = year
    df['POPULATION'] = np.array(predicted_pop)
    return df[['YEAR', 'City_State', 'POPULATION']]

In [None]:

# TODO similar for the future year 2020 to 2025

def predict_years_pop_df(big_df, years=None):
    """Build one prediction DataFrame per future year.

    Parameters
    ----------
    big_df : pandas.DataFrame
        Population data passed to ``future_pred`` for every year.
    years : iterable of int, optional
        Years to predict. Defaults to 2020 through 2025 inclusive, which is
        what the function predicted before the parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        The ``future_pred`` result for each year, in the order of ``years``.
    """
    if years is None:
        years = range(2020, 2026)

    return [future_pred(year, big_df) for year in years]

In [None]:
# Main

# Build the list of per-year prediction DataFrames (2020-2025);
# they are appended to big_df in a later cell.
predict_2020_2025 = predict_years_pop_df(big_df)


In [None]:
len(predict_2020_2025)

In [None]:
# Append every prediction frame to the historical data in a single concat.
big_df = pd.concat([big_df, *predict_2020_2025])

In [None]:
# big_df after add
big_df.shape

In [None]:
# Every per-year frame kept its own 0..N index through pd.concat, so the raw
# index repeats once per year. Renumber the rows first so id_num is unique.
big_df = big_df.reset_index(drop=True)
big_df['id_num'] = big_df.index
big_df[['id_num', 'YEAR', 'City_State','POPULATION']]

In [None]:
# Rough rows-per-year check: 16 = 10 downloaded years (2010-2019) + 6 predicted years (2020-2025).
big_df.shape[0]/16

In [None]:
big_df.head()

In [None]:
#### TODO remember to add index before csv
# big_df['id'] = big_df.index
# big_df.head()

In [None]:
# Final export of historical + predicted rows; index=False leaves the row index
# out of the file (the id_num column added above is still written).
big_df.to_csv('pop_2010_2025.csv', sep=',', index=False)