# Region Growth Prediction Model

In [192]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

In [181]:
# Load the two per-state source tables from CSV files; the paths are relative,
# so they are resolved against the current working directory.
# The functions in this file read these tables by row position and take the
# values from the third column onward.
US_population_data = pd.read_csv('United_States_Census_Data.csv')
US_fertility_data = pd.read_csv('United_States_Fertility_Rates.csv')

In [172]:
# Row positions of the states that make up each region.
# NOTE(review): assumes the tables list the 50 states in alphabetical order
# (consistent with Alaska at row 1 and Hawaii at row 10, which this file
# treats as their own regions) -- confirm against the CSV files.
#
# The Northeast list previously contained 39, which is also listed under
# South, while 29 was assigned to no region at all.  Under alphabetical
# ordering 29 is New Jersey and 39 is South Carolina, so the Northeast entry
# is corrected to 29; every row 0-49 now belongs to exactly one region.
northeast_indices = [6, 18, 20, 28, 38, 44, 29, 31, 37]
midwest_indices = [13, 12, 21, 34, 48, 14, 26, 15, 33, 22, 40, 24]
south_indices = [7, 8, 9, 19, 32, 39, 45, 47, 0, 16, 23, 41, 3, 17, 35, 42]
west_indices = [2, 5, 11, 30, 25, 43, 27, 49, 4, 36, 46]

In [173]:
def region_condenser_pop(idx_lst):
    """Add up the population figures of several rows into one regional series.

    For every row position in ``idx_lst`` the values from the third column
    onward of ``US_population_data`` are treated as comma-grouped digit
    strings (e.g. ``'1,234'``), converted to integers, reversed, and added
    element-wise to a running total.

    Args:
        idx_lst: Iterable of integer row positions in ``US_population_data``.

    Returns:
        A list of 12 integer totals (fewer if a row supplies fewer than 12
        values, because ``zip`` stops at the shorter input).  An empty
        ``idx_lst`` yields twelve zeros.
    """
    totals = [0] * 12

    for row_position in idx_lst:
        raw_counts = US_population_data.iloc[row_position].iloc[2:]

        # Strip the thousands separators before converting to integers.
        counts = raw_counts.str.replace(',', '').astype(int).tolist()

        # Column order is flipped before summing -- presumably the CSV lists
        # the most recent census first; confirm against the file.
        flipped = counts[::-1]

        totals = [running + count for running, count in zip(totals, flipped)]

    return totals


def region_condenser_fertility(idx_lst):
    """Add up the fertility figures of several rows into one regional series.

    For every row position in ``idx_lst`` the values from the third column
    onward of ``US_fertility_data`` are added element-wise, in their
    original column order, to a running total.

    Args:
        idx_lst: Iterable of integer row positions in ``US_fertility_data``.

    Returns:
        A list of 8 totals (fewer if a row supplies fewer than 8 values,
        because ``zip`` stops at the shorter input).  An empty ``idx_lst``
        yields eight zeros.
    """
    totals = [0] * 8

    for row_position in idx_lst:
        rates = US_fertility_data.iloc[row_position].iloc[2:].tolist()
        totals = [running + rate for running, rate in zip(totals, rates)]

    return totals

# Full Model

In [174]:
# Labels for the 12 decennial population totals, oldest first:
# '1910 Total', '1920 Total', ..., '2020 Total'.
pop_years = [f'{year} Total' for year in range(1910, 2021, 10)]

# Labels for the 8 yearly fertility values: '2014' ... '2021'.
fertility_years = [str(year) for year in range(2014, 2022)]


# Row positions of the states that make up each region.
# NOTE(review): assumes the tables list the 50 states in alphabetical order
# (consistent with Alaska at row 1 and Hawaii at row 10, which this file
# treats as their own regions) -- confirm against the CSV files.
#
# The Northeast list previously contained 39, which is also listed under
# South, while 29 was assigned to no region at all.  Under alphabetical
# ordering 29 is New Jersey and 39 is South Carolina, so the Northeast entry
# is corrected to 29; every row 0-49 now belongs to exactly one region.
northeast_indices = [6, 18, 20, 28, 38, 44, 29, 31, 37]
midwest_indices = [13, 12, 21, 34, 48, 14, 26, 15, 33, 22, 40, 24]
south_indices = [7, 8, 9, 19, 32, 39, 45, 47, 0, 16, 23, 41, 3, 17, 35, 42]
west_indices = [2, 5, 11, 30, 25, 43, 27, 49, 4, 36, 46]

In [182]:
def _forecast_next_rounded(series, order):
    """Fit an ARIMA model of ``order`` to ``series`` and return its one-step-ahead forecast passed through ``round()``."""
    fitted_model = ARIMA(series, order=order).fit()
    return round(fitted_model.forecast(steps=1)[0])


def regional_growth_percentage(region, population_order=(0, 0, 0),
                               fertility_order=(0, 0, 0)):
    """Return the forecast fertility-to-population ratio of a region, in percent.

    The regional population and fertility series are built by
    ``region_condenser_pop`` and ``region_condenser_fertility``, an ARIMA
    model is fitted to each, and the one-step-ahead forecasts (each rounded
    to an integer) are divided and scaled by 100.

    Args:
        region: One of ``'Northeast'``, ``'Midwest'``, ``'South'``,
            ``'West'``, ``'Alaska'`` or ``'Hawaii'``.
        population_order: ARIMA ``(p, d, q)`` order for the population
            model.  The default ``(0, 0, 0)`` has no autoregressive,
            differencing or moving-average terms, so the forecast is the
            fitted constant level of the series.
        fertility_order: ARIMA ``(p, d, q)`` order for the fertility model;
            same default and caveat as ``population_order``.

    Returns:
        ``rounded fertility forecast / rounded population forecast * 100``,
        rounded to 4 decimal places.

    Raises:
        ValueError: If ``region`` is not one of the known region names.
            (Previously an unknown name surfaced as an opaque ``TypeError``
            when iterating over ``None``.)
    """
    region_indices = {
        'Northeast': northeast_indices,
        'Midwest': midwest_indices,
        'South': south_indices,
        'West': west_indices,
        'Alaska': [1],
        'Hawaii': [10]
    }

    chosen_indices = region_indices.get(region)
    if chosen_indices is None:
        valid_names = ', '.join(region_indices)
        raise ValueError(
            f"Unknown region {region!r}; expected one of: {valid_names}"
        )

    # The condenser helpers already return the summed regional series as
    # plain lists, so each is called exactly once.  Earlier revisions also
    # re-accumulated the same sums by hand (results unused) and re-ran the
    # helpers on every loop iteration.
    historical_population_data = region_condenser_pop(chosen_indices)
    historical_fertility_data = region_condenser_fertility(chosen_indices)

    rounded_population_forecast = _forecast_next_rounded(
        historical_population_data, population_order
    )
    # NOTE(review): the fertility forecast is rounded to a whole number and
    # the regional fertility series is a plain sum over states; confirm both
    # are intended for the units stored in the fertility table.
    rounded_forecast_fertility = _forecast_next_rounded(
        historical_fertility_data, fertility_order
    )

    # Ratio of the two forecasts, expressed as a percentage.
    growth_percentage = (rounded_forecast_fertility / rounded_population_forecast) * 100

    return round(growth_percentage, 4)
    