# Region Growth Model

In [4]:
import pandas as pd
import random
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from statsmodels.tsa.arima.model import ARIMA

In [3]:
US_population_data = pd.read_csv('United_States_Census_Data.csv')
US_fertility_data = pd.read_csv('United_States_Fertility_Rates.csv')
US_median_income_data = pd.read_csv('United_States_Median_Income_by_State.csv')
US_unemployment_data = pd.read_csv('United_States_Unemployment_Data.csv')

In [6]:
northeast_indices = [6, 18, 20, 28, 38, 44, 39, 31, 37]
midwest_indices = [13, 12, 21, 34, 48, 14, 26, 15, 33, 22, 40, 24]
south_indices = [7, 8, 9, 19, 32, 39, 45, 47, 0, 16, 23, 41, 3, 17, 35, 42]
west_indices = [2, 5, 11, 30, 25, 43, 27, 49, 4, 36, 46]

In [99]:
pop_years = ['1910 Northeast Total', '1920 Northeast Total', '1930 Northeast Total', '1940 Northeast Total', 
             '1950 Northeast Total', '1960 Northeast Total', '1970 Northeast Total', '1980 Northeast Total', 
             '1990 Northeast Total', '2000 Northeast Total', '2010 Northeast Total', '2020 Northeast Total']

def region_condenser_pop(idx_lst):
    regional_data = [0 for i in range(12)]

    for i in idx_lst:
        data = US_population_data.iloc[i].iloc[2:]  
    
        data_as_integers = data.str.replace(',', '').astype(int)

        storage = data_as_integers.tolist()
        storage.reverse()

        regional_data = [sum(i) for i in zip(regional_data, storage)]
    
    return regional_data

In [108]:
northeast_train_origin_df = pd.DataFrame([region_condenser_pop(northeast_indices)], columns=pop_years)

In [109]:
northeast_train_origin_df

Unnamed: 0,1910 Northeast Total,1920 Northeast Total,1930 Northeast Total,1940 Northeast Total,1950 Northeast Total,1960 Northeast Total,1970 Northeast Total,1980 Northeast Total,1990 Northeast Total,2000 Northeast Total,2010 Northeast Total,2020 Northeast Total
0,24846806,28189877,32124522,33716416,36759684,40993631,44463055,44892280,46565744,49192040,51150710,53438579


**Note:** We will be conducting the following train and test split. 

**Train:** 1910 Northeast Total - 1980 Northeast Total

**Test:** 1990 Northeast Total - 2020 Northeast Total

In [110]:
training_set = northeast_train_origin_df[['1910 Northeast Total', '1920 Northeast Total', '1930 Northeast Total', 
                                          '1940 Northeast Total', '1950 Northeast Total', '1960 Northeast Total', 
                                          '1970 Northeast Total', '1980 Northeast Total',]]
test_set = northeast_train_origin_df[['1990 Northeast Total', '2000 Northeast Total', '2010 Northeast Total', '2020 Northeast Total']]

In [111]:
data = training_set.iloc[0].values.tolist()
data_2 = test_set.iloc[0].values.tolist()

[24846806, 28189877, 32124522, 33716416, 36759684, 40993631, 44463055, 44892280]
[46565744, 49192040, 51150710, 53438579]


In [118]:
training_set.values

array([[24846806, 28189877, 32124522, 33716416, 36759684, 40993631,
        44463055, 44892280]])

In [119]:
# Meant to test how this particular model should be tuned when there is a known answer
training_values = training_set.values.flatten().tolist()

test_model = ARIMA(training_values, order=(1, 0, 0))
fitted_model = test_model.fit()


forecast = fitted_model.forecast(steps=len(data_2))

def calculate_mape(data_2, data):
    return (abs((data_2 - data) / data_2)) * 100

mape_scores = [calculate_mape(actual, predicted) for actual, predicted in zip(data_2, data)]

overall_mape = sum(mape_scores) / len(mape_scores)


for year, mape in zip([1990, 2000, 2010, 2020], mape_scores):
    print(f"MAPE for year {year}: {mape:.2f}%")

print(f"Overall MAPE: {overall_mape:.2f}%")

MAPE for year 1990: 46.64%
MAPE for year 2000: 42.69%
MAPE for year 2010: 37.20%
MAPE for year 2020: 36.91%
Overall MAPE: 40.86%


In [None]:
training_values = training_set.values.flatten().tolist()

deployed_model = ARIMA(training_values, order=(1, 0, 0))
fitted_model = test_model.fit()


forecast = fitted_model.forecast(steps=len(data_2))

def calculate_mape(data_2, data):
    return (abs((data_2 - data) / data_2)) * 100

mape_scores = [calculate_mape(actual, predicted) for actual, predicted in zip(data_2, data)]

overall_mape = sum(mape_scores) / len(mape_scores)


for year, mape in zip([1990, 2000, 2010, 2020], mape_scores):
    print(f"MAPE for year {year}: {mape:.2f}%")

print(f"Overall MAPE: {overall_mape:.2f}%")