In [None]:
# Import libraries

import sys
import re
import requests
import json
import asyncio
import nest_asyncio
import aiohttp
import time
import warnings
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Suppress unnecessary Shapely warning
warnings.filterwarnings('ignore',
                        '.*Shapely GEOS version.*')

from aiohttp import ClientSession
from requests import request, Session
from itertools import product, repeat
import os
from dotenv import load_dotenv
from os import getenv
from threading import Thread
import time
import inspect
import pandas as pd
import geopandas as gp
import shapely
import pygeos
import contextily as cx
from functools import reduce
from pandas.plotting import lag_plot
import pickle
import numpy as np
import seaborn as sns
import datetime as dt
import copy
import math
from decimal import Decimal
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
import matplotlib.dates as mdates
import matplotlib.gridspec as gridspec
from matplotlib.gridspec import GridSpec
from matplotlib.offsetbox import AnchoredText
import matplotlib as mpl
import plotly.express as px
from sklearn.metrics import mean_absolute_error
from multiprocess import Process, Pool

from sklearn.linear_model import LinearRegression, Ridge

# Pandas display defaults for the whole notebook:
# render floats with six decimal places and never
# truncate the number of columns shown.
pd.set_option("display.float_format", '{:.6f}'.format)
pd.options.display.max_columns = None



### Define Script Variables

In [None]:
# Choose the first year of job data to analyze.
# From August onward (month > 7) only the current year
# is analyzed, from the beginning of the year to the present.
# Through July, the window starts at the beginning of
# last year instead.
# NOTE(review): the earlier wording described the cutoff both
# as "past June" and "after July"; the code tests for
# August or later, on the reasoning that June's data
# should be available by then.
now = datetime.today()
begin_job_year = now.year if now.month > 7 else now.year - 1

# Get the most recent year of census data
def get_end_year():
    """
    Return the most recent year of ACS data assumed to be published.

    The ACS typically releases the previous year's data on
    Dec 15th, so the answer depends on today's date:

    - Before Dec 15th of the current year, the previous year's
      data is assumed not to be out yet, so the result is the
      current year minus 2 (e.g. on August 8th, 2023 -> 2021).
    - From Dec 15th through the end of the year, the previous
      year's data is likely released, so the result is the
      current year minus 1.

    Returns
    -----------
        end_year (int): The census year to use as the end year.
    """
    today = datetime.today()

    # Expected release date within the current calendar year
    release_date = datetime(today.year, 12, 15)

    # Two years back before the release date, one year back after
    years_back = 2 if today < release_date else 1

    return today.year - years_back

# Get the census end year: the most recent ACS year that
# get_end_year() assumes is published as of today.
census_end_year = get_end_year()

# Census beginning year is end_year - 5, i.e. the analysis
# window starts five years before the end year.
census_begin_year = census_end_year - 5

In [None]:
### Read in all helper codes

# State FIPS codes, with the code column read as text
state_fips = pd.read_csv(
    "datasets/helper_datasets/state_FIPS_codes.csv",
    dtype={'state_code':str}
)

# MSA-to-state lookup: read the codes as text, keep only the
# columns needed downstream, and rename the state code column
# to 'state_code'.
msa_state_fips = (
    pd.read_csv(
        "datasets/helper_datasets/msa_and_state_codes.csv",
        dtype={'FIPS State Code':str, 'CBSA Code':str}
    )[['CBSA Code','CBSA Title','FIPS State Code']]
    .rename(columns={'FIPS State Code':'state_code'})
)

# MSA-to-city crosswalk, with both geo IDs read as text
msa_city_crosswalk = pd.read_csv(
    "datasets/helper_datasets/msa_to_city_crosswalk.csv",
    dtype={"msa_geoid":str, "city_geoid":str}
)
msa_city_crosswalk

### Define Helper Functions

In [None]:
# Define helper function to create directory
def create_folder(the_path):
    """
    Create the directory ``the_path`` if it does not already exist.

    Missing parent directories are created as well, and calling
    this on an existing directory is a no-op.

    Arguments
    -----------
        the_path (str): Path of the directory to create.

    Raises
    -----------
        FileExistsError: If ``the_path`` exists but is not a directory.
    """
    # makedirs with exist_ok avoids the check-then-create race of
    # isdir() + mkdir() and also handles nested paths in one call.
    os.makedirs(the_path, exist_ok=True)
        
# Turn into datetime format
def turn_df_into_datetime(
    dataframe,
    msa=False,
    city=False
):
    """
    Reshape a wide dataframe created by the API functions
    into a tidy (long) format with a datetime column.

    Every column other than the two identifier columns is
    stacked into rows; its label becomes the integer 'year'
    and its cell becomes 'value'. A 'date' column is then
    built from the year.

    Arguments
    -----------
        dataframe (DataFrame): Wide dataframe with the identifier
            columns plus columns whose labels can be cast to int.

        msa (True/False): If True, the identifier columns are
            'msa_name' and 'msa_code'. Takes precedence over city.

        city (True/False): If True, the identifier columns are
            'name' and 'geo_id'.

    Returns
    -----------
        tidy (DataFrame): Columns are the two identifier columns,
            'year', 'value' and 'date'. The input is not modified.

    Raises
    -----------
        ValueError: If neither msa nor city is True.
    """
    # Decide which columns identify a row
    if msa:
        id_columns = ['msa_name','msa_code']
    elif city:
        id_columns = ['name','geo_id']
    else:
        raise ValueError("Please define MSA or City in arguments.")

    # Index by the identifiers, stack the remaining columns into
    # rows, and name the stacked label/value columns
    stacked = dataframe.copy().set_index(id_columns).stack()
    tidy = (
        pd.DataFrame(stacked)
        .reset_index()
        .rename(columns={'level_2':'year', 0:'value'})
    )

    # Year as integer, plus a datetime built from it
    tidy['year'] = tidy['year'].astype(int)
    tidy['date'] = pd.to_datetime(tidy['year'], format='%Y')

    return tidy

In [None]:
### Helper function for Census Datasets
def prep_census_datasets(
    dataframe, 
    msa=False,
    city=False,
    msa_state_fips=msa_state_fips,
    msa_city_cross_walk=msa_city_crosswalk,
):
    """
    Prepare a census dataset so it has the same layout as
    the BLS job datasets, allowing later comparisons that
    use both Census and BLS data.

    The dataframe is first reshaped with turn_df_into_datetime().
    For MSA data, the state code is attached from msa_state_fips
    and NECTA labels are stripped from the MSA name. For city
    data, the MSA-to-city crosswalk is attached, cities with no
    crosswalk match are dropped, and 'city_name' is rebuilt as
    "<crosswalk city name>, <state>".

    Arguments
    -----------
        dataframe (DataFrame): Wide census dataframe.

        msa (True/False): Treat the data as MSA-level.

        city (True/False): Treat the data as city-level.

        msa_state_fips (DataFrame): Lookup with 'CBSA Title',
            'CBSA Code' and the state code column.

        msa_city_cross_walk (DataFrame): Crosswalk containing
            at least 'city_geoid' and 'city_name'.

    Returns
    -----------
        tidy (DataFrame): The prepared dataframe.

    Raises
    -----------
        ValueError: If neither msa nor city is True.
    """
    # Reshape a copy into the tidy datetime layout
    tidy = turn_df_into_datetime(
        dataframe.copy(), msa=msa, city=city)

    if msa:
        # Attach the state code by matching name and code
        tidy = tidy.merge(
            msa_state_fips,
            how='left',
            left_on=['msa_name','msa_code'],
            right_on=['CBSA Title','CBSA Code'],
        )

        # The lookup's key columns are no longer needed
        tidy = tidy.drop(columns=['CBSA Title','CBSA Code'])

        # Strip NECTA labels, the longer one first
        tidy['msa_name'] = tidy['msa_name'].apply(
            lambda x: x.replace(" NECTA Division","").replace(" NECTA",""))

        return tidy

    if city:
        # Attach the crosswalk by city geo ID
        tidy = tidy.merge(
            msa_city_cross_walk,
            how='left',
            left_on=['geo_id'],
            right_on=['city_geoid'],
        )

        # State is whatever follows the last ", " in the census name
        tidy['state'] = tidy['name'].apply(lambda x: str(x.split(", ")[-1]))

        # The census identifiers are replaced by the crosswalk's
        tidy = tidy.drop(columns=['name','geo_id'])

        # Keep only cities that matched a crosswalk row
        tidy = tidy[tidy['city_geoid'].notna()].reset_index(drop=True)

        # Rebuild the city name with its state
        tidy['city_name'] = tidy['city_name'] + ", " + tidy['state']

        return tidy

    raise ValueError("Please specify MSA or City in the arguments.")

In [None]:
### Define normalizing function
def normalize_column(
    series, mean_standardize=False, 
    min_max_standardized=False):
    """
    Normalizes a column's values.
    
    Arguments
    -----------
        series (Series): A pandas Series, which can
            simply be passed as a column of a
            DataFrame. It is not modified.

        mean_standardize (True/False): If True, subtract
            the mean and divide by the standard deviation.
            Takes precedence over min_max_standardized.

        min_max_standardized (True/False): If True, subtract
            the minimum and divide by (maximum - minimum).
            
    Returns
    -----------
        series (Series): A normalized Series, which can
            be set to a column in a DataFrame.

    Raises
    -----------
        ValueError: If neither method is selected.
    """
    # A method must be chosen explicitly
    if not (mean_standardize or min_max_standardized):
        raise ValueError("Please specify how to normalize.")

    # Work on a copy so the caller's Series is untouched
    values = series.copy()

    # Standardize around the mean
    if mean_standardize:
        return (values - values.mean())/values.std()

    # Otherwise rescale by the min-max range
    lowest = values.min()
    return (values - lowest)/(values.max()-lowest)

## Read in City Datasets

1. Population (B01003_001E)
2. Median Income
3. Median Unit Price
4. Median Rent
5. Total Units
6. Percent Renter Occupied
7. Total Employed (B23025_004E)
8. Rent-to-Price Ratio
9. People-per-Units

In [None]:
# Create list of demographics to iterate through.
# Each entry is the prefix of a "<demo>_city.csv" file in
# datasets/cleaned_census_api_files/city_data/ and becomes
# the key of that dataset in demo_dict.
# ('population' used to be listed twice, which read and
# prepped the same file two times for the same key.)
# NOTE(review): "Total Units" appears in the dataset list in the
# heading above but has no entry here -- confirm whether
# 'total_units' should be loaded as well.
demo_list = [
    'population',
    'median_income',
    'median_price',
    'median_rent',
    'people_per_unit',
    'percent_renter_occupied',
    'rent_price_ratio',
    'total_employed',
]

# Create dictionary to save dataframes to
demo_dict = {}

# Loop through demos and read in dataframes
for demo in demo_list:
    
    # Read in dataframe, with the geo ID read as text
    df = pd.read_csv(
        f"datasets/cleaned_census_api_files/city_data/{demo}_city.csv",
        dtype={'geo_id':str}
    )

    # Run prep function to get into correct format
    df = prep_census_datasets(
        df, city=True)

    # Add dataframe to dictionary
    demo_dict[demo] = df
    

#### Read in Job data at the MSA level

In [None]:
# Load the most recent BLS job data, with the codes read as text
jobs = pd.read_csv('datasets/bls/raw/most_recent_bls_data.csv',
                   dtype={'msa_code':str, 'state_code':str})

# Parse the date column into datetimes
jobs['date'] = pd.to_datetime(jobs['date'])

# Strip NECTA labels from the MSA names, the longer one first
jobs['msa_name'] = jobs['msa_name'].apply(
    lambda x: x.replace(" NECTA Division","").replace(" NECTA",""))

# Load the MSA-to-city crosswalk and keep only the ID columns
crosswalk = pd.read_csv(
    "datasets/helper_datasets/msa_to_city_crosswalk.csv",
    dtype={'msa_geoid':str, 'city_geoid':str}
)[['msa_geoid','city_geoid']]

# Rename the jobs MSA column to match the crosswalk, then
# attach the city geo IDs belonging to each MSA
jobs = (
    jobs
    .rename(columns={'msa_code':'msa_geoid'})
    .merge(crosswalk, how='left', on='msa_geoid')
)

jobs

### Define Linear Regression Helper

In [None]:
# Define helper function that runs linear regression
def run_lr(df, column):
    """
    Fit a straight line through a time series and return
    its slope and y-intercept.

    Arguments
    -----------
        df (DataFrame): A dataframe that contains the
            target column and an 'ordinal_date' column.
            The ordinal column is expected to be made in
            an earlier step from a "%Y-%m-%d" style
            time-series column.

            EXAMPLE...
            # Create ordinal column
            df['ordinal_date'] = df['date'].map(
                datetime.toordinal)

        column (str): The name of the target column.

    Returns
    -----------
        coef (float): The coefficient (slope) of the
            fitted linear equation, per unit of
            'ordinal_date'.

        intercept (float): The y-intercept of the
            fitted linear equation.

    """
    # Fit target ~ ordinal_date with ordinary least squares
    fitted = LinearRegression().fit(df[['ordinal_date']], df[column])

    # Single feature, so the first coefficient is the slope
    return fitted.coef_[0], fitted.intercept_

### Define Plotting Functions

In [None]:
# Loop through all cities, sort by coefficient, plot top 10
def plot_top_10_cities(
    ranked_cities,
    plotting_msa=False,
    plotting_city=False,
    plot_jobs=False,
    plot_rent=False,
    plot_income=False,
    plot_price=False,
    plot_units=False,
    plot_rent_to_price=False,
    plot_jobs_per_unit=False,
    begin_year_1=2013,
    plot_all=False,
):
    """
    Plots one demographic for the top-ranked cities and saves
    every figure to disk.

    For each 'msa_name' in the ranked dataframe, a two-panel
    figure is drawn: the demographic's value with its linear
    trend, and its period-over-period percent change with its
    linear trend, a zero line and the average percent change.
    All figures share the same y-axis limits. Each figure is
    saved to "graphs/msa_graphs/<msa_name>/<demographic> Growth.png".

    Arguments
    ----------
        ranked_cities (DataFrame): A dataframe of cities
            already ranked by various demographics. It must
            contain an 'msa_name' column. The dataframe
            returned by the "make_ranking()" function is the
            ideal dataframe to pass to this function.

        plotting_msa (True/False): If True, read the MSA-level
            census files. Takes precedence over plotting_city.

        plotting_city (True/False): If True, read the city-level
            census files.

        plot_jobs (True/False): If True, plot jobs. Only one
            demographic can be plotted at a time, so if you'd
            like to plot a different demographic, this must
            be set to False.

        plot_rent (True/False): If True, plot median rent.

        plot_income (True/False): If True, plot median income.

        plot_price (True/False): If True, plot median price.

        plot_units (True/False): If True, plot total units.

        plot_rent_to_price (True/False): If True, plot the
            rent-to-price ratio.

        plot_jobs_per_unit (True/False): If True, plot
            jobs-per-unit.

            If several plot_* flags are True, the first one in
            the order listed above is used.

        begin_year_1 (int): The year you'd like the
            analysis to start. Cities whose data does not
            start exactly at this year are skipped.

        plot_all (True/False): True if you want to plot every
            city in the dataframe (slow; figures are saved but
            not shown). False if you only want to plot the
            top 10 (fast; figures are also shown).

    Returns
    ----------
        None. The figures are saved (and optionally shown)
        as a side effect.

    Raises
    ----------
        ValueError: If neither plotting_msa nor plotting_city
            is True.

        Exception: If no plot_* flag is True.
    """
    # Make a copy of the ranked cities
    ranked = ranked_cities.copy()

    # Get folder name variable
    if plotting_msa:
        file_geo = 'msa'
    elif plotting_city:
        file_geo = 'city'
    else:
        raise ValueError("Please define MSA or City as arguments.")

    # If not plotting all cities (and just top 10),
    # keep only the top 10 cities in the dataframe
    if not plot_all:
        ranked = ranked.head(10)

    ### Call in the dataset we will be graphing from

    # Census demographics in priority order:
    # (flag, title used in the graphs, file name prefix)
    census_options = [
        (plot_rent, "Median Rent", "median_rent"),
        (plot_income, "Median Income", "median_income"),
        (plot_price, "Median Price", "median_price"),
        (plot_units, "Total Units", "total_units"),
        (plot_rent_to_price, "Rent-to-Price", "rent_price_ratio"),
        (plot_jobs_per_unit, "Jobs per Unit", "jobs_per_unit"),
    ]

    # If plotting job growth
    if plot_jobs:

        # Set demographic title for graphs
        demographic_1 = "Job"

        # Read in most recent job data
        dataframe_1 = pd.read_csv('datasets/bls/raw/most_recent_bls_data.csv',
                           dtype={'msa_code':str, 'state_code':str})

        # Make sure the date column is in datetime format
        dataframe_1['date'] = pd.to_datetime(dataframe_1['date'])

        # Replace NECTA Division
        dataframe_1['msa_name'] = dataframe_1['msa_name'].apply(lambda x: x.replace(" NECTA Division",""))
        dataframe_1['msa_name'] = dataframe_1['msa_name'].apply(lambda x: x.replace(" NECTA",""))

    # Otherwise plot the first census demographic that was requested
    else:
        for is_selected, title, file_prefix in census_options:
            if not is_selected:
                continue

            # Set demographic title for graphs
            demographic_1 = title

            # Read in data. Both possible ID columns are read as
            # text: MSA files carry 'msa_code' and city files carry
            # 'geo_id'; a dtype key for an absent column is ignored.
            dataframe_1 = pd.read_csv(
                f"datasets/cleaned_census_api_files/{file_geo}_data/{file_prefix}_{file_geo}.csv",
                dtype={'msa_code':str, 'geo_id':str})

            # Run prep function to get into correct format, using
            # this function's own geography flags
            dataframe_1 = prep_census_datasets(
                dataframe_1, msa=plotting_msa, city=plotting_city)
            break

        # No flag was set at all
        else:
            print("Please specify a demographic to plot by setting it to True, and leaving the others set to False.")
            raise Exception("SPECIFY A DEMOGRAPHIC TO PLOT.")

    # Make copy
    main_df = dataframe_1.copy()

    # Create dictionary to store filtered dataframes
    filtered_dict = {}

    # Set y_lim list to find max and min. The minimum lists start
    # at 0 so the lower axis limit is never above zero.
    y_lim_list_trend = []
    y_min_list_trend = [0]
    y_lim_list_pct = []
    y_min_list_pct = [0]

    # Loop through all cities in the ranked dataframe.
    # NOTE(review): rows are matched on 'msa_name' even when
    # plotting cities, so several cities of one MSA may end up
    # in the same series -- confirm this is intended.
    for geo_name in ranked['msa_name'].dropna().unique():

        # Isolate just that city
        df = main_df[main_df['msa_name']==geo_name].copy()

        # Sort by date
        df = df.sort_values('date')

        # Create difference column
        df['value_change'] = df['value'].diff()

        # Create pct_change column
        df['percent_change'] = df['value'].pct_change()

        # Filter by beginning year
        df = df[df['year']>=begin_year_1].reset_index(drop=True)

        # If an MSA's most recent year is after the beginning
        # year, remove it from the graphs. For example, if we want to
        # view the growth of all cities since 2016, but Prescott Valley
        # only has data starting at 2019, this may skew the data.
        # A city with no rows at all in the window is skipped too.
        if df.empty or df['year'].iloc[0] != begin_year_1:
            print(f"Dropping {geo_name}, it's dataframe has a smaller window.")
            continue

        # Remove NaN values
        df = df[df['percent_change'].notna()]

        # Isolate date and value columns
        df = df[[
            'date', 'value', 'value_change',
            'percent_change']].reset_index(drop=True)

        # Nothing left to fit a trend on
        if df.empty:
            print(f"Dropping {geo_name}, it has no percent-change values to plot.")
            continue

        # Add this dataframe's y_lim to list
        y_lim_list_trend.append(df['value'].max())
        y_min_list_trend.append(df['value'].min())
        y_lim_list_pct.append(df['percent_change'].max())
        y_min_list_pct.append(df['percent_change'].min())

        # Get the datetime one month after the last observation;
        # it becomes an extra row so the trend lines extend
        # slightly past the data.
        next_date = df['date'].iloc[-1] + relativedelta(months=1)

        # Create ordinal column
        df['ordinal_date'] = df['date'].map(datetime.toordinal)

        # Run linear regression and get the trend's coefficient
        coef_value, intercept_value = run_lr(df, column='value')
        coef_pct, intercept_pct = run_lr(df, column='percent_change')

        # Append the extra row (values unknown, only the dates set)
        df.loc[len(df.index)] = [
            next_date, np.nan, np.nan,
            np.nan, datetime.toordinal(next_date)]

        # Create averages columns
        df['average_pct'] = df['percent_change'].mean()
        df['average_value'] = df['value_change'].mean()

        # Fill in with linear regression values
        df['value_trend'] = df['ordinal_date']*coef_value + intercept_value
        df['percent_change_trend'] = df['ordinal_date']*coef_pct + intercept_pct

        # Also add highest trend value to lim_list
        y_lim_list_trend.append(df['value_trend'].max())
        y_lim_list_pct.append(df['percent_change_trend'].max())

        # Save filtered data to dictionary
        filtered_dict[geo_name] = df

    # Every city was dropped, so there is nothing to draw
    if not filtered_dict:
        print("No cities left to plot.")
        return

    # Get the shared y limits (10% headroom above the maximum)
    y_lim_trend = max(y_lim_list_trend) * 1.1
    y_min_trend = min(y_min_list_trend)
    y_lim_pct = max(y_lim_list_pct) * 1.1
    y_min_pct = min(y_min_list_pct)

    # Shared y tick positions and labels
    y_tick_list_trend = [
        y_lim_trend*0.25, y_lim_trend*0.5,
        y_lim_trend*0.75, y_lim_trend]
    y_tick_list_pct = [
        y_min_pct, y_min_pct*0.5, 0,
        y_lim_pct*0.5, y_lim_pct]
    y_tick_labels_trend = [
        '{:,}'.format(round(float(x), 3)) for x in y_tick_list_trend]
    y_tick_labels_pct = [
        '{:,}'.format(round(float(x), 3)) for x in y_tick_list_pct]

    # Set grid style before any axes are created. Newer matplotlib
    # releases renamed the bundled seaborn styles, so use whichever
    # name this installation provides.
    if 'seaborn-v0_8-whitegrid' in plt.style.available:
        plt.style.use('seaborn-v0_8-whitegrid')
    else:
        plt.style.use('seaborn-whitegrid')

    # Create folders to save graphs into
    create_folder("graphs")
    create_folder("graphs/msa_graphs")

    # Loop through each city that survived the filtering,
    # in ranked order
    for city_name, df in filtered_dict.items():

        # Make a grid to plot 2 graphs on
        fig = plt.figure(figsize=(12,3), dpi=300)
        gs = GridSpec(nrows=1, ncols=2)
        ax1 = fig.add_subplot(gs[0,0])
        ax2 = fig.add_subplot(gs[0,1])

        # Set title
        fig.suptitle(f"{city_name}\n\n\n",
             fontweight="bold")

        # Plot first graph
        df.plot(x='date',y='value', ax=ax1)
        df.plot(x='date',y='value_trend', ax=ax1, linestyle="--")

        # Plot second graph
        df.plot(x='date',y='percent_change', ax=ax2)
        df.plot(x='date',y='percent_change_trend', ax=ax2, linestyle="--")

        # Second graph's zero line
        df['zero'] = 0
        df.plot(x='date', y='zero', ax=ax2, color="grey")

        # Also plot the average line
        df.plot(x='date', y='average_pct',
                ax=ax2, color="black", linestyle="-")

        # Set titles for both graphs
        ax1.set_title(f"{demographic_1} Growth")
        ax2.set_title(f"Percent Change in {demographic_1}")

        # Set y lims
        ax1.set_ylim([y_min_trend, y_lim_trend])
        ax2.set_ylim([y_min_pct, y_lim_pct])

        # Set y_ticks
        ax1.yaxis.set_major_locator(
            mticker.FixedLocator(y_tick_list_trend))
        ax2.yaxis.set_major_locator(
            mticker.FixedLocator(y_tick_list_pct))

        # Set y-tick labels
        ax1.set_yticklabels(y_tick_labels_trend)
        ax2.set_yticklabels(y_tick_labels_pct)

        # Give suptitle more room
        fig.subplots_adjust(top=0.85)

        # Create this city's folder
        create_folder(f"graphs/msa_graphs/{city_name}")

        # Create filepath to save graph to
        save_filepath = f"graphs/msa_graphs/{city_name}/{demographic_1} Growth.png"

        # Save the graphs
        fig.savefig(save_filepath)

        # Show plot
        if not plot_all:
            plt.show()

        # Clear plot
        plt.close("all")




### Define Ranking Function

In [None]:
### MAKE FUNCTION THAT MAKES A TOTAL RANK 
### BASED ON MULTIPLE DEMOGRAPHICS

def make_city_trend_dataframes(
    df_dict,
    rank_msa=False,
    rank_city=False,
):
    """
    This function ranks the invest-ability of every
    city based on the demographics passed. It
    analyzes the total average growth per year, as 
    well as the relative average growth per year
    (measured as the average percent growth per year).
    
    Arguments
    -----------
        df_dict (dict): A dictionary to be used if you
            want to combine multiple dataframes for analysis
            and plotting. If using this, the key should be
            a string with the demographic name, and the value
            should be a list containing the dataframe in position
            0, and the beginning year in position 1. See below
            for two examples...
            
            Example 1, One Extra Dataframe
            {"Median Rent": [median_rent_df, 2013]}
            
            Example 2, Multiple Extra Dataframes
            {"Median Rent": [median_rent_df, 2013],
            "Population" : [population_df, 2013]}
            
        max_price (int): If you only want to measure and
            compare MSAs up to a certain median price,
            enter the max median price as an integer.
            
        min_rent_to_price (float): If you only want to measure and
            compare MSAs up to a certain rent-to-price ratio
            (based on median rent and median price values),
            enter the minimum rent-to-price ratio as a float.
            
        use_total_trend (True/False): Set to True if you'd like
            to include the total trend weights in the ranking
            of MSAs. Use False if not.
        
        use_average_percent (True/False): Set to True if you'd like
            to include the average percent weights in the ranking
            of MSAs. Use False if not.
        
        total_trend_weight_dict (dict): A dictionary to set the
            weights of each demo. For example, if you'd like to
            multiply the "Median Rent" weights by 2, giving a bigger
            weight to the "Median Rent" demographic, all you need
            is to make the key "Median Rent" set to a value of 2.
            This dictionary is specifically for total trend weights.
            See the example below.
            
            EXAMPLE...
            total_trend_weight_dict={
                "Jobs":1,
                "Median Rent":1}
        
        average_percent_weight_dict (dict): A dictionary to set the
            weights of each demo. For example, if you'd like to
            multiply the "Median Rent" weights by 2, giving a bigger
            weight to the "Median Rent" demographic, all you need
            is to make the key "Median Rent" set to a value of 2.
            This dictionary is specifically for average percent weights.
            See the example below.
            
            EXAMPLE...
            average_percent_weight_dict={
                "Jobs":3,
                "Median Rent":3}
                
        plot_graphs (True/False): If True, ask for user inputs
            and run the plot_top_10_cities() function.
            
    Returns
    -----------
        final_df (DataFrame): A dataframe with each city
            sorted by total rank.
    """
    # Make a list to add each dataframe to
    df_list = []
    
    # Define geo_name
    if rank_msa:
        geo_name = 'msa_name'
        geo_file = 'msa'
        geo_id = 'msa_geoid'
    elif rank_city:
        geo_name = 'city_name'
        geo_file = 'city'
        geo_id = 'city_geoid'
    else:
        raise ValueError("Please define MSA or City in the arguments.")
    
    # Rename all columns by appending the demo name,
    # except for the MSA name and date, which we will 
    # use as the key to merge all dataframes.
    for demo in df_dict:
        
        if demo != "Jobs":
        
            # Get dataframe
            df = df_dict[demo][0].copy()

            # Rename select columns
            df.rename(columns={'value':f'value_{demo}',
                              'year':f'year_{demo}'}, inplace=True)

            # Drop state
            if 'state' in df.columns:
                df.drop(columns=['state'], inplace=True)
                
            # Add dataframe to list
            df_list.append(df)

                
    # Define list of columns to merge on
    merge_list = ['msa_name','date','msa_geoid','city_geoid','city_name']
                
    # Merge all dataframes
    merged_df = reduce(lambda left, right: 
                       pd.merge(left, right, 
                                left_on=merge_list, 
                                right_on=merge_list,
                                suffixes=(None, "_y"),
                                how="outer"), df_list)
        
    # Now merge Jobs into the dataframe
    if "Jobs" in df_dict:
        df = df_dict["Jobs"][0].copy()
        
        # Rename select columns
        df.rename(columns={'value':f'value_Jobs',
                          'year':f'year_Jobs'}, inplace=True)
        
        # Clean Jobs
        df = df[['msa_name','date','msa_geoid','city_geoid',
                 'year_Jobs','value_Jobs']]
        
        # Make another merged list
        merge_list = ['msa_name','date','msa_geoid','city_geoid']
        
        # Merge jobs dataframe
        merged_df = merged_df.merge(df, 
                                    how='outer',
                                    left_on=merge_list,
                                    right_on=merge_list
                                   )
        
        
    
    
#     display(merged_df[
#         (merged_df['city_geoid']=='0100820')
#         & (merged_df['date']>'2020-12-01')
#     ])
    
        
    # Loop through columns and clean out the rest
    for col in merged_df.columns:
        if ('month_' in col) | ('series_id_' in col):
            merged_df.drop(columns=[col], inplace=True)
    
    # Create new df to store coefficients
    coef_df = pd.DataFrame(
        data=None, columns=['city_name', 'city_geoid','msa_name','msa_geoid'])
    
    # Add columns for every demo
    for demo in df_dict:
        coef_df[f'trend_coef_{demo}'] = None
        coef_df[f'average_value_{demo}'] = None
        coef_df[f'average_pct_{demo}'] = None
        
    # Make temporary coef_df to use later
    temp_coef_1 = coef_df.copy()
        
    # Create ordinal column
    merged_df['ordinal_date'] = merged_df['date'].map(datetime.toordinal)
    
    # Loop through all cities
    for city_geoid in merged_df[geo_id].dropna().unique():
        
        # Isolate just that city
        df = merged_df[merged_df[geo_id]==city_geoid].copy()
        
        # Get geo_id
        the_geoid = df['city_geoid'].iloc[0]
        
        # Get city name
        city = df['city_name'].iloc[0]
        
        # Get MSA geoid
        msa_geoid = df['msa_geoid'].iloc[0]
        
        # Get MSA name
        msa_name = df['msa_name'].iloc[0]
        
        
        # If there are population numbers less than 10,000, just
        # don't include the city
        if df['value_Population'].min() < 10_000:
            continue
        
        # Sort by date
        df = df.sort_values('date')
        
        # Make duplicate
        temp_coef_2 = temp_coef_1.copy()
        
        # Set temp coef_df
        temp_coef_2.loc[len(temp_coef_2.index)] = np.nan
        temp_coef_2['city_name'] = city
        temp_coef_2['city_geoid'] = the_geoid
        temp_coef_2['msa_name'] = msa_name
        temp_coef_2['msa_geoid'] = msa_geoid
        
        
        # Loop through each demo
        for demo in df_dict:
                        
            # Test to see if there's data for the demo
            if df[df[f'year_{demo}'].notna()].shape[0] > 0:
                                
                # Make copy
                df_temp = df.copy()
                
                # Get beginning year
                begin_year = df_dict[demo][1]
                                    
                # Filter by beginning year minus 1
                df_temp = df[df[f'year_{demo}']>=begin_year-1].reset_index(drop=True)
                        
                # Create difference column
                df_temp[f'value_change_{demo}'] = df_temp[f'value_{demo}'].diff()

                # Create pct_change column
                df_temp[f'percent_change_{demo}'] = df_temp[f'value_{demo}'].pct_change()
                                
                # Filter by beginning year
                df_temp = df_temp[
                    df_temp[f'year_{demo}']>=begin_year].reset_index(drop=True)
                
                # If an MSA's most recent year is after the beginning
                # year, remove it from the graphs. For example, if we want to
                # view the growth of all cities since 2016, but Prescott Valley
                # only has data starting at 2019, this may skew the data.
                if df_temp[f'year_{demo}'].iloc[0] != begin_year:
                    continue

                # Remove NaN values
                df_temp = df_temp[df_temp[f'percent_change_{demo}'].notna()]
                
                # If there are no non-NaN percent-change values,
                # remove it from the graphs. This can happen due to
                # every value being 0 for every year.
                if df_temp.shape[0] == 0:
                    continue
                    

                                    
                # Run linear regression
                coef_value, intercept_value = run_lr(df_temp, column=f'value_{demo}')
                coef_pct, intercept_pct = run_lr(df_temp, column=f'percent_change_{demo}')

                # Create trend columns
                df_temp[f'value_trend_{demo}'] = df_temp['ordinal_date']*coef_value + intercept_value
                df_temp[f'percent_change_trend_{demo}'] = df_temp['ordinal_date']*coef_pct + intercept_pct

                # Create averages column
                the_average_pct = df_temp[f'percent_change_{demo}'].mean()
                df_temp[f'average_pct_{demo}'] = the_average_pct
                the_average_value = df_temp[f'value_change_{demo}'].mean()
                df_temp[f'average_value_{demo}'] = the_average_value
                
                # Update temp coef
                temp_coef_2[f'trend_coef_{demo}'] = coef_value
                temp_coef_2[f'average_value_{demo}'] = the_average_value
                temp_coef_2[f'average_pct_{demo}'] = the_average_pct
            
        # Append temp coef to dataframe
        coef_df = pd.concat([coef_df, temp_coef_2])
            
    # Drop duplicates
    coef_df = coef_df.drop_duplicates().reset_index(drop=True)
    
    # Make folder to save coef_df to
    trend_folder = "datasets/cleaned_census_api_files/city_data/city_trend_datasets/"
    create_folder(trend_folder)
    
    # Create variable string for file naming
    var_string = "all_city_demographic_trends.csv"
        
    # Save coef_df
    coef_df.to_csv(f"{trend_folder}/{var_string}", index=False)
    
    return



In [None]:
# Build the trend dataframe for every demographic. Jobs use their own
# beginning year; every Census demographic shares census_begin_year.
make_city_trend_dataframes(
    df_dict={
        "Jobs": [jobs, begin_job_year],
        **{
            label: [demo_dict[key], census_begin_year]
            for label, key in (
                ("Population", 'population'),
                ("Median Income", 'median_income'),
                ("Median Price", 'median_price'),
                ("Median Rent", 'median_rent'),
                ("People per Unit", 'people_per_unit'),
                ("Percent Renter Occupied", 'percent_renter_occupied'),
                ("Rent Price Ratio", 'rent_price_ratio'),
                ("Total Employed", 'total_employed'),
            )
        },
    },
    rank_city=True,
)


In [None]:
def make_ranking_part_2(
    df_dict,
    rank_msa=False,
    rank_city=False,
    list_of_msas=False,
    max_price=False,
    min_rent_to_price=False,
    use_total_trend=True,
    use_average_percent=True,
    total_trend_weight_dict={},
    average_percent_weight_dict={},
):
    """
    Ranks cities by weighted demographic growth, attaches their most
    recent demographic values, and returns them sorted by
    rent-to-price ratio and percent renter occupied.

    The growth coefficients are read from the saved file
    "all_city_demographic_trends.csv". For every demographic in
    df_dict, the normalized trend coefficient and normalized average
    percent change are multiplied by their weights and summed into a
    total weight; a higher total weight is a better growth ranking.

    Arguments
    ----------
        df_dict (dict): Keys are the demographic labels (for example
            "Population") used to find the "trend_coef_{label}" and
            "average_pct_{label}" columns in the trend file. Only
            the keys are read by this function.

        rank_msa (True/False): If True, group by 'msa_name' and read
            the "msa" Census files.
            NOTE(review): the trend file path and the 'city_geoid'
            merge key below are city-specific, so confirm MSA mode
            is actually supported before relying on it.

        rank_city (True/False): If True (and rank_msa is False),
            group by 'city_name' and read the "city" Census files.

        list_of_msas (list of str, or False): If given, only cities
            whose 'msa_name' is in this list are ranked, and the
            final ranking is also saved as a CSV.

        max_price (number, or False): If given, drop cities whose
            most recent median price is above this value.

        min_rent_to_price (number, or False): If given, drop cities
            whose most recent rent-price ratio is below this value.

        use_total_trend (True/False): If False, the trend
            coefficient gets a weight of 0 for every demographic.

        use_average_percent (True/False): If False, the average
            percent change gets a weight of 0 for every demographic.

        total_trend_weight_dict (dict): Optional weight per
            demographic label for the trend coefficient. Labels
            that are missing get a weight of 1. Never modified.

        average_percent_weight_dict (dict): Optional weight per
            demographic label for the average percent change. Labels
            that are missing get a weight of 1. Never modified.

    Returns
    ----------
        final_df (DataFrame): One row per ranked city with the
            normalized columns, weights, most recent demographic
            values, 'jobs_in_msa', and 'growth_ranking', sorted by
            'rent_price_ratio' and then 'percent_renter_occupied',
            both from highest to lowest.

    Raises
    ----------
        ValueError: If neither rank_msa nor rank_city is True, if
            list_of_msas matches no rows in the trend file, or if
            no rows are left after removing rows with missing
            trend coefficients.
    """

    ### STEP 1: RANK

    # Define geo_name
    if rank_msa:
        geo_name = 'msa_name'
        geo_file = 'msa'
    elif rank_city:
        geo_name = 'city_name'
        geo_file = 'city'
    else:
        raise ValueError("Please define MSA or City in the arguments.")

    # Read in the coefficient dataframe
    coef_df = pd.read_csv(
        "datasets/cleaned_census_api_files/city_data/city_trend_datasets/all_city_demographic_trends.csv",
        dtype={'city_geoid':str})

    # Filter coef_df by the list of MSAs we want to analyze
    if list_of_msas:
        coef_df = coef_df[
            coef_df['msa_name'].isin(list_of_msas)].reset_index(drop=True)

        # Fail loudly if nothing matched (for example, a misspelled
        # MSA name) instead of silently ranking and saving an
        # empty dataframe.
        if coef_df.empty:
            raise ValueError(
                f"No cities found for these MSAs: {list_of_msas}. "
                "Please double check the MSA names.")

    # Collect every geography that is missing a trend coefficient
    # for at least one of the demographics we are ranking by
    bad_msa = set()
    for demo in df_dict:
        missing_mask = coef_df[f'trend_coef_{demo}'].isnull()
        bad_msa.update(coef_df.loc[missing_mask, geo_name].unique())

    # Remove those geographies and print a helpful message
    if len(bad_msa) > 0:

        coef_df = coef_df[~coef_df[geo_name].isin(bad_msa)].reset_index(drop=True)

        print(f"Removing these MSAs not fit for analysis: {bad_msa}")

        # Check if the dataframe is now empty
        if coef_df.shape[0] == 0:
            raise ValueError("Dataframe is now empty due to filtering. Please double check.")

    # Create the weight for each demographic. Both the trend
    # coefficient and the average percent change are normalized,
    # multiplied by their weights, and added together. The higher
    # the weight, the better.
    for demo in df_dict:

        # Normalize total trend column
        coef_df[f'normalized_trend_coef_{demo}'] = normalize_column(
            coef_df[f'trend_coef_{demo}'], min_max_standardized=True)

        # Normalize avg pct column
        coef_df[f'normalized_average_pct_{demo}'] = normalize_column(
            coef_df[f'average_pct_{demo}'], min_max_standardized=True)

        # Demographics without an explicit weight default to 1
        trend_weight = total_trend_weight_dict.get(demo, 1)
        pct_weight = average_percent_weight_dict.get(demo, 1)

        # A component we are not using gets a weight of 0, that way
        # it adds nothing to the demographic's weight
        if not use_total_trend:
            trend_weight = 0
        if not use_average_percent:
            pct_weight = 0

        # Create weights
        coef_df[f'{demo}_weight'] = (
            (coef_df[f'normalized_trend_coef_{demo}'] * trend_weight)
            + (coef_df[f'normalized_average_pct_{demo}'] * pct_weight)
        )

    # Make final total weight column by adding up all demo weights
    coef_df['total_weight'] = 0
    for demo in df_dict:
        coef_df['total_weight'] += coef_df[f'{demo}_weight']

    # Sort by total weight, highest to lowest
    final_df = coef_df.sort_values(
        'total_weight', ascending=False).reset_index(drop=True)

    # Merge the most recent value of each Census demographic to final df
    census_demos = (
        'population',
        'median_income',
        'median_price',
        'median_rent',
        'people_per_unit',
        'percent_renter_occupied',
        'rent_price_ratio',
        'total_employed',
    )
    for demo in census_demos:

        # Call in demographic dataset
        demo_df = pd.read_csv(
            f"datasets/cleaned_census_api_files/{geo_file}_data/{demo}_{geo_file}.csv",
            dtype={'msa_code':str, 'geo_id':str}
        )

        # Run prep function to get into correct format
        demo_df = prep_census_datasets(demo_df, msa=rank_msa, city=rank_city)

        # Keep only the most recent year, and name the value
        # column after the demographic
        recent_year = demo_df['year'].max()
        recent_year_df = (
            demo_df.loc[demo_df['year'] == recent_year, ['city_geoid', 'value']]
            .rename(columns={'value': demo})
        )

        # Merge to final_df
        final_df = final_df.merge(recent_year_df, how='left', on='city_geoid')

    # Merge jobs
    jobs_df = pd.read_csv('datasets/bls/raw/most_recent_bls_data.csv',
                          dtype={'msa_code':str, 'state_code':str})

    # Make sure the date column is in datetime format
    jobs_df['date'] = pd.to_datetime(jobs_df['date'])

    # Replace NECTA Division first, then any remaining NECTA
    jobs_df['msa_name'] = jobs_df['msa_name'].apply(
        lambda x: x.replace(" NECTA Division","").replace(" NECTA",""))

    # Filter jobs so it's the most recent date
    recent_date = jobs_df['date'].max()
    new_jobs = jobs_df[jobs_df['date']==recent_date].copy().reset_index(drop=True)

    # Call in the msa-to-city crosswalk file, only keeping IDs
    crosswalk = pd.read_csv("datasets/helper_datasets/msa_to_city_crosswalk.csv",
                            dtype={'msa_geoid':str, 'city_geoid':str})
    crosswalk = crosswalk[['msa_geoid','city_geoid']]

    # Attach each MSA's job value to the cities inside that MSA
    new_jobs = (
        new_jobs
        .rename(columns={'msa_code':'msa_geoid'})
        .merge(crosswalk, how='left', on='msa_geoid')
        [['city_geoid','value']]
        .rename(columns={'value':'jobs_in_msa'})
    )

    # Merge to final_df
    final_df = final_df.merge(new_jobs, how='left', on='city_geoid')

    # If max price, filter it
    if max_price:
        final_df = final_df[final_df['median_price']<=max_price].reset_index(drop=True)

    # If min rent-price ratio, filter
    if min_rent_to_price:
        final_df = final_df[final_df['rent_price_ratio']>=min_rent_to_price].reset_index(drop=True)

    ### STEP 2: FILTER

    # Capture the growth ranking. The rows are still in total-weight
    # order here, so this is each city's position among the cities
    # left after the optional price and ratio filters above.
    final_df['growth_ranking'] = final_df.index + 1

    # Sort final_df by rent-price-ratio and then percent-renter-occupied
    final_df = final_df.sort_values(
        ['rent_price_ratio','percent_renter_occupied'],
        ascending=[False, False]
    ).reset_index(drop=True)

    # If list of MSAs, save ranking so we can
    # analyze in QGIS later
    if list_of_msas:

        # Create folder
        folder_save = "datasets/cleaned_census_api_files/city_data/city_rankings"
        create_folder(folder_save)

        # Get variable name for folder and file
        var_string = "_".join(list_of_msas)
        var_string += "_rankings.csv"

        # Save file
        final_df.to_csv(f"{folder_save}/{var_string}", index=False)

    return final_df

### Begin Ranking Cities

In [None]:
# Keyword arguments shared by every "standard" city ranking call below
standard_rank_kwargs = dict(
    df_dict={
        label: [demo_dict[key], census_begin_year]
        for label, key in (
            ("Population", 'population'),
            ("Median Income", 'median_income'),
            ("Median Price", 'median_price'),
            ("Median Rent", 'median_rent'),
        )
    },
    rank_city=True,
    use_total_trend=True,
    use_average_percent=True,
    # Weight applied to each demographic's normalized trend coefficient
    total_trend_weight_dict={
        "Population": 0.75, "Median Rent": 0.5,
        "Median Income": 0.25, "Median Price": 0,
    },
    # Weight applied to each demographic's normalized average percent change
    average_percent_weight_dict={
        "Population": 4, "Median Rent": 3,
        "Median Income": 2, "Median Price": 1,
    },
)


In [None]:
# Example: rank one MSA with every argument spelled out
# explicitly instead of unpacking standard_rank_kwargs
Greenville_MSA = make_ranking_part_2(
    df_dict={
        label: [demo_dict[key], census_begin_year]
        for label, key in (
            ("Population", 'population'),
            ("Median Income", 'median_income'),
            ("Median Price", 'median_price'),
            ("Median Rent", 'median_rent'),
        )
    },
    rank_city=True,
    list_of_msas=['Greenville-Anderson, SC'],
    use_total_trend=True,
    use_average_percent=True,
    total_trend_weight_dict={
        "Population": 0.75, "Median Rent": 0.5,
        "Median Income": 0.25, "Median Price": 0,
    },
    average_percent_weight_dict={
        "Population": 4, "Median Rent": 3,
        "Median Income": 2, "Median Price": 1,
    },
)
print(Greenville_MSA.shape)
Greenville_MSA.head()


In [None]:
## Example: the same ranking, driven by the standard ranking keyword arguments
greenville_test = make_ranking_part_2(
    **standard_rank_kwargs,
    list_of_msas=['Greenville-Anderson, SC'],
)
greenville_test.head()

In [None]:
### Make rankings for top MSAs
def _rank_single_msa(msa_name):
    """Rank the cities of one MSA using the standard ranking arguments."""
    return make_ranking_part_2(
        list_of_msas=[msa_name],
        **standard_rank_kwargs
    )


Dallas_MSA = _rank_single_msa('Dallas-Fort Worth-Arlington, TX')
Charleston_MSA = _rank_single_msa('Charleston-North Charleston, SC')
Charlotte_MSA = _rank_single_msa('Charlotte-Concord-Gastonia, NC-SC')
Athens_MSA = _rank_single_msa('Athens-Clarke County, GA')
Nashville_MSA = _rank_single_msa('Nashville-Davidson--Murfreesboro--Franklin, TN')
Atlanta_MSA = _rank_single_msa('Atlanta-Sandy Springs-Alpharetta, GA')
Austin_MSA = _rank_single_msa('Austin-Round Rock-Georgetown, TX')
Raleigh_MSA = _rank_single_msa('Raleigh-Cary, NC')
Orlando_MSA = _rank_single_msa('Orlando-Kissimmee-Sanford, FL')
Tampa_MSA = _rank_single_msa('Tampa-St. Petersburg-Clearwater, FL')
Knoxville_MSA = _rank_single_msa('Knoxville, TN')
Sherman_MSA = _rank_single_msa('Sherman-Denison, TX')
Birmingham_MSA = _rank_single_msa('Birmingham-Hoover, AL')
Los_Angeles_MSA = _rank_single_msa('Los Angeles-Long Beach-Anaheim, CA')
Phoenix_MSA = _rank_single_msa('Phoenix-Mesa-Chandler, AZ')


### Define Plots


In [None]:
# Plot one demographic's value and percent-change trends for every
# city in a ranked dataframe
def plot_msa_cities(
    cities_in_MSA,
    just_city=True,
    plot_population=False,
    plot_rent=False,
    plot_income=False,
    plot_price=False,
    plot_units=False,
    plot_rent_to_price=False,
    plot_people_per_unit=False,
    plot_percent_renter_occupied=False,
    plot_total_employed=False,
    begin_year_1=2013,
    plot_all=False,
):
    """
    Plots the growth of one demographic for each city in a
    dataframe of cities.

    For every city, a figure with two graphs is saved to
    "graphs/city_graphs_by_MSA/{msa_name}/{city_name}/": the value
    with its linear trend line, and the percent change between
    consecutive dates with its linear trend line, a zero line, and
    the average percent change. Every figure shares the same
    y-limits so the cities can be compared to each other.

    Cities that cannot be plotted (no data for the demographic, no
    data starting at begin_year_1, or no valid percent-change
    values) are skipped with a printed message.

    Arguments
    ----------
        cities_in_MSA (DataFrame): A dataframe with a 'city_name'
            column listing the cities to plot. The dataframe
            returned by the "make_ranking_part_2()" function is
            what the rest of this notebook passes in.

        just_city (True/False): Passed to "prep_census_datasets()"
            as its "city" argument.

        plot_population (True/False): If True, plot population.

        plot_rent (True/False): If True, plot median rent.

        plot_income (True/False): If True, plot median income.

        plot_price (True/False): If True, plot median price.

        plot_units (True/False): If True, plot total units.

        plot_rent_to_price (True/False): If True, plot the
            rent-to-price ratio.

        plot_people_per_unit (True/False): If True, plot
            people-per-unit.

        plot_percent_renter_occupied (True/False): If True, plot
            percent renter occupied.

        plot_total_employed (True/False): If True, plot total
            employed.

            Only one demographic is plotted per call. If more than
            one "plot_" flag is True, the first one in the order
            listed above is used.

        begin_year_1 (int): The year you'd like the analysis to
            start. A city whose data does not include this year
            is skipped.

        plot_all (True/False): If False, every figure is shown
            with plt.show() in addition to being saved. If True,
            figures are only saved.

    Returns
    ----------
        None. The graphs are written to disk.

    Raises
    ----------
        Exception: If none of the "plot_" flags is True.

    """
    # Make a copy of the ranked cities
    ranked = cities_in_MSA.copy()

    # Create folders to save graphs into
    create_folder("graphs")
    create_folder("graphs/city_graphs_by_MSA")

    ### Call in the dataset we will be graphing from

    # Each entry is (flag, title used in the graphs, name used in the
    # dataset's filename). The order is the priority order when more
    # than one flag is True.
    demographic_options = (
        (plot_population, "Population", "population"),
        (plot_rent, "Median Rent", "median_rent"),
        (plot_income, "Median Income", "median_income"),
        (plot_price, "Median Price", "median_price"),
        (plot_units, "Total Units", "total_units"),
        (plot_rent_to_price, "Rent-to-Price", "rent_price_ratio"),
        (plot_people_per_unit, "People-per-Unit", "people_per_unit"),
        (plot_percent_renter_occupied, "Percent Renter Occupied", "percent_renter_occupied"),
        (plot_total_employed, "Total Employed", "total_employed"),
    )
    selected = next(
        ((title, file_name) for flag, title, file_name in demographic_options if flag),
        None)

    # Otherwise, print error statement
    if selected is None:
        print("Please specify a demographic to plot by setting it to True, and leaving the others set to False.")
        raise Exception("SPECIFY A DEMOGRAPHIC TO PLOT.")

    demographic_1, file_name = selected

    # Read in data
    main_df = pd.read_csv(
        f"datasets/cleaned_census_api_files/city_data/{file_name}_city.csv",
        dtype={'geo_id':str})

    # Run prep function to get into correct format
    main_df = prep_census_datasets(main_df, city=just_city)

    # Create dictionary to store filtered dataframes
    filtered_dict = {}

    # Set y_lim lists to find max and min across every city. The
    # minimum lists start with 0 so the axes always include zero.
    y_lim_list_trend = []
    y_min_list_trend = [0]
    y_lim_list_pct = []
    y_min_list_pct = [0]

    # Loop through all cities in the ranked dataframe
    for city in ranked['city_name'].dropna().unique():

        # Isolate just that city
        df = main_df[main_df['city_name']==city].copy()

        # Skip cities that are not in this demographic's dataset
        if df.empty:
            print(f"Dropping {city}, no {demographic_1} data was found for it.")
            continue

        # Get MSA name
        msa_name = df['msa_name'].iloc[0]

        # Create folders to save graphs into
        create_folder(f"graphs/city_graphs_by_MSA/{msa_name}")

        # Sort by date
        df = df.sort_values('date')

        # Create difference column
        df['value_change'] = df['value'].diff()

        # Create pct_change column
        df['percent_change'] = df['value'].pct_change()

        # Filter by beginning year
        df = df[df['year']>=begin_year_1].reset_index(drop=True)

        # If a city's earliest year is after the beginning
        # year, remove it from the graphs. For example, if we want to
        # view the growth of all cities since 2016, but Prescott Valley
        # only has data starting at 2019, this may skew the data.
        if df.empty or df['year'].iloc[0] != begin_year_1:
            print(f"Dropping {city}, it's dataframe has a smaller window.")
            continue

        # Remove NaN values
        df = df[df['percent_change'].notna()]

        # Isolate date and value columns
        df = df[[
            'msa_name',
            'date', 'value', 'value_change',
            'percent_change']].reset_index(drop=True)

        # Without any percent-change values there is nothing to
        # run a regression on
        if df.empty:
            print(f"Dropping {city}, it has no valid percent-change values.")
            continue

        # Add this dataframe's y_lim to list
        y_lim_list_trend.append(df['value'].max())
        y_min_list_trend.append(df['value'].min())
        y_lim_list_pct.append(df['percent_change'].max())
        y_min_list_pct.append(df['percent_change'].min())

        # Get the date one month after the last date, so the
        # trend lines extend past the data
        next_date = df['date'].iloc[-1] + relativedelta(months=1)

        # Create ordinal column
        df['ordinal_date'] = df['date'].map(datetime.toordinal)

        # Run linear regression and get the trend's coefficient
        coef_value, intercept_value = run_lr(df, column='value')
        coef_pct, intercept_pct = run_lr(df, column='percent_change')

        # Add a row for the next date. It has no values, only a
        # date, so only the trend columns below are filled for it.
        df.loc[len(df.index)] = [
            msa_name,
            next_date, np.nan, np.nan,
            np.nan, datetime.toordinal(next_date)]

        # Create averages column
        the_average_pct = df['percent_change'].mean()
        df['average_pct'] = the_average_pct
        the_average_value = df['value_change'].mean()
        df['average_value'] = the_average_value

        # Fill in with linear regression values
        df['value_trend'] = df['ordinal_date']*coef_value + intercept_value
        df['percent_change_trend'] = df['ordinal_date']*coef_pct + intercept_pct

        # Also add highest trend value to lim_list
        y_lim_list_trend.append(df['value_trend'].max())
        y_lim_list_pct.append(df['percent_change_trend'].max())

        # Save filtered data to dictionary
        filtered_dict[city] = df

    # Nothing to plot if every city was dropped
    if not filtered_dict:
        print("No cities had enough data to plot.")
        return

    # Get the y_lim shared by every figure
    y_lim_trend = max(y_lim_list_trend) * 1.1
    y_min_trend = min(y_min_list_trend)
    y_lim_pct = max(y_lim_list_pct) * 1.1
    y_min_pct = min(y_min_list_pct)

    # Set y ticks
    y_tick_list_trend = [
        y_lim_trend*0.25, y_lim_trend*0.5,
        y_lim_trend*0.75, y_lim_trend]
    y_tick_list_pct = [
        y_min_pct, y_min_pct*0.5, 0,
        y_lim_pct*0.5, y_lim_pct]

    # Set grid style before any figure is created, so that the first
    # figure is styled too. Newer versions of Matplotlib renamed the
    # seaborn styles, so use whichever name is available.
    grid_style = next(
        (style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
         if style in plt.style.available),
        None)
    if grid_style is not None:
        plt.style.use(grid_style)

    # Loop through each city that has data to plot. Only the cities
    # that made it through the filtering above are in the dictionary,
    # in the same order as the ranked dataframe.
    for city_name, city_df in filtered_dict.items():

        # Get the city's data
        df = city_df.copy()

        # Get msa_name
        msa_name = df['msa_name'].iloc[0]

        # Make a grid to plot 2 graphs on
        fig = plt.figure(figsize=(12,3), dpi=300)
        gs = GridSpec(nrows=1, ncols=2)
        ax1 = fig.add_subplot(gs[0,0])
        ax2 = fig.add_subplot(gs[0,1])

        # Set title
        fig.suptitle(f"{city_name}\n\n\n",
             fontweight="bold")

        # Plot first graph
        df.plot(x='date',y='value', ax=ax1)
        df.plot(x='date',y='value_trend', ax=ax1, linestyle="--")

        # Plot second graph
        df.plot(x='date',y='percent_change', ax=ax2)
        df.plot(x='date',y='percent_change_trend', ax=ax2, linestyle="--")

        # Second graph's zero line
        df['zero'] = 0
        df.plot(x='date', y='zero', ax=ax2, color="grey")

        # Also plot the average line
        df.plot(x='date', y='average_pct',
                ax=ax2, color="black", linestyle="-")

        # Set title's for both graphs
        ax1.set_title(f"{demographic_1} Growth")
        ax2.set_title(f"Percent Change in {demographic_1}")

        # Set y lims
        ax1.set_ylim([y_min_trend, y_lim_trend])
        ax2.set_ylim([y_min_pct, y_lim_pct])

        # Set y_ticks
        ax1.yaxis.set_major_locator(
            mticker.FixedLocator(y_tick_list_trend))
        ax2.yaxis.set_major_locator(
            mticker.FixedLocator(y_tick_list_pct))

        # Set y-tick labels
        ax1.set_yticklabels(
            ['{:,}'.format(round(float(x), 3)) for x in y_tick_list_trend])
        ax2.set_yticklabels(
            ['{:,}'.format(round(float(x), 3)) for x in y_tick_list_pct])

        # Give suptitle more room
        fig.subplots_adjust(top=0.85)

        # Create folder to save graphs into
        create_folder(f"graphs/city_graphs_by_MSA/{msa_name}/{city_name}")

        # Create filepath to save graph to
        save_filepath = f"graphs/city_graphs_by_MSA/{msa_name}/{city_name}/{demographic_1} Growth.png"

        # Save the graphs
        fig.savefig(save_filepath)

        # Show plot
        if not plot_all:
            plt.show()

        # Clear plot so figures don't pile up in memory
        plt.close(fig)
        


In [None]:
### Plot every demographic
def plot_every_demographic_cities(
    ranking_df,
    begin_job_year=begin_job_year,
    census_begin_year=census_begin_year,
    plot_all=False
):
    """
    Plots every demographic for the cities in a ranked dataframe by
    calling "plot_msa_cities()" once per demographic.

    Arguments
    ----------
        ranking_df (DataFrame): The dataframe of cities passed to
            "plot_msa_cities()" as "cities_in_MSA".

        begin_job_year (int): Accepted, but not used by this
            function.

        census_begin_year (int): Passed to "plot_msa_cities()" as
            "begin_year_1".

        plot_all (True/False): Passed through to
            "plot_msa_cities()".
    """
    # One flag per demographic, in the order they are plotted
    demo_flags = (
        'plot_population',
        'plot_rent',
        'plot_income',
        'plot_price',
        'plot_units',
        'plot_rent_to_price',
        'plot_people_per_unit',
        'plot_percent_renter_occupied',
        'plot_total_employed',
    )

    # Plot one demographic at a time
    for active_flag in demo_flags:

        # Only the current demographic is True, the rest are False
        flag_kwargs = {flag: flag == active_flag for flag in demo_flags}

        plot_msa_cities(
            cities_in_MSA=ranking_df,
            begin_year_1=census_begin_year,
            plot_all=plot_all,
            **flag_kwargs
        )
    


In [None]:
# Test the plots on Greenville: population only, starting in 2016.
# Every other "plot_" flag is left at its default of False.
plot_msa_cities(
    cities_in_MSA=Greenville_MSA,
    plot_population=True,
    begin_year_1=2016,
)

### Plot Actual Maps of Cities in MSA!

In [None]:
def plot_msa_cities_map(
    cities_in_MSA,
    using_list_of_MSAs=False
):
    """Draw, save and show a series of maps for the cities of one or more MSAs.

    The city rows are joined to the city geometry GeoPackage on
    ``city_geoid``, reprojected to EPSG:3857, and drawn once as a plain
    labelled basemap and then once per metric as a quantile choropleth.
    Every map is saved as a PNG under
    ``graphs/city_graphs_by_MSA/<msa_name>/maps/`` and shown.

    Parameters
    ----------
    cities_in_MSA : DataFrame, or list of DataFrames
        City-level table(s). The code reads the columns ``city_geoid``,
        ``city_name`` and ``msa_name`` plus every metric column that is
        mapped (``median_price``, ``population``, ``median_rent``,
        ``median_income``, ``rent_price_ratio``, ``people_per_unit``,
        ``percent_renter_occupied``, ``total_employed``,
        ``average_pct_Population``, ``average_pct_Median Rent``).
    using_list_of_MSAs : bool, default False
        Set to True when ``cities_in_MSA`` is a list of tables that should
        be concatenated and drawn on the same maps.

    Returns
    -------
    DataFrame
        A copy (or concatenation) of the input city rows, *without* the
        merged geometry columns.

    Notes
    -----
    When several MSAs are combined, the ``msa_name`` of the first merged
    row is used for both the map titles and the output folder.
    """

    # Work on a copy / fresh concatenation so the caller's frames
    # are never modified
    if not using_list_of_MSAs:
        df = cities_in_MSA.copy()

    # If there is a list of MSAs close to each other, we can map them
    # together. Concatenate in one call: growing a frame from an empty
    # object-dtype frame is slower and can disturb the column dtypes.
    else:
        df = pd.concat(cities_in_MSA).reset_index(drop=True)

    # Call in geometry file
    # NOTE(review): `dtype` is forwarded by geopandas to the underlying
    # reader -- confirm it really keeps GEOID as a string.
    city_geom = gp.read_file(
        "datasets/census_original_files/census_geopackages/city_geometries.gpkg",
        dtype={'GEOID':str})

    # Rename city geoid column so it matches the join key in df
    city_geom = city_geom.rename(columns={'GEOID':'city_geoid'})

    # Merge geometries (inner join: cities without a geometry are dropped)
    new_df = df.merge(city_geom, how="inner", on="city_geoid")

    # Report cities that were lost in the merge. Compare on the join key
    # (geoid against geoid) and only then look up the readable names;
    # comparing city names against geoids would flag every city.
    unmatched = ~df['city_geoid'].isin(new_df['city_geoid'])
    if unmatched.any():
        cities_left_out = set(df.loc[unmatched, 'city_name'])
        print(f'The following cities were not included in the merge: {cities_left_out}')

    # Ensure the new_df is a geodataframe
    new_df = gp.GeoDataFrame(new_df)

    # Enforce a standard crs (Web Mercator, matches the basemap tiles)
    new_df = new_df.to_crs(epsg=3857)

    # Get MSA name (first row only, see Notes in the docstring)
    msa_name = new_df['msa_name'].iloc[0]

    # Create folder to save graphs into
    create_folder("graphs/city_graphs_by_MSA")
    create_folder(f"graphs/city_graphs_by_MSA/{msa_name}")
    create_folder(f"graphs/city_graphs_by_MSA/{msa_name}/maps")

    # Adjust the figure's figsize based on how many cities there are.
    # The city count is the same for every map, so compute it once.
    num_cities = new_df.shape[0]
    if num_cities <= 10:
        fig_xy = 10
    elif num_cities <= 20:
        fig_xy = 15
    elif num_cities <= 30:
        fig_xy = 20
    elif num_cities <= 40:
        fig_xy = 25
    else:
        fig_xy = 30

    def create_map(
        map_name="Basemap",
        plot_price=False,
        plot_population=False,
        plot_rent=False,
        plot_income=False,
        plot_rent_to_price=False,
        plot_people_per_unit=False,
        plot_percent_renter_occupied=False,
        plot_total_employed=False,
        plot_avg_percent_population=False,
        plot_avg_percent_median_rent=False,
    ):
        """Draw one map: labelled city outlines plus an optional choropleth.

        With no flag set only the basemap is drawn and ``map_name`` is
        used as given. If several flags are set, the first one in the
        order of the table below wins and overrides ``map_name``.
        """

        # Create figure
        fig, ax = plt.subplots(figsize=(fig_xy, fig_xy))

        # Plot the towns
        new_df.boundary.plot(ax=ax)

        # Plot basemap
        cx.add_basemap(ax, alpha=0.4)

        # Place labels inside the geometries. More options can
        # be found in the documentation here:
        # https://matplotlib.org/stable/tutorials/text/annotations.html
        for _, city in new_df.iterrows():
            ax.annotate(
                text=city['NAME'],
                xy=city['geometry'].centroid.coords[0],
                ha='center', size=8,
                bbox=dict(
                    boxstyle="round,pad=0.3", fc="white",
                    ec="black", lw=1))

        # (flag, column, colormap, map title, percent-formatted legend?)
        # listed in priority order.
        layer_options = (
            (plot_price, "median_price", "Reds", "Median Price", False),
            (plot_population, "population", "Blues", "Population", False),
            (plot_rent, "median_rent", "Greens", "Median Rent", False),
            (plot_income, "median_income", "Reds", "Median Income", False),
            (plot_rent_to_price, "rent_price_ratio", "Blues",
             "Rent-to-Price Ratio", True),
            (plot_people_per_unit, "people_per_unit", "Greens",
             "People-per-Unit", False),
            (plot_percent_renter_occupied, "percent_renter_occupied", "Reds",
             "Percent Renter-Occupied", True),
            (plot_total_employed, "total_employed", "Blues",
             "Total Employed", False),
            (plot_avg_percent_population, "average_pct_Population", "Reds",
             "Avg. Population Growth", True),
            (plot_avg_percent_median_rent, "average_pct_Median Rent", "Greens",
             "Avg. Rent Growth", True),
        )
        selected = next((opt for opt in layer_options if opt[0]), None)

        # Add the choropleth layer if we aren't plotting a plain basemap
        if selected is not None:
            _, the_column, cmap, map_name, adjust_format = selected

            legend_kwds = {
                'frameon':True,
                'title':f'{map_name}',
                'loc': 'center left',
                'bbox_to_anchor':(1,0.5)}

            # Ratio / growth columns get a percent-formatted legend
            if adjust_format:
                legend_kwds['fmt'] = '{:.2%}'

            new_df.plot(
                column=the_column, ax=ax,
                legend=True, cmap=cmap, scheme="quantiles",
                legend_kwds=legend_kwds)

        # Set title
        ax.set_title(f"{msa_name} MSA - {map_name}", fontweight="bold")

        # Configure axes
        ax.set_axis_off()

        # Create filepath to save the map
        save_filepath = f"graphs/city_graphs_by_MSA/{msa_name}/maps/{map_name}_{msa_name}.png"

        # Save the map
        fig.savefig(save_filepath, bbox_inches="tight")

        # Show plot, then clear
        plt.show()
        plt.close('all')

    # Create all maps: the plain basemap first, then one per metric
    create_map()
    for flag_name in (
        'plot_price',
        'plot_population',
        'plot_rent',
        'plot_income',
        'plot_rent_to_price',
        'plot_people_per_unit',
        'plot_percent_renter_occupied',
        'plot_total_employed',
        'plot_avg_percent_population',
        'plot_avg_percent_median_rent',
    ):
        create_map(**{flag_name: True})

    return df



In [None]:
# Make list of top MSAs: the 16 per-MSA city tables that the later cells
# concatenate into one frame and loop over to draw maps.
# NOTE(review): every *_MSA name must already exist from earlier cells.
top_msa_list = [
    Greenville_MSA, Dallas_MSA, Charleston_MSA, Charlotte_MSA,
    Athens_MSA, Nashville_MSA, Atlanta_MSA, Austin_MSA,
    Raleigh_MSA, Orlando_MSA, Tampa_MSA, Knoxville_MSA,
    Sherman_MSA, Birmingham_MSA, Los_Angeles_MSA, Phoenix_MSA
]

In [None]:
### Stack the city tables of every top MSA into a single dataframe.
### pd.concat already returns a new object, and ignore_index=True
### relabels the rows 0..n-1.
all_cities_in_top_msas = pd.concat(top_msa_list, ignore_index=True)
all_cities_in_top_msas

In [None]:
# Plot Georgia's MSAs: draw Atlanta and Athens together on the same maps.
# plot_msa_cities_map returns the concatenated city table (without
# geometry), which is kept as Georgia_MSA for the filtering cell below.
Georgia_MSA = plot_msa_cities_map(
    [Atlanta_MSA, Athens_MSA],
    using_list_of_MSAs=True
)

In [None]:
# Screen the combined Atlanta + Athens table: keep cities with
# percent_renter_occupied above 0.3 and median_price below 250000,
# then rank them by 'average_pct_Median Income', highest first.
# NOTE(review): 0.3 presumably means 30% (the column is stored as a
# fraction) -- confirm against the data.
Georgia_MSA[
    (Georgia_MSA['percent_renter_occupied']>0.3)
    & (Georgia_MSA['median_price']<250000)
].sort_values('average_pct_Median Income', ascending=False)


# Uncomment to display the full, unfiltered table instead:
# Georgia_MSA

In [None]:
# Plot Dallas with Sherman-Denison on the same maps.
# plot_msa_cities_map returns the concatenated city table (without
# geometry), which is kept as North_Texas_MSA for the filtering cell below.
North_Texas_MSA = plot_msa_cities_map(
    [Dallas_MSA, Sherman_MSA],
    using_list_of_MSAs=True
)

In [None]:
# Screen the combined Dallas + Sherman-Denison table: keep cities with
# percent_renter_occupied above 0.3 and median_price below 250000,
# then rank them by rent_price_ratio, highest first.
# NOTE(review): 0.3 presumably means 30% (the column is stored as a
# fraction) -- confirm against the data.
North_Texas_MSA[
    (North_Texas_MSA['percent_renter_occupied']>0.3)
    & (North_Texas_MSA['median_price']<250000)
].sort_values('rent_price_ratio', ascending=False)


# Uncomment to display the full, unfiltered table instead:
# North_Texas_MSA


In [None]:
# Plot maps of our top MSAs, one MSA at a time. The table returned by
# plot_msa_cities_map is not needed here, so it is discarded.
for msa in top_msa_list:
    plot_msa_cities_map(msa)
    

In [None]:
# Charleston cities with percent_renter_occupied above 0.3 and
# median_price below 250000, ranked by 'average_pct_Median Income'.
# A notebook cell only renders its *last* expression, so this first table
# has to be shown explicitly with display(); otherwise it is computed and
# silently thrown away.
display(
    Charleston_MSA[
        (Charleston_MSA['percent_renter_occupied']>0.3)
        & (Charleston_MSA['median_price']<250000)
    ].sort_values('average_pct_Median Income', ascending=False)
)

# Every Charleston city ranked by 'average_pct_Median Rent', highest first
Charleston_MSA.sort_values('average_pct_Median Rent', ascending=False)

In [None]:
# Look up every row whose city_name contains "San Marcos" (substring match)
all_cities_in_top_msas[all_cities_in_top_msas['city_name'].str.contains("San Marcos")]

In [None]:
### Isolate specific cities to compare against each other.
# The string is used as a regular expression by str.contains: "|" means
# "or", and each alternative matches anywhere inside city_name, so a
# name that merely contains one of these words is selected as well.
specific_city_list = """McDonough|Fairburn|Villa Rica|North Charleston|Summerville|Sherman|Denison|San Marcos"""

# Keep the matching rows as an independent copy with a fresh 0..n-1 index
top_cities = all_cities_in_top_msas[
    all_cities_in_top_msas['city_name'].str.contains(specific_city_list)
].copy().reset_index(drop=True)

top_cities


In [None]:
# Plot demographics for specific cities: one plot_msa_cities run per
# demographic flag, using the function's default begin years and
# plot_all=False.
plot_every_demographic_cities(top_cities)

In [None]:
### Rank the shortlisted cities by rent_price_ratio, highest first
top_cities.sort_values('rent_price_ratio', ascending=False)
