In [85]:
import pandas as pd
import os
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt 


In [86]:

def find_repo_root(start_path):
    """Walk upward from ``start_path`` to the nearest project root.

    A directory counts as the root when it contains a ``.git`` directory or a
    ``README.md`` file.

    Args:
        start_path: Directory (absolute or relative) where the search begins.

    Returns:
        The absolute path of the first matching directory, with backslashes
        replaced by forward slashes, or ``None`` when the filesystem root is
        reached without a match.
    """
    candidate = os.path.abspath(start_path)

    while True:
        has_git_dir = os.path.isdir(os.path.join(candidate, '.git'))
        has_readme = os.path.isfile(os.path.join(candidate, 'README.md'))
        if has_git_dir or has_readme:
            # Normalise Windows separators so the result can be spliced into
            # forward-slash f-string paths.
            return candidate.replace('\\', '/')

        parent = os.path.dirname(candidate)
        if parent == candidate:
            # dirname() of the filesystem root is the root itself: nothing left to climb.
            return None

        candidate = parent

# Resolve the project root once; every data path in this notebook is built from it.
root = find_repo_root(os.getcwd())
if root is None:
    # Fail fast: paths are built as f"{root}/...", so a None root would silently
    # become the literal prefix "None/" and surface later as a confusing
    # file-not-found error.
    raise FileNotFoundError(
        "Could not locate the project root: no '.git' directory or 'README.md' "
        f"found in {os.getcwd()!r} or any of its parent directories."
    )
root

'c:/Users/fitsl/Documents/Programming/UVM Programming Classes/Data Science I/Project'

# Download, analyze, create categories

In [87]:
# Read the four Storm Events parquet chunks and stack them into a single frame.
storm_dir = f"{root}/Data/Storm events"
df1, df2, df3, df4 = (
    pd.read_parquet(f"{storm_dir}/events_db_chunk_{chunk_no}.parquet")
    for chunk_no in (1, 2, 3, 4)
)
df = pd.concat([df1, df2, df3, df4])

# Zero-pad the FIPS parts (2-digit state, 3-digit county/zone) and join them
# into a 5-character combined code.
for fips_col, width in (('State FIPS', 2), ('County/Zone FIPS', 3)):
    df[fips_col] = df[fips_col].apply(lambda code, w=width: f"{int(code):0{w}d}")
df['FIPS'] = df['State FIPS'] + df['County/Zone FIPS']

# Keep only events from the year 2000 onward.
df = df.loc[df['Year'] >= 2000]

In [88]:
# Frequency of each raw 'Property Damage' value, as a look at the formats the
# parser below has to handle.
# NOTE(review): this is not the last expression in the cell, so by default Jupyter
# does not display its result — move it to its own cell if the output is wanted.
df['Property Damage'].value_counts()
def convert_to_number(value):
    """Convert a damage amount such as ``"10.00K"`` or ``"2.5M"`` to an integer.

    Recognised (case-insensitive) suffixes and their multipliers:
    ``h`` = 100, ``k`` = 1,000, ``m`` = 1,000,000, ``b`` = 1,000,000,000.
    A suffix with no number in front of it (e.g. ``"K"``) counts as one unit of
    that suffix. Text without a suffix is parsed as a plain number.

    Args:
        value: Raw cell value; usually a string, but missing values and plain
            numbers are accepted as well.

    Returns:
        The amount as an ``int``, rounded to the nearest whole number. Missing
        values and text that is neither suffixed nor numeric yield ``0``.

    Raises:
        ValueError: If a recognised suffix is preceded by non-numeric text
            (e.g. ``"abcK"``).
    """
    if pd.isna(value):
        return 0

    multipliers = {'h': 100, 'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}
    # str() lets already-numeric cells go through the same parsing path.
    text = str(value).strip().lower()

    multiplier = multipliers.get(text[-1:])
    if multiplier is not None:
        # A lone suffix letter means "1 x multiplier".
        magnitude = text[:-1].strip() or '1'
        # round(), not int(): float products such as 0.57 * 100 land just below
        # the intended integer and would otherwise be truncated down by one.
        return round(float(magnitude) * multiplier)

    try:
        return round(float(text))
    except (ValueError, OverflowError):
        # Empty, non-numeric, or non-finite text: treat as "no damage recorded".
        return 0
    

def fk_log10(num):
    """Base-10 logarithm that maps an input of exactly zero to 0.

    Args:
        num: Scalar number.

    Returns:
        ``0`` when ``num == 0``, otherwise ``np.log10(num)``.
    """
    return 0 if num == 0 else np.log10(num)
    

In [89]:
# Numeric property damage parsed from the raw damage strings.
df['property_num'] = df['Property Damage'].map(convert_to_number)
# Total fatalities per event: direct plus indirect.
df['deaths'] = df['Direct Deaths'].add(df['Indirect Deaths'])


In [90]:
# Sanity check on the parsed damage values: how often each amount occurs.
df['property_num'].value_counts()

property_num
0         989218
5000       38703
1000       36127
10000      32810
2000       25898
           ...  
933000         1
469000         1
97500          1
4600           1
489000         1
Name: count, Length: 2002, dtype: int64

In [91]:
# Trim stray whitespace from the column labels, then give the FIPS columns the
# names used as merge keys in the rest of the notebook.
df.columns = df.columns.str.strip()
fips_renames = {
    "State FIPS": "STATE_FIPS",
    "County/Zone FIPS": "COUNTY_FIPS",
}
df.rename(columns=fips_renames, inplace=True)

In [92]:
# Confirm the renamed FIPS columns and the derived columns are present.
print(df.columns)


Index(['OBJECTID', 'State', 'Month', 'Event Type', 'Begin Date Time',
       'Property Damage', 'Begin Lat', 'Begin Lon', 'Episode Narrative',
       'Event Narrative', 'Lat/Lon Known', 'STATE_FIPS', 'Year', 'COUNTY_FIPS',
       'County/Zone Type', 'Event ID', 'Direct Injuries', 'Indirect Injuries',
       'Indirect Deaths', 'Direct Deaths', 'Episode ID', 'Source',
       'Data Source', 'Tornado F Scale', 'County/Zone Name',
       'Total Injuries and Deaths', 'x', 'y', 'FIPS', 'property_num',
       'deaths'],
      dtype='object')


In [93]:
# Collapse individual events to one row per county/zone and year, totalling
# deaths and property damage.
event_keys = ['STATE_FIPS', 'COUNTY_FIPS', 'Year', 'County/Zone Name']
df_agg = (
    df.groupby(event_keys)
    .agg(
        deaths=pd.NamedAgg(column='deaths', aggfunc='sum'),
        property_damage=pd.NamedAgg(column='property_num', aggfunc='sum'),
    )
    .reset_index()
)

In [94]:
# Rank the county-year rows by total property damage, largest first, to inspect the extremes.
df_agg.sort_values(by='property_damage', ascending=False)

Unnamed: 0,STATE_FIPS,COUNTY_FIPS,Year,County/Zone Name,deaths,property_damage
48459,22,062,2005,ORLEANS,638,21481570000
9480,06,066,2018,NORTHEAST FOOTHILLS/SACRAMENTO VALLEY,86,17000000000
115029,48,201,2017,HARRIS,39,10001921000
114172,48,167,2017,GALVESTON,6,10000395500
16119,12,068,2005,COASTAL PALM BEACH,1,10000000000
...,...,...,...,...,...,...
139880,99,145,2010,VEGA BAJA,0,0
139879,99,145,2009,VEGA BAJA,0,0
139878,99,143,2019,VEGA ALTA,0,0
139877,99,143,2018,VEGA ALTA,0,0


In [95]:
# Load the census 5-year summary and keep only the county keys and the period
# bounds. FIPS codes are read as strings so their leading zeros are preserved.
# Selecting the four wanted columns also discards the CSV's saved index column
# ('Unnamed: 0') without requiring it to be present.
soc_df = (
    pd.read_csv(
        f"{root}/Data/Census/5_year_summary.csv",
        dtype={
            'county': str,
            'state': str,
        },
    )
    .rename(columns={
        'state': 'STATE_FIPS',
        'county': 'COUNTY_FIPS',
    })
    .loc[:, ['STATE_FIPS', 'COUNTY_FIPS', 'start_year', 'end_year']]
)

In [96]:
soc_df

Unnamed: 0,STATE_FIPS,COUNTY_FIPS,start_year,end_year
0,45,001,2006,2010
1,45,001,2007,2011
2,45,001,2008,2012
3,45,001,2009,2013
4,45,001,2010,2014
...,...,...,...,...
32199,46,137,2011,2015
32200,46,137,2012,2016
32201,46,137,2013,2017
32202,46,137,2014,2018


In [97]:
periods = {}  # NOTE(review): unused in the visible cells; kept in case a later cell relies on it

# Rolling 5-year periods (start, start + 4) for start years 2005..2016,
# as ints and as strings (the string form is used to build file names).
num_periods = [(i, i + 4) for i in range(2005, 2017)]
str_periods = [(str(i), str(i + 4)) for i in range(2005, 2017)]

# For each period, four look-back windows in this order:
#   from 2000, 3 years back, 5 years back, 10 years back
# Window starts are measured from the period's start year and floored at 2000.
#   exclusive -> window ends at the period's start year
#   inclusive -> window ends at the period's end year
period_ranges_exclusive = []
period_ranges_inclusive = []

for year, end_year in num_periods:
    window_starts = (
        2000,
        max(year - 3, 2000),
        max(year - 5, 2000),
        max(year - 10, 2000),
    )
    period_ranges_exclusive.append(tuple((begin, year) for begin in window_starts))
    period_ranges_inclusive.append(tuple((begin, end_year) for begin in window_starts))

# Print once, after the loop, instead of once per iteration.
print(num_periods)

[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020)]
[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020)]
[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020)]
[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020)]
[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020)]
[(2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), 

In [98]:
table_form_events = df_agg.copy()

# Output location, relative to the project root.
folder = "/Data/County/County_sum_with_StormEvents_Climate"


def period_namer(i):
    """Return the output subfolder name for the i-th window of a period.

    Args:
        i: Position of the window within a period's tuple of windows (0-3).

    Returns:
        "From_2000", "Minus_3", "Minus_5" or "Minus_10".

    Raises:
        ValueError: If ``i`` is not one of the four known positions.
    """
    names = {0: "From_2000", 1: "Minus_3", 2: "Minus_5", 3: "Minus_10"}
    if i in names:
        return names[i]
    # Raising a bare string is itself a TypeError; raise a real exception instead.
    raise ValueError("too many periods in period, bro")


def group_period(df):
    """Collapse a merged county/storm frame to one row per county.

    Population and the inflow/outflow columns are averaged; deaths and
    property damage are summed.

    Args:
        df: Frame holding STATE_FIPS, COUNTY_FIPS and the aggregated columns.

    Returns:
        A frame with one row per (STATE_FIPS, COUNTY_FIPS) pair.
    """
    result = (
        df.groupby(['STATE_FIPS', 'COUNTY_FIPS'])
        .agg(
            Population=('Population', 'mean'),
            Inflow_pc=('Inflow_pc', 'mean'),
            Inflow_gross=('Inflow_gross', 'mean'),
            Outflow_pc=('Outflow_pc', 'mean'),
            Outflow_gross=('Outflow_gross', 'mean'),
            deaths=('deaths', 'sum'),
            property_damage=('property_damage', 'sum'),
        )
        .reset_index()
    )
    return result


def _export_window(county_data, soc_df_time, mode, subfolder, period_begin, period_end):
    """Merge county data with storm totals for one year window and write the CSV.

    Args:
        county_data: County summary frame for the current 5-year period.
        soc_df_time: Census rows matching the current 5-year period.
        mode: "Exclusive" or "Inclusive"; selects the output folder and the
            (lower-cased) file-name prefix.
        subfolder: Window label returned by ``period_namer``.
        period_begin: First storm-event year included (inclusive).
        period_end: Last storm-event year included (inclusive).
    """
    in_window = table_form_events['Year'].isin(range(period_begin, period_end + 1))
    # NOTE(review): this is an inner join, so counties without any storm rows in
    # the window are dropped from the output — confirm that is intended.
    time_span_merged = pd.merge(
        county_data,
        table_form_events[in_window],
        on=['STATE_FIPS', 'COUNTY_FIPS'],
    )
    time_span_merged = group_period(time_span_merged)
    time_span_merged = pd.merge(
        left=time_span_merged,
        right=soc_df_time,
        on=['STATE_FIPS', 'COUNTY_FIPS'],
        how='left',
    )

    out_dir = f"{root}{folder}/{mode}/{subfolder}"
    # Create the target directory on first use so a fresh checkout can run this cell.
    os.makedirs(out_dir, exist_ok=True)
    save_path = f"{out_dir}/{mode.lower()}_merge_countysum_storm_{period_begin}_{period_end}.csv"
    time_span_merged.to_csv(save_path, index=False)


for i, (start, end) in enumerate(str_periods):
    print("start, end", start, end)
    inputpath = f"{root}/Data/County/Summed_clean/summed_{start}_{end}.csv"
    county_data = pd.read_csv(inputpath, dtype={
        "STATE_FIPS": "str",
        "COUNTY_FIPS": "str",
    })

    # Census rows for this 5-year period; identical for every window, so filter once.
    soc_df_time = soc_df[(soc_df['start_year'] == int(start))
                         & (soc_df['end_year'] == int(end))].copy()

    # Write all exclusive windows first, then all inclusive ones.
    for mode, windows in (
        ("Exclusive", period_ranges_exclusive[i]),
        ("Inclusive", period_ranges_inclusive[i]),
    ):
        for x, (period_begin, period_end) in enumerate(windows):
            _export_window(
                county_data,
                soc_df_time,
                mode,
                period_namer(x),
                period_begin,
                period_end,
            )

start, end 2005 2009
start, end 2006 2010
start, end 2007 2011
start, end 2008 2012
start, end 2009 2013
start, end 2010 2014
start, end 2011 2015
start, end 2012 2016
start, end 2013 2017
start, end 2014 2018
start, end 2015 2019
start, end 2016 2020
