In [81]:
# Data Analysis - UFO Data Aggregateion - By Year, Month, State, County
# Version 1
## 1/13/25
## 18:45

In [1]:
# Importing Modules

# Import sys
import sys

# Import OS
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import folder_manager as fm, archive_module, checkpoint_helper as CheckPoint, database_helper as db_tool, config

#### #### #### #### 

# Pandas
import pandas as pd

# Numpy
import numpy as np

# RegularExpression for string matching
import re



In [3]:
# function gets cleaned dataframe from specified folder and for specified file 
def get_cleaned_dataframe(data_source, file_name):
    
    global import_data_main_df_csv
    global import_data_main_df_pickle

    import_ufo_csv_dtypes = {
        "report_id" : "object",
        "year_code" : "object",
        "month_code" : "object",
        "state_fipcode" : "object",
        "county_fipcode" : "object",
        "city_fipcode" : "object",
        "fips_five" : "object",
        "fips_nine" : "object"
        }

    import_cdc_csv_dtypes = {
        "year_code" : "object",
        "month_code" : "object",
        "state_code" : "object",
        "county_code" : "object",
        "births" : "int"
        }

    if data_source == "ufo_data":
        folder_path = config.global_cleaned_ufo_data_folder_path
        csv_dtypes = import_ufo_csv_dtypes
    elif data_source == "cdc_data":
        folder_path = config.global_cleaned_cdc_data_folder_path
        csv_dtypes = import_cdc_csv_dtypes    
    else:
        folder_path = config.global_cleaned_fips_data_folder_path
        csv_dtypes = import_fips_csv_dtypes

    temp_file_name = file_name
    
    file_path_csv = folder_path + "\\" + temp_file_name + ".csv"
    file_path_pickle = folder_path + "\\" + temp_file_name  + ".pkl"
    
    temp_main_df_csv = pd.read_csv(file_path_csv, sep=",", dtype=csv_dtypes)
    temp_main_df_pickle = pd.read_pickle(file_path_pickle)

    import_data_main_df_csv = temp_main_df_csv.copy()
    import_data_main_df_pickle = temp_main_df_pickle.copy()
    
    del temp_main_df_csv
    del temp_main_df_pickle

In [4]:
# Initializes Database Engine and Session For SQL
db_tool.initialize_engine()
db_tool.initialize_session()

# Get ufo data from cleaned directory
get_cleaned_dataframe("ufo_data", "ufo_data_main_df")

In [5]:
# Getting csv import info
import_data_main_df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128175 entries, 0 to 128174
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128175 non-null  object
 1   year_code       128175 non-null  object
 2   month_code      128175 non-null  object
 3   state_fipcode   128175 non-null  object
 4   county_fipcode  128175 non-null  object
 5   city_fipcode    128175 non-null  object
 6   fips_five       128175 non-null  object
 7   fips_nine       128175 non-null  object
dtypes: object(8)
memory usage: 7.8+ MB


In [6]:
# Getting pickle import info
import_data_main_df_pickle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128175 entries, 0 to 128174
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128175 non-null  int64 
 1   year_code       128175 non-null  int64 
 2   month_code      128175 non-null  int64 
 3   state_fipcode   128175 non-null  object
 4   county_fipcode  128175 non-null  object
 5   city_fipcode    128175 non-null  object
 6   fips_five       128175 non-null  object
 7   fips_nine       128175 non-null  object
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [7]:
# keeping pickle, saving to new dataframe
ufo_data_main_df = import_data_main_df_pickle.copy()

In [8]:
# cleanup
del import_data_main_df_csv
del import_data_main_df_pickle

In [9]:
# getting info
ufo_data_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128175 entries, 0 to 128174
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128175 non-null  int64 
 1   year_code       128175 non-null  int64 
 2   month_code      128175 non-null  int64 
 3   state_fipcode   128175 non-null  object
 4   county_fipcode  128175 non-null  object
 5   city_fipcode    128175 non-null  object
 6   fips_five       128175 non-null  object
 7   fips_nine       128175 non-null  object
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [10]:
# visual inspection
ufo_data_main_df.sample(15)

Unnamed: 0,report_id,year_code,month_code,state_fipcode,county_fipcode,city_fipcode,fips_five,fips_nine
26006,18731,2001,7,49,C015,740,49015,490150740
105527,68132,2009,1,6,C111,2374,6111,61112374
65148,132095,2017,1,41,C051,880,41051,410510880
86326,98479,2013,6,25,C027,1380,25027,250271380
50839,156144,2020,5,26,C147,4060,26147,261474060
71908,121795,2015,9,36,C003,547,36003,360030547
97538,81479,2011,5,6,C065,2432,6065,60652432
48602,173152,2022,11,42,C003,6600,42003,420036600
82327,105422,2013,12,19,C183,70,19183,191830070
32209,3879,1998,6,32,C031,170,32031,320310170


In [23]:
# Creating new dataframe with aggregated count of each records that has the same year, month, state, county and fips five. Sorts this by descending of count and resets the index
ufo_data_main_agg_df = ufo_data_main_df.groupby(by=["year_code", "month_code", "state_fipcode", "county_fipcode","fips_five"]).size().sort_values(ascending=False).reset_index(name="sightings")

In [24]:
# sorting the values again this time by year and month, ascending
sorted_ufo_data_main_agg_df = ufo_data_main_agg_df.sort_values(['year_code', 'month_code'], ascending=True)

In [25]:
# Visual inspection
sorted_ufo_data_main_agg_df.head(50)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
91501,1721,2,6,C015,6015,1
45900,1790,6,36,C095,36095,1
91517,1800,4,22,C033,22033,1
91516,1860,8,37,C173,37173,1
45830,1861,3,36,C061,36061,1
85860,1864,5,13,C115,13115,1
91532,1864,5,13,C007,13007,1
45770,1865,6,30,C017,30017,1
45773,1871,6,56,C113,56113,1
45780,1880,12,53,C073,53073,1


In [27]:
# getting info
sorted_ufo_data_main_agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91533 entries, 91501 to 91531
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       91533 non-null  int64 
 1   month_code      91533 non-null  int64 
 2   state_fipcode   91533 non-null  object
 3   county_fipcode  91533 non-null  object
 4   fips_five       91533 non-null  object
 5   sightings       91533 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 4.9+ MB


In [28]:
# Creating dataset of pre-cdc data sightings, anything with a data from before 1994. I went with 1994 incase I want to do a lagged pregnaacy time frame of 9 months
pre_cdc_dataset_sightings = sorted_ufo_data_main_agg_df.loc[sorted_ufo_data_main_agg_df['year_code'] < 1994]

In [29]:
# Getting info
pre_cdc_dataset_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7803 entries, 91501 to 73170
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       7803 non-null   int64 
 1   month_code      7803 non-null   int64 
 2   state_fipcode   7803 non-null   object
 3   county_fipcode  7803 non-null   object
 4   fips_five       7803 non-null   object
 5   sightings       7803 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 426.7+ KB


In [30]:
# Created a new dataframe by filtering the aggergated data frame to include all items whose index does not match the index of an item in the pre_cdc dataset
in_scope_sightings = sorted_ufo_data_main_agg_df[~sorted_ufo_data_main_agg_df.index.isin(pre_cdc_dataset_sightings.index)]

In [31]:
# Getting info
in_scope_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83730 entries, 1949 to 91531
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       83730 non-null  int64 
 1   month_code      83730 non-null  int64 
 2   state_fipcode   83730 non-null  object
 3   county_fipcode  83730 non-null  object
 4   fips_five       83730 non-null  object
 5   sightings       83730 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 4.5+ MB


In [32]:
# programmatic check
assert (sorted_ufo_data_main_agg_df.shape[0] == (pre_cdc_dataset_sightings.shape[0] + in_scope_sightings.shape[0])) 

In [33]:
# visual Check
in_scope_sightings.head(25)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
1949,1994,1,6,C037,6037,4
73171,1994,1,6,C053,6053,1
73175,1994,1,6,C089,6089,1
73177,1994,1,6,C111,6111,1
73185,1994,1,8,C117,8117,1
73187,1994,1,12,C011,12011,1
73192,1994,1,12,C075,12075,1
73196,1994,1,12,C227,12227,1
73206,1994,1,15,C003,15003,1
73209,1994,1,19,C059,19059,1


In [34]:
in_scope_sightings.sample(50)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
81481,2021,5,4,C017,4017,1
73878,2019,6,25,C003,25003,1
89448,2024,2,50,C023,50023,1
43938,2011,5,12,C073,12073,1
18022,2022,5,25,C007,25007,2
55838,2014,4,34,C027,34027,1
60250,2015,4,17,C181,17181,1
69781,2017,12,6,C045,6045,1
65416,2016,8,27,C013,27013,1
56607,2014,6,35,C029,35029,1


In [35]:
# making year codea nd month code object type
in_scope_sightings['year_code'] = in_scope_sightings['year_code'].astype("object")
in_scope_sightings['month_code'] = in_scope_sightings['month_code'].astype("object")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_scope_sightings['year_code'] = in_scope_sightings['year_code'].astype("object")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_scope_sightings['month_code'] = in_scope_sightings['month_code'].astype("object")


In [36]:
# getting info
in_scope_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83730 entries, 1949 to 91531
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       83730 non-null  object
 1   month_code      83730 non-null  object
 2   state_fipcode   83730 non-null  object
 3   county_fipcode  83730 non-null  object
 4   fips_five       83730 non-null  object
 5   sightings       83730 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 4.5+ MB


In [37]:
# visual inspection
in_scope_sightings.sample()

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
10949,2014,6,31,C055,31055,2


In [38]:
# Exporting dataframe into sql database as ufo_data_agg table 
db_tool.export_to_sql(in_scope_sightings, "ufo_data_agg")

2025-01-20 19:28:27,152 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-01-20 19:28:27,162 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("ufo_data_agg")
2025-01-20 19:28:27,163 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-01-20 19:28:27,166 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("ufo_data_agg")
2025-01-20 19:28:27,167 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-01-20 19:28:27,169 INFO sqlalchemy.engine.Engine 
CREATE TABLE ufo_data_agg (
	year_code BIGINT, 
	month_code BIGINT, 
	state_fipcode TEXT, 
	county_fipcode TEXT, 
	fips_five TEXT, 
	sightings BIGINT
)


2025-01-20 19:28:27,170 INFO sqlalchemy.engine.Engine [no key 0.00066s] ()
2025-01-20 19:28:27,554 INFO sqlalchemy.engine.Engine INSERT INTO ufo_data_agg (year_code, month_code, state_fipcode, county_fipcode, fips_five, sightings) VALUES (?, ?, ?, ?, ?, ?)
2025-01-20 19:28:27,555 INFO sqlalchemy.engine.Engine [generated in 0.28133s] [(1994, 1, '06', 'C037', '06037', 4), (1994, 1, '06', 'C053', '06

## [Next Step: Data Analysis](data_analysis_main.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)
