In [1]:
# Data Analysis - UFO Data Aggregateion - By Year, Month, State, County
# Version 1
## 1/13/25
## 18:45

In [2]:
# Importing Modules

# Import sys
import sys

# Import OS
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import folder_manager as fm, archive_module, checkpoint_helper as CheckPoint, database_helper as db_tool, config

#### #### #### #### 

# Pandas
import pandas as pd

# Numpy
import numpy as np

# RegularExpression for string matching
import re



In [3]:
# function gets cleaned dataframe from specified folder and for specified file 
def get_cleaned_dataframe(data_source, file_name):
    
    global import_data_main_df_csv
    global import_data_main_df_pickle

    import_ufo_csv_dtypes = {
        "report_id" : "object",
        "year_code" : "object",
        "month_code" : "object",
        "state_fipcode" : "object",
        "county_fipcode" : "object",
        "city_fipcode" : "object",
        "fips_five" : "object",
        "fips_nine" : "object"
        }

    import_cdc_csv_dtypes = {
        "year_code" : "object",
        "month_code" : "object",
        "state_code" : "object",
        "county_code" : "object",
        "births" : "int"
        }

    if data_source == "ufo_data":
        folder_path = config.global_cleaned_ufo_data_folder_path
        csv_dtypes = import_ufo_csv_dtypes
    elif data_source == "cdc_data":
        folder_path = config.global_cleaned_cdc_data_folder_path
        csv_dtypes = import_cdc_csv_dtypes    
    else:
        folder_path = config.global_cleaned_fips_data_folder_path
        csv_dtypes = import_fips_csv_dtypes

    temp_file_name = file_name
    
    file_path_csv = folder_path + "\\" + temp_file_name + ".csv"
    file_path_pickle = folder_path + "\\" + temp_file_name  + ".pkl"
    
    temp_main_df_csv = pd.read_csv(file_path_csv, sep=",", dtype=csv_dtypes)
    temp_main_df_pickle = pd.read_pickle(file_path_pickle)

    import_data_main_df_csv = temp_main_df_csv.copy()
    import_data_main_df_pickle = temp_main_df_pickle.copy()
    
    del temp_main_df_csv
    del temp_main_df_pickle

In [4]:
# Initializes Database Engine and Session For SQL
db_tool.initialize_engine()
db_tool.initialize_session()

# Get ufo data from cleaned directory
get_cleaned_dataframe("ufo_data", "ufo_data_main_df")

In [5]:
# Getting csv import info
import_data_main_df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128365 entries, 0 to 128364
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128365 non-null  object
 1   year_code       128365 non-null  object
 2   month_code      128365 non-null  object
 3   state_fipcode   128365 non-null  object
 4   county_fipcode  128365 non-null  object
 5   city_fipcode    128365 non-null  object
 6   fips_five       128365 non-null  object
 7   fips_nine       128365 non-null  object
dtypes: object(8)
memory usage: 7.8+ MB


In [6]:
# Getting pickle import info
import_data_main_df_pickle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128365 entries, 0 to 128364
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128365 non-null  int64 
 1   year_code       128365 non-null  int64 
 2   month_code      128365 non-null  int64 
 3   state_fipcode   128365 non-null  object
 4   county_fipcode  128365 non-null  object
 5   city_fipcode    128365 non-null  object
 6   fips_five       128365 non-null  object
 7   fips_nine       128365 non-null  object
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [7]:
# keeping pickle, saving to new dataframe
ufo_data_main_df = import_data_main_df_pickle.copy()

In [8]:
# cleanup
del import_data_main_df_csv
del import_data_main_df_pickle

In [9]:
# getting info
ufo_data_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128365 entries, 0 to 128364
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   report_id       128365 non-null  int64 
 1   year_code       128365 non-null  int64 
 2   month_code      128365 non-null  int64 
 3   state_fipcode   128365 non-null  object
 4   county_fipcode  128365 non-null  object
 5   city_fipcode    128365 non-null  object
 6   fips_five       128365 non-null  object
 7   fips_nine       128365 non-null  object
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [10]:
# visual inspection
ufo_data_main_df.sample(15)

Unnamed: 0,report_id,year_code,month_code,state_fipcode,county_fipcode,city_fipcode,fips_five,fips_nine
6857,47179,2005,10,34,C017,3590,34017,340173590
11867,163467,2021,5,16,C021,170,16021,160210170
30690,8427,1999,7,8,C067,650,8067,80670650
105616,68583,2009,2,53,C033,532,53033,530330532
115954,129434,2016,8,12,C201,6553,12201,122016553
13133,161143,2020,12,51,C073,1105,51073,510731105
59727,142882,2018,8,49,C057,1490,49057,490571490
48670,173699,2022,12,48,C339,5434,48339,483395434
5072,51289,2006,6,26,C033,4480,26033,260334480
59032,144232,2018,11,26,C005,5130,26005,260055130


In [11]:
# Creating new dataframe with aggregated count of each records that has the same year, month, state, county and fips five. Sorts this by descending of count and resets the index
ufo_data_main_agg_df = ufo_data_main_df.groupby(by=["year_code", "month_code", "state_fipcode", "county_fipcode","fips_five"]).size().sort_values(ascending=False).reset_index(name="sightings")

In [12]:
# sorting the values again this time by year and month, ascending
sorted_ufo_data_main_agg_df = ufo_data_main_agg_df.sort_values(['year_code', 'month_code'], ascending=True)

In [13]:
# Visual inspection
sorted_ufo_data_main_agg_df.head(50)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
91668,1721,2,6,C015,6015,1
45838,1790,6,36,C095,36095,1
45844,1800,4,22,C033,22033,1
45845,1860,8,37,C173,37173,1
45848,1861,3,36,C061,36061,1
91638,1864,5,13,C115,13115,1
91652,1864,5,13,C007,13007,1
91639,1865,6,30,C017,30017,1
91640,1871,6,56,C113,56113,1
91641,1880,12,53,C073,53073,1


In [14]:
# getting info
sorted_ufo_data_main_agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91669 entries, 91668 to 91667
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       91669 non-null  int64 
 1   month_code      91669 non-null  int64 
 2   state_fipcode   91669 non-null  object
 3   county_fipcode  91669 non-null  object
 4   fips_five       91669 non-null  object
 5   sightings       91669 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 4.9+ MB


In [15]:
# Creating dataset of pre-cdc data sightings, anything with a data from before 1994. I went with 1994 incase I want to do a lagged pregnaacy time frame of 9 months
pre_cdc_dataset_sightings = sorted_ufo_data_main_agg_df.loc[sorted_ufo_data_main_agg_df['year_code'] < 1994]

In [16]:
# Getting info
pre_cdc_dataset_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7814 entries, 91668 to 73279
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       7814 non-null   int64 
 1   month_code      7814 non-null   int64 
 2   state_fipcode   7814 non-null   object
 3   county_fipcode  7814 non-null   object
 4   fips_five       7814 non-null   object
 5   sightings       7814 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 427.3+ KB


In [17]:
# Created a new dataframe by filtering the aggergated data frame to include all items whose index does not match the index of an item in the pre_cdc dataset
in_scope_sightings = sorted_ufo_data_main_agg_df[~sorted_ufo_data_main_agg_df.index.isin(pre_cdc_dataset_sightings.index)]

In [18]:
# Getting info
in_scope_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83855 entries, 2561 to 91667
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       83855 non-null  int64 
 1   month_code      83855 non-null  int64 
 2   state_fipcode   83855 non-null  object
 3   county_fipcode  83855 non-null  object
 4   fips_five       83855 non-null  object
 5   sightings       83855 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 4.5+ MB


In [19]:
# programmatic check
assert (sorted_ufo_data_main_agg_df.shape[0] == (pre_cdc_dataset_sightings.shape[0] + in_scope_sightings.shape[0])) 

In [20]:
# visual Check
in_scope_sightings.head(25)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
2561,1994,1,6,C037,6037,4
73285,1994,1,6,C053,6053,1
73286,1994,1,6,C089,6089,1
73288,1994,1,6,C111,6111,1
73289,1994,1,8,C117,8117,1
73290,1994,1,12,C011,12011,1
73291,1994,1,12,C075,12075,1
73292,1994,1,12,C227,12227,1
73297,1994,1,15,C003,15003,1
73317,1994,1,19,C059,19059,1


In [21]:
in_scope_sightings.sample(50)

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
20224,1999,8,29,C019,29019,1
84230,1997,12,32,C003,32003,1
64962,2016,7,8,C067,8067,1
60755,2015,6,4,C025,4025,1
9200,2012,9,54,C061,54061,2
11792,2014,12,19,C067,19067,2
32315,2006,2,6,C123,6123,1
4663,2015,8,34,C023,34023,3
88043,2023,8,8,C001,8001,1
38029,2008,9,12,C035,12035,1


In [22]:
# making year codea nd month code object type
in_scope_sightings['year_code'] = in_scope_sightings['year_code'].astype("object")
in_scope_sightings['month_code'] = in_scope_sightings['month_code'].astype("object")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_scope_sightings['year_code'] = in_scope_sightings['year_code'].astype("object")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  in_scope_sightings['month_code'] = in_scope_sightings['month_code'].astype("object")


In [23]:
# getting info
in_scope_sightings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83855 entries, 2561 to 91667
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_code       83855 non-null  object
 1   month_code      83855 non-null  object
 2   state_fipcode   83855 non-null  object
 3   county_fipcode  83855 non-null  object
 4   fips_five       83855 non-null  object
 5   sightings       83855 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 4.5+ MB


In [24]:
# visual inspection
in_scope_sightings.sample()

Unnamed: 0,year_code,month_code,state_fipcode,county_fipcode,fips_five,sightings
88814,2023,10,47,C161,47161,1


In [25]:
# Exporting dataframe into sql database as ufo_data_agg table 
db_tool.export_to_sql(in_scope_sightings, "ufo_data_agg")

2025-02-01 00:33:17,345 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-02-01 00:33:17,353 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("ufo_data_agg")
2025-02-01 00:33:17,354 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:33:17,357 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("ufo_data_agg")
2025-02-01 00:33:17,358 INFO sqlalchemy.engine.Engine [raw sql] ()
2025-02-01 00:33:17,359 INFO sqlalchemy.engine.Engine 
CREATE TABLE ufo_data_agg (
	year_code BIGINT, 
	month_code BIGINT, 
	state_fipcode TEXT, 
	county_fipcode TEXT, 
	fips_five TEXT, 
	sightings BIGINT
)


2025-02-01 00:33:17,360 INFO sqlalchemy.engine.Engine [no key 0.00079s] ()
2025-02-01 00:33:17,744 INFO sqlalchemy.engine.Engine INSERT INTO ufo_data_agg (year_code, month_code, state_fipcode, county_fipcode, fips_five, sightings) VALUES (?, ?, ?, ?, ?, ?)
2025-02-01 00:33:17,745 INFO sqlalchemy.engine.Engine [generated in 0.28974s] [(1994, 1, '06', 'C037', '06037', 4), (1994, 1, '06', 'C053', '06

## [Next Step: Data Analysis](data_analysis_main.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)
