## Import necessary libraries

In [1]:
import glob
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

## Read all the CSV files

In [2]:
# Collect every NLDN/LIS CSV export in the data folder.
# sorted() makes the row order of the combined table independent of the
# order in which the filesystem happens to list the files.
folder_path = r'F:\Sam\NLDN_LIS\*.csv'
csv_files = sorted(glob.glob(folder_path))

# Fail early with a clear message instead of pandas' generic
# "No objects to concatenate" error when the pattern matches nothing.
if not csv_files:
    raise FileNotFoundError(f'No CSV files found matching {folder_path}')

# skiprows=2 skips the two lines that precede the column header row in each file.
data = [pd.read_csv(file, skiprows=2) for file in csv_files]

# Stack all files into one table with a fresh 0..n-1 index.
combined_data = pd.concat(data, ignore_index=True)
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28048354 entries, 0 to 28048353
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   #ZDAY        int64  
 1   CENTERLON    float64
 2   CENTERLAT    float64
 3   TOTAL_COUNT  int64  
dtypes: float64(2), int64(2)
memory usage: 856.0 MB


In [3]:
# Give the date column (read in as '#ZDAY') the friendlier name 'Time'
combined_data = combined_data.rename(columns={'#ZDAY': 'Time'})

In [4]:
# Re-inspect the frame's columns, dtypes and memory footprint
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28048354 entries, 0 to 28048353
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Time         int64  
 1   CENTERLON    float64
 2   CENTERLAT    float64
 3   TOTAL_COUNT  int64  
dtypes: float64(2), int64(2)
memory usage: 856.0 MB


In [5]:
# Parse the integer YYYYMMDD day stamps in 'Time' into pandas datetimes
day_stamps = pd.to_datetime(combined_data['Time'], format='%Y%m%d')
combined_data['Time'] = day_stamps

In [6]:
# Shorten the coordinate and count column names used in the rest of the notebook
short_names = {
    'CENTERLON': 'Lon',
    'CENTERLAT': 'Lat',
    'TOTAL_COUNT': 'Counts',
}
combined_data = combined_data.rename(columns=short_names)
combined_data.head()

Unnamed: 0,Time,Lon,Lat,Counts
0,2015-01-01,-99.6,29.3,2
1,2015-01-01,-99.1,29.8,1
2,2015-01-01,-98.7,29.8,2
3,2015-01-01,-66.9,36.3,1
4,2015-01-01,-97.3,30.4,1


In [7]:
# Load the Florida county boundaries and reproject them to EPSG:4326
geo_data = gpd.read_file(r"F:\Sam\DOE_CARES\Florida_county.shp").to_crs(epsg=4326)

# Reduce every MultiPolygon to its first member polygon; other geometries are kept as-is.
# NOTE(review): this discards the remaining parts of a multi-part county
# (e.g. islands), so points falling there will not match in a later spatial
# join - confirm this is intended.
single_part_geoms = []
for shape in geo_data['geometry']:
    if shape.geom_type == 'MultiPolygon':
        shape = shape.geoms[0]
    single_part_geoms.append(shape)
geo_data['geometry'] = single_part_geoms

In [8]:
# Build one point geometry per row from the Lon/Lat columns and attach it to the table
point_geoms = gpd.points_from_xy(combined_data['Lon'], combined_data['Lat'])
points_gdf = gpd.GeoDataFrame(combined_data, geometry=point_geoms)

In [9]:
# Label the points with the same CRS (EPSG:4326) that geo_data was reprojected to,
# so both layers agree for the spatial join. Assigning .crs only attaches the
# label; it does not transform coordinates.
# NOTE(review): assumes Lon/Lat are already WGS84 degrees - confirm against the data source.
points_gdf.crs = "EPSG:4326"

In [10]:
# Attach county attributes to every point that lies within a county polygon.
# Inner join: points that fall in no county are dropped from the result.
joined_data = points_gdf.sjoin(geo_data, how="inner", predicate="within")

In [11]:
# Preview the join result: the point columns followed by the matched county's attributes
joined_data.head()

Unnamed: 0,Time,Lon,Lat,Counts,geometry,index_right,STATEFP,COUNTYFP,COUNTYNS,GEOID,...,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
7061,2015-01-04,-85.4,29.6,16,POINT (-85.40000 29.60000),32,12,5,295738,12005,...,H1,G4020,,37460,,A,1964874034,1833915193,30.1591402,-85.5343954
7129,2015-01-04,-85.8,30.4,2,POINT (-85.80000 30.40000),32,12,5,295738,12005,...,H1,G4020,,37460,,A,1964874034,1833915193,30.1591402,-85.5343954
7358,2015-01-04,-85.6,29.9,4,POINT (-85.60000 29.90000),32,12,5,295738,12005,...,H1,G4020,,37460,,A,1964874034,1833915193,30.1591402,-85.5343954
7382,2015-01-04,-85.5,30.0,4,POINT (-85.50000 30.00000),32,12,5,295738,12005,...,H1,G4020,,37460,,A,1964874034,1833915193,30.1591402,-85.5343954
7399,2015-01-04,-85.5,30.1,2,POINT (-85.50000 30.10000),32,12,5,295738,12005,...,H1,G4020,,37460,,A,1964874034,1833915193,30.1591402,-85.5343954


In [12]:
# Keep just the date, the flash count and the county name for the per-county export
keep_cols = ['Time', 'Counts', 'NAME']
flash_data = joined_data[keep_cols]

In [None]:
# Write one CSV of daily flash counts per county.
output_dir = r"F:\Sam\NLDN_LIS\county"
# Create the destination folder up front so to_csv cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

for county_name, county_group in flash_data.groupby("NAME"):
    # Define the CSV file path for the current county
    csv_file_path = os.path.join(output_dir, f"{county_name}_points.csv")

    # NAME is constant within a group and is text, so it must be removed before
    # summing; drop() returns a new frame, so its result has to be used.
    # The chain is non-mutating, which avoids modifying the groupby slice in place.
    daily_counts = (
        county_group
        .drop(columns='NAME')
        .set_index('Time')
        .resample('D')   # one row per calendar day between the first and last record
        .sum()           # days with no records get a total of 0
    )

    # Export the daily totals, keeping the Time index as the first column
    daily_counts.to_csv(csv_file_path, index=True)
    print(f'Processing completed for {county_name}')