In [3]:
import xarray as xr
import pandas as pd
import numpy as np

# Open the NetCDF file
ds = xr.open_dataset('Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Extract the variables
lat = ds['x'].values
lon = ds['y'].values
time = ds['time'].values
pm25 = ds['PM25_AVG'].values

# Create meshgrid of latitude and longitude
lon_grid, lat_grid = np.meshgrid(lon, lat)

# Flatten the spatial coordinates
lat_flat = lat_grid.flatten()
lon_flat = lon_grid.flatten()

# Initialize list to store DataFrames for each timestamp
dfs = []

# Process each timestamp
for t in range(len(time)):
    # Extract PM2.5 data for this timestamp and flatten
    pm25_slice = pm25[t].flatten()
    
    # Create DataFrame for this timestamp
    df_t = pd.DataFrame({
        'Latitude': lat_flat,
        'Longitude': lon_flat,
        'PM2.5': pm25_slice
    })
    
    # Remove any rows with missing PM2.5 values
    df_t = df_t.dropna()
    
    dfs.append(df_t)

# Combine all timestamps into one DataFrame
# Each timestamp will be a new column
final_df = pd.concat([df.set_index(['Latitude', 'Longitude'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = pd.to_datetime(time)

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("Data has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\priom\\Downloads\\M2T1NXAER_5.12.4-20250117_084759\\Downloads\\M2T1NXAER_5.12.4-20250117_084759\\HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc'

In [5]:
import xarray as xr
import pandas as pd
import numpy as np

# Use the correct path - adjust this to match your actual file location
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Rest of the code remains the same as before
lat = ds['x'].values
lon = ds['y'].values
time = ds['time'].values
pm25 = ds['PM25_AVG'].values

KeyError: 'x'

In [7]:
import xarray as xr

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Print information about the dataset
print("Dataset info:")
print(ds.info())

# Print all variable names
print("\nAvailable variables:")
for var_name in ds.variables:
    print(var_name)

# Print all dimension names
print("\nDimensions:")
print(ds.dims)

Dataset info:
xarray.Dataset {
dimensions:
	TSTEP = 366 ;
	VAR = 14 ;
	DATE-TIME = 2 ;
	LAY = 1 ;
	ROW = 299 ;
	COL = 459 ;

variables:
	int32 TFLAG(TSTEP, VAR, DATE-TIME) ;
		TFLAG:units = <YYYYDDD,HHMMSS> ;
		TFLAG:long_name = TFLAG            ;
		TFLAG:var_desc = Timestep-valid flags:  (1) YYYYDDD or (2) HHMMSS                                 ;
	float32 O3_MDA8(TSTEP, LAY, ROW, COL) ;
		O3_MDA8:long_name = O3_MDA8          ;
		O3_MDA8:units = ppbV             ;
		O3_MDA8:var_desc = Max-8-hour                                                                       ;
	float32 O3_AVG(TSTEP, LAY, ROW, COL) ;
		O3_AVG:long_name = O3_AVG           ;
		O3_AVG:units = ppbV             ;
		O3_AVG:var_desc = Daily-average                                                                    ;
	float32 CO_AVG(TSTEP, LAY, ROW, COL) ;
		CO_AVG:long_name = CO_AVG           ;
		CO_AVG:units = ppbV             ;
		CO_AVG:var_desc = Daily-average                                                           

In [9]:
import xarray as xr
import pandas as pd
import numpy as np

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get grid parameters from global attributes
xcell = ds.attrs['XCELL']  # grid cell size in x direction
ycell = ds.attrs['YCELL']  # grid cell size in y direction
xorig = ds.attrs['XORIG']  # x-coordinate of origin
yorig = ds.attrs['YORIG']  # y-coordinate of origin

# Create coordinate arrays
x_coords = xorig + xcell * np.arange(ds.dims['COL'])
y_coords = yorig + ycell * np.arange(ds.dims['ROW'])

# Create latitude/longitude meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Get time information from TFLAG
# TFLAG contains date information in YYYYDDD format
time_flags = ds['TFLAG'][:, 0, 0]  # Get the date part for the first variable
# Convert YYYYDDD to datetime
dates = pd.to_datetime([f"{str(flag)[:4]}-01-01" for flag in time_flags]) + \
        pd.to_timedelta([int(str(flag)[4:]) - 1 for flag in time_flags], unit='D')

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
    # Extract PM2.5 data for this timestamp (taking first layer)
    pm25_slice = pm25_data[t, 0]
    
    # Create DataFrame
    df_t = pd.DataFrame({
        'X_Coord': xx.flatten(),
        'Y_Coord': yy.flatten(),
        'PM2.5': pm25_slice.flatten()
    })
    
    # Remove any missing values
    df_t = df_t.dropna()
    dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['X_Coord', 'Y_Coord'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("Data has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")

DateParseError: Unknown datetime string format, unable to parse: <xar-01-01, at position 0

In [11]:
import xarray as xr
import pandas as pd
import numpy as np

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get grid parameters from global attributes
xcell = ds.attrs['XCELL']  # grid cell size in x direction
ycell = ds.attrs['YCELL']  # grid cell size in y direction
xorig = ds.attrs['XORIG']  # x-coordinate of origin
yorig = ds.attrs['YORIG']  # y-coordinate of origin

# Create coordinate arrays
x_coords = xorig + xcell * np.arange(ds.dims['COL'])
y_coords = yorig + ycell * np.arange(ds.dims['ROW'])

# Create latitude/longitude meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Get time information from TFLAG
time_flags = ds['TFLAG'][:, 0, 0].values  # Get the date part for the first variable

# Create dates list (assuming SDATE from global attributes indicates start date)
start_date = pd.Timestamp(str(ds.attrs['SDATE']))  # Should be 2016001
dates = [start_date + pd.Timedelta(days=i) for i in range(len(time_flags))]

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
    # Extract PM2.5 data for this timestamp (taking first layer)
    pm25_slice = pm25_data[t, 0]
    
    # Create DataFrame
    df_t = pd.DataFrame({
        'X_Coord': xx.flatten(),
        'Y_Coord': yy.flatten(),
        'PM2.5': pm25_slice.flatten()
    })
    
    # Remove any missing values
    df_t = df_t.dropna()
    dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['X_Coord', 'Y_Coord'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("Data has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")

DateParseError: year 2016001 is out of range: 2016001

In [13]:
import xarray as xr
import pandas as pd
import numpy as np

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get grid parameters from global attributes
xcell = ds.attrs['XCELL']  # grid cell size in x direction
ycell = ds.attrs['YCELL']  # grid cell size in y direction
xorig = ds.attrs['XORIG']  # x-coordinate of origin
yorig = ds.attrs['YORIG']  # y-coordinate of origin

# Create coordinate arrays
x_coords = xorig + xcell * np.arange(ds.dims['COL'])
y_coords = yorig + ycell * np.arange(ds.dims['ROW'])

# Create latitude/longitude meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Parse SDATE (format: YYYYDDD)
sdate_str = str(ds.attrs['SDATE'])
year = int(sdate_str[:4])
day_of_year = int(sdate_str[4:])
start_date = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=day_of_year-1)

# Create dates list
dates = [start_date + pd.Timedelta(days=i) for i in range(ds.dims['TSTEP'])]

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
   # Extract PM2.5 data for this timestamp (taking first layer)
   pm25_slice = pm25_data[t, 0]
   
   # Create DataFrame
   df_t = pd.DataFrame({
       'X_Coord': xx.flatten(),
       'Y_Coord': yy.flatten(),
       'PM2.5': pm25_slice.flatten()
   })
   
   # Remove any missing values
   df_t = df_t.dropna()
   dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['X_Coord', 'Y_Coord'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Preview the data
print("\nPreview of the first 5 rows and 5 columns of the data:")
print(final_df.iloc[:5, :5])

print("\nDataFrame Info:")
print(final_df.info())

print("\nBasic statistics of PM2.5 values:")
print(final_df.describe())

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("\nData has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")


Preview of the first 5 rows and 5 columns of the data:
                       2016-01-01  2016-01-02  2016-01-03  2016-01-04  \
X_Coord    Y_Coord                                                      
-2556000.0 -1728000.0    1.413358    1.331079    1.376337    2.190387   
-2544000.0 -1728000.0    1.398102    1.331326    1.389859    2.272398   
-2532000.0 -1728000.0    1.372702    1.334618    1.404449    2.367060   
-2520000.0 -1728000.0    1.354613    1.340229    1.406716    2.481329   
-2508000.0 -1728000.0    1.335252    1.340366    1.415844    2.504131   

                       2016-01-05  
X_Coord    Y_Coord                 
-2556000.0 -1728000.0    4.519111  
-2544000.0 -1728000.0    4.704486  
-2532000.0 -1728000.0    4.892936  
-2520000.0 -1728000.0    4.997709  
-2508000.0 -1728000.0    5.047138  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 137241 entries, (-2556000.0, -1728000.0) to (2940000.0, 1848000.0)
Columns: 366 entries, 2016-01-01 to 2016-12-31

In [15]:
import xarray as xr
import pandas as pd
import numpy as np
from pyproj import Transformer

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get projection parameters from global attributes
p_alp = ds.attrs['P_ALP']  # First projection parallel
p_bet = ds.attrs['P_BET']  # Second projection parallel
p_gam = ds.attrs['P_GAM']  # Central meridian
xcent = ds.attrs['XCENT']  # Projection center longitude
ycent = ds.attrs['YCENT']  # Projection center latitude

# Create the projection transformer
proj_str = f"+proj=lcc +lat_1={p_alp} +lat_2={p_bet} +lat_0={ycent} +lon_0={xcent} +x_0=0 +y_0=0 +ellps=sphere +units=m +no_defs"
transformer = Transformer.from_crs(proj_str, "EPSG:4326", always_xy=True)

# Create coordinate arrays
x_coords = np.arange(ds.dims['COL']) * ds.attrs['XCELL'] + ds.attrs['XORIG']
y_coords = np.arange(ds.dims['ROW']) * ds.attrs['YCELL'] + ds.attrs['YORIG']

# Create meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Transform coordinates to lat/lon
lons, lats = transformer.transform(xx, yy)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Parse SDATE (format: YYYYDDD)
sdate_str = str(ds.attrs['SDATE'])
year = int(sdate_str[:4])
day_of_year = int(sdate_str[4:])
start_date = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=day_of_year-1)

# Create dates list
dates = [start_date + pd.Timedelta(days=i) for i in range(ds.dims['TSTEP'])]

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
    # Extract PM2.5 data for this timestamp (taking first layer)
    pm25_slice = pm25_data[t, 0]
    
    # Create DataFrame
    df_t = pd.DataFrame({
        'Latitude': lats.flatten(),
        'Longitude': lons.flatten(),
        'PM2.5': pm25_slice.flatten()
    })
    
    # Remove any missing values
    df_t = df_t.dropna()
    dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['Latitude', 'Longitude'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Preview the data
print("\nPreview of the first 5 rows and 5 columns of the data:")
print(final_df.iloc[:5, :5])

print("\nDataFrame Info:")
print(final_df.info())

print("\nBasic statistics of PM2.5 values:")
print(final_df.describe())

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("\nData has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")

ModuleNotFoundError: No module named 'pyproj'

In [17]:
pip install pyproj


Collecting pyproj
  Downloading pyproj-3.7.0-cp312-cp312-win_amd64.whl.metadata (31 kB)
Downloading pyproj-3.7.0-cp312-cp312-win_amd64.whl (6.2 MB)
   ---------------------------------------- 0.0/6.2 MB ? eta -:--:--
   ---------- ----------------------------- 1.6/6.2 MB 8.4 MB/s eta 0:00:01
   ------------- -------------------------- 2.1/6.2 MB 6.5 MB/s eta 0:00:01
   --------------- ------------------------ 2.4/6.2 MB 5.2 MB/s eta 0:00:01
   ---------------- ----------------------- 2.6/6.2 MB 3.6 MB/s eta 0:00:02
   ------------------ --------------------- 2.9/6.2 MB 2.8 MB/s eta 0:00:02
   -------------------- ------------------- 3.1/6.2 MB 2.4 MB/s eta 0:00:02
   -------------------- ------------------- 3.1/6.2 MB 2.4 MB/s eta 0:00:02
   --------------------- ------------------ 3.4/6.2 MB 2.2 MB/s eta 0:00:02
   ------------------------- -------------- 3.9/6.2 MB 2.1 MB/s eta 0:00:02
   -------------------------- ------------- 4.2/6.2 MB 2.1 MB/s eta 0:00:01
   --------------------

In [19]:
import xarray as xr
import pandas as pd
import numpy as np
from pyproj import Transformer

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get projection parameters from global attributes
p_alp = ds.attrs['P_ALP']  # First projection parallel
p_bet = ds.attrs['P_BET']  # Second projection parallel
p_gam = ds.attrs['P_GAM']  # Central meridian
xcent = ds.attrs['XCENT']  # Projection center longitude
ycent = ds.attrs['YCENT']  # Projection center latitude

# Create the projection transformer
proj_str = f"+proj=lcc +lat_1={p_alp} +lat_2={p_bet} +lat_0={ycent} +lon_0={xcent} +x_0=0 +y_0=0 +ellps=sphere +units=m +no_defs"
transformer = Transformer.from_crs(proj_str, "EPSG:4326", always_xy=True)

# Create coordinate arrays
x_coords = np.arange(ds.dims['COL']) * ds.attrs['XCELL'] + ds.attrs['XORIG']
y_coords = np.arange(ds.dims['ROW']) * ds.attrs['YCELL'] + ds.attrs['YORIG']

# Create meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Transform coordinates to lat/lon
lons, lats = transformer.transform(xx, yy)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Parse SDATE (format: YYYYDDD)
sdate_str = str(ds.attrs['SDATE'])
year = int(sdate_str[:4])
day_of_year = int(sdate_str[4:])
start_date = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=day_of_year-1)

# Create dates list
dates = [start_date + pd.Timedelta(days=i) for i in range(ds.dims['TSTEP'])]

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
    # Extract PM2.5 data for this timestamp (taking first layer)
    pm25_slice = pm25_data[t, 0]
    
    # Create DataFrame
    df_t = pd.DataFrame({
        'Latitude': lats.flatten(),
        'Longitude': lons.flatten(),
        'PM2.5': pm25_slice.flatten()
    })
    
    # Remove any missing values
    df_t = df_t.dropna()
    dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['Latitude', 'Longitude'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Preview the data
print("\nPreview of the first 5 rows and 5 columns of the data:")
print(final_df.iloc[:5, :5])

print("\nDataFrame Info:")
print(final_df.info())

print("\nBasic statistics of PM2.5 values:")
print(final_df.describe())

# Save to CSV
final_df.to_csv('pm25_data.csv')

# Close the dataset
ds.close()

print("\nData has been processed and saved to 'pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")


Preview of the first 5 rows and 5 columns of the data:
                       2016-01-01  2016-01-02  2016-01-03  2016-01-04  \
Latitude  Longitude                                                     
21.560428 -121.060390    1.413358    1.331079    1.376337    2.190387   
21.587541 -120.952601    1.398102    1.331326    1.389859    2.272398   
21.614538 -120.844743    1.372702    1.334618    1.404449    2.367060   
21.641420 -120.736817    1.354613    1.340229    1.406716    2.481329   
21.668187 -120.628821    1.335252    1.340366    1.415844    2.504131   

                       2016-01-05  
Latitude  Longitude                
21.560428 -121.060390    4.519111  
21.587541 -120.952601    4.704486  
21.614538 -120.844743    4.892936  
21.641420 -120.736817    4.997709  
21.668187 -120.628821    5.047138  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 137241 entries, (21.560427879385422, -121.06039038106123) to (50.33150254956958, -54.688006819308974)
Columns: 36

In [20]:
import xarray as xr
import pandas as pd
import numpy as np
from pyproj import Transformer

# Open the file
ds = xr.open_dataset('C:/Users/priom/Downloads/M2T1NXAER_5.12.4-20250117_084759/HR2DAY_LST_ACONC_EQUATES_v532_12US1_2016.nc')

# Get projection parameters from global attributes
p_alp = ds.attrs['P_ALP']  # First projection parallel
p_bet = ds.attrs['P_BET']  # Second projection parallel
p_gam = ds.attrs['P_GAM']  # Central meridian
xcent = ds.attrs['XCENT']  # Projection center longitude
ycent = ds.attrs['YCENT']  # Projection center latitude

# Create the projection transformer
proj_str = f"+proj=lcc +lat_1={p_alp} +lat_2={p_bet} +lat_0={ycent} +lon_0={xcent} +x_0=0 +y_0=0 +ellps=sphere +units=m +no_defs"
transformer = Transformer.from_crs(proj_str, "EPSG:4326", always_xy=True)

# Create coordinate arrays
x_coords = np.arange(ds.dims['COL']) * ds.attrs['XCELL'] + ds.attrs['XORIG']
y_coords = np.arange(ds.dims['ROW']) * ds.attrs['YCELL'] + ds.attrs['YORIG']

# Create meshgrid
xx, yy = np.meshgrid(x_coords, y_coords)

# Transform coordinates to lat/lon
lons, lats = transformer.transform(xx, yy)

# Washington state boundaries
wa_lat_min = 45.543541
wa_lat_max = 49.002494
wa_lon_min = -124.848974
wa_lon_max = -116.916071

# Create mask for Washington state
wa_mask = (lats >= wa_lat_min) & (lats <= wa_lat_max) & \
          (lons >= wa_lon_min) & (lons <= wa_lon_max)

# Get PM2.5 data
pm25_data = ds['PM25_AVG'].values

# Parse SDATE (format: YYYYDDD)
sdate_str = str(ds.attrs['SDATE'])
year = int(sdate_str[:4])
day_of_year = int(sdate_str[4:])
start_date = pd.Timestamp(year, 1, 1) + pd.Timedelta(days=day_of_year-1)

# Create dates list
dates = [start_date + pd.Timedelta(days=i) for i in range(ds.dims['TSTEP'])]

# Create DataFrame for each timestep
dfs = []
for t in range(len(dates)):
    # Extract PM2.5 data for this timestamp (taking first layer)
    pm25_slice = pm25_data[t, 0]
    
    # Create DataFrame
    df_t = pd.DataFrame({
        'Latitude': lats[wa_mask],
        'Longitude': lons[wa_mask],
        'PM2.5': pm25_slice[wa_mask]
    })
    
    # Remove any missing values
    df_t = df_t.dropna()
    dfs.append(df_t)

# Combine all timestamps
final_df = pd.concat([df.set_index(['Latitude', 'Longitude'])['PM2.5'] for df in dfs], axis=1)
final_df.columns = dates

# Preview the data
print("\nPreview of the first 5 rows and 5 columns of the data:")
print(final_df.iloc[:5, :5])

print("\nDataFrame Info:")
print(final_df.info())

print("\nBasic statistics of PM2.5 values:")
print(final_df.describe())

# Save to CSV
final_df.to_csv('washington_pm25_data.csv')

# Close the dataset
ds.close()

print("\nData has been processed and saved to 'washington_pm25_data.csv'")
print(f"Shape of final dataset: {final_df.shape}")


Preview of the first 5 rows and 5 columns of the data:
                       2016-01-01  2016-01-02  2016-01-03  2016-01-04  \
Latitude  Longitude                                                     
45.550694 -117.335418    2.079100    3.057056    2.835236    2.894924   
45.574528 -117.185260    1.769747    2.589481    2.957903    2.991938   
45.598188 -117.034990    2.255016    2.373785    3.058101    3.152895   
45.558569 -117.970070    6.242252    3.597657    4.047945    4.324589   
45.583136 -117.820134    6.572935    3.742129    3.636909    3.776148   

                       2016-01-05  
Latitude  Longitude                
45.550694 -117.335418    2.224461  
45.574528 -117.185260    2.674171  
45.598188 -117.034990    3.387901  
45.558569 -117.970070    4.463864  
45.583136 -117.820134    3.771707  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1618 entries, (45.55069406771397, -117.33541827668462) to (49.001797060257985, -124.61160298766463)
Columns: 366 

In [None]:
##Removing the repetation of lat, long 


import pandas as pd
import numpy as np

def process_pm25_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Get unique coordinates (latitude, longitude pairs)
    unique_coords = df[['Latitude', 'Longitude']].drop_duplicates().sort_values(['Latitude', 'Longitude'])
    
    # Get all unique dates
    dates = sorted(df['Date'].unique())
    
    # Create a new dataframe with latitude and longitude as the first columns
    result = pd.DataFrame(unique_coords)
    
    # Add columns for each date with corresponding PM2.5 values
    for date in dates:
        date_data = df[df['Date'] == date]
        
        # Create a mapping of coordinates to PM2.5 values for this date
        pm25_values = {}
        for _, row in date_data.iterrows():
            coord_key = (row['Latitude'], row['Longitude'])
            pm25_values[coord_key] = row['Daily Mean PM2.5 Concentration']
        
        # Add the PM2.5 values for this date as a new column
        result[date] = result.apply(
            lambda x: pm25_values.get((x['Latitude'], x['Longitude']), np.nan), 
            axis=1
        )
    
    return result

# Example usage
file_path = 'PM2.5_WA_DM.csv'
result_df = process_pm25_data(file_path)

# Save to CSV
result_df.to_csv('processed_pm25_data.csv', index=False)

# Display first few rows and columns
print("\nFirst few rows and columns of the processed data:")
print(result_df.iloc[:5, :7])  # Show first 5 rows and 7 columns

# Display shape of the resulting dataframe
print("\nShape of the processed dataframe:", result_df.shape)