In [1]:
from google.colab import drive
drive.mount('/content/drive/')

%cd '/content/drive/MyDrive/Colab Notebooks/Capstone Project'

import pandas as pd
from IPython.display import display

# Load datasets (adjust file names as needed)
df_bird = pd.read_parquet('sabina.parquet')
df_climate = pd.read_parquet('climate_merged_final.parquet')
df_land = pd.read_parquet('last_version_of_merged_data.parquet')

# Quick check: display sample rows from each dataset.
print("Bird Data Sample:")
display(df_bird.head(5))
print("Climate Data Sample:")
display(df_climate.head(5))
print("Land Data Sample:")
display(df_land.head(5))


Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/Capstone Project
Bird Data Sample:


Unnamed: 0,LATITUDE,LONGITUDE,OBSERVATION DATE
0,32.045333,-115.908508,1995-11
1,30.409449,-115.945759,1998-03
2,30.409449,-115.945759,1998-03
3,30.409449,-115.945759,1998-03
4,30.409449,-115.945759,1998-03


Climate Data Sample:


Unnamed: 0,Station_ID,Date,Latitude,Longitude,Elevation,State,StationName,Temp_celsius,Precip_mm,TempFlags,Precip_MeasurementFlag,Precip_QCFlag,Precip_SourceFlag,Precip_SourceIndex,Precip_FirstYear,Precip_LastYear
0,CA001011500,1961-01,48.9333,-123.75,75.0,,BC CHEMAINUS,5.01,372.3,E,,,D,23584.0,1919,2024
1,CA001011500,1962-01,48.9333,-123.75,75.0,,BC CHEMAINUS,3.53,107.4,E,,,D,23584.0,1919,2024
2,CA001011500,1963-01,48.9333,-123.75,75.0,,BC CHEMAINUS,1.55,33.5,E,,,D,23584.0,1919,2024
3,CA001011500,1964-01,48.9333,-123.75,75.0,,BC CHEMAINUS,4.56,302.5,E,,,D,23584.0,1919,2024
4,CA001011500,1965-01,48.9333,-123.75,75.0,,BC CHEMAINUS,2.73,169.4,E,,,D,23584.0,1919,2024


Land Data Sample:


Unnamed: 0,Year,Date,Latitude,Longitude,Region,Land_Cover_Type,Land_Cover_Description
0,2001,2001-01,83.0,-170.0,Canada,0,Water
1,2001,2001-01,83.0,-169.949958,Canada,0,Water
2,2001,2001-01,83.0,-169.899917,Canada,0,Water
3,2001,2001-01,83.0,-169.849875,Canada,0,Water
4,2001,2001-01,83.0,-169.799833,Canada,0,Water


In [2]:
# Standardize column names (if needed)
for df in [df_bird, df_climate, df_land]:
    df.rename(columns={'LATITUDE': 'Latitude', 'LONGITUDE': 'Longitude', 'OBSERVATION DATE': 'Date'}, inplace=True)

# Round coordinates to 4 decimals.
df_bird['Latitude'] = df_bird['Latitude'].round(4)
df_bird['Longitude'] = df_bird['Longitude'].round(4)

df_climate['Latitude'] = df_climate['Latitude'].round(4)
df_climate['Longitude'] = df_climate['Longitude'].round(4)

df_land['Latitude'] = df_land['Latitude'].round(4)
df_land['Longitude'] = df_land['Longitude'].round(4)

# Check Block 2:
print("Bird Columns:")
print(df_bird.columns.tolist())
display(df_bird[['Date','Latitude', 'Longitude']].head(5))

print("Climate Columns:")
print(df_climate.columns.tolist())
display(df_climate[['Date','Latitude', 'Longitude', 'Temp_celsius', 'Precip_mm']].head(5))

print("Land Columns:")
print(df_land.columns.tolist())
display(df_land[['Date','Latitude', 'Longitude', 'Land_Cover_Description']].head(5))


Bird Columns:
['Latitude', 'Longitude', 'Date']


Unnamed: 0,Date,Latitude,Longitude
0,1995-11,32.0453,-115.9085
1,1998-03,30.4094,-115.9458
2,1998-03,30.4094,-115.9458
3,1998-03,30.4094,-115.9458
4,1998-03,30.4094,-115.9458


Climate Columns:
['Station_ID', 'Date', 'Latitude', 'Longitude', 'Elevation', 'State', 'StationName', 'Temp_celsius', 'Precip_mm', 'TempFlags', 'Precip_MeasurementFlag', 'Precip_QCFlag', 'Precip_SourceFlag', 'Precip_SourceIndex', 'Precip_FirstYear', 'Precip_LastYear']


Unnamed: 0,Date,Latitude,Longitude,Temp_celsius,Precip_mm
0,1961-01,48.9333,-123.75,5.01,372.3
1,1962-01,48.9333,-123.75,3.53,107.4
2,1963-01,48.9333,-123.75,1.55,33.5
3,1964-01,48.9333,-123.75,4.56,302.5
4,1965-01,48.9333,-123.75,2.73,169.4


Land Columns:
['Year', 'Date', 'Latitude', 'Longitude', 'Region', 'Land_Cover_Type', 'Land_Cover_Description']


Unnamed: 0,Date,Latitude,Longitude,Land_Cover_Description
0,2001-01,83.0,-170.0,Water
1,2001-01,83.0,-169.95,Water
2,2001-01,83.0,-169.8999,Water
3,2001-01,83.0,-169.8499,Water
4,2001-01,83.0,-169.7998,Water


In [4]:
from scipy.spatial import cKDTree

# Build a KDTree for the climate dataset based on Longitude and Latitude.
climate_tree = cKDTree(df_climate[['Longitude', 'Latitude']].values)

def get_nearest_climate(row):
    """
    For a given bird record, find the nearest climate record.
    Returns a Pandas Series with the temperature and precipitation values.
    """
    # Query the nearest climate record.
    distance, idx = climate_tree.query([row['Longitude'], row['Latitude']], k=1)
    nearest = df_climate.iloc[idx]
    return pd.Series({
        'Temp_celsius': nearest['Temp_celsius'],
        'Precip_mm': nearest['Precip_mm']
    })

# Apply the nearest-neighbor lookup to each bird record.
df_bird_nn = df_bird.copy()
climate_info = df_bird_nn.apply(get_nearest_climate, axis=1)
df_bird_nn = pd.concat([df_bird_nn, climate_info], axis=1)

# Check Block 3:
print("Bird data with nearest climate values:")
display(df_bird_nn.head(5))


Bird data with nearest climate values:


Unnamed: 0,Latitude,Longitude,Date,Temp_celsius,Precip_mm
0,32.0453,-115.9085,1995-11,18.92,5.8
1,30.4094,-115.9458,1998-03,14.92,0.0
2,30.4094,-115.9458,1998-03,14.92,0.0
3,30.4094,-115.9458,1998-03,14.92,0.0
4,30.4094,-115.9458,1998-03,14.92,0.0


In [5]:
# Build a KDTree for the land dataset based on Longitude and Latitude.
land_tree = cKDTree(df_land[['Longitude', 'Latitude']].values)

def get_nearest_land(row):
    """
    For a given bird record, find the nearest land record.
    Returns the Land_Cover_Description from the nearest record.
    """
    distance, idx = land_tree.query([row['Longitude'], row['Latitude']], k=1)
    nearest = df_land.iloc[idx]
    return nearest['Land_Cover_Description']

# Apply the function to each bird record.
df_bird_nn['Land_Cover_Description'] = df_bird_nn.apply(get_nearest_land, axis=1)

# Check Block 4:
print("Bird data with nearest land cover:")
display(df_bird_nn[['Date', 'Latitude', 'Longitude', 'Land_Cover_Description']].head(5))


Bird data with nearest land cover:


Unnamed: 0,Date,Latitude,Longitude,Land_Cover_Description
0,1995-11,32.0453,-115.9085,Open Shrublands
1,1998-03,30.4094,-115.9458,Grasslands
2,1998-03,30.4094,-115.9458,Grasslands
3,1998-03,30.4094,-115.9458,Grasslands
4,1998-03,30.4094,-115.9458,Grasslands


In [6]:
# In this approach, df_bird_nn already represents your bird dataset
# enriched with the nearest climate (Temp_celsius, Precip_mm) and land (Land_Cover_Description) values.
df_merged_final = df_bird_nn.copy()

# Check Block 5:
print("Final merged dataset shape:", df_merged_final.shape)
print("Final merged dataset columns:")
print(df_merged_final.columns.tolist())
display(df_merged_final.head(10))


Final merged dataset shape: (7498836, 6)
Final merged dataset columns:
['Latitude', 'Longitude', 'Date', 'Temp_celsius', 'Precip_mm', 'Land_Cover_Description']


Unnamed: 0,Latitude,Longitude,Date,Temp_celsius,Precip_mm,Land_Cover_Description
0,32.0453,-115.9085,1995-11,18.92,5.8,Open Shrublands
1,30.4094,-115.9458,1998-03,14.92,0.0,Grasslands
2,30.4094,-115.9458,1998-03,14.92,0.0,Grasslands
3,30.4094,-115.9458,1998-03,14.92,0.0,Grasslands
4,30.4094,-115.9458,1998-03,14.92,0.0,Grasslands
5,30.4094,-115.9458,1998-03,14.92,0.0,Grasslands
6,32.0453,-115.9085,2001-03,18.92,5.8,Open Shrublands
7,32.0453,-115.9085,2002-01,18.92,5.8,Open Shrublands
8,31.767,-116.597,2010-12,15.87,0.0,Water
9,32.0453,-115.9085,2012-02,18.92,5.8,Open Shrublands


In [7]:
# Convert Date to datetime and then create a YearMonth column (monthly period)
df_merged_final['Date'] = pd.to_datetime(df_merged_final['Date'])
df_merged_final['YearMonth'] = df_merged_final['Date'].dt.to_period('M')

# Define aggregation rules. Adjust these if your column names differ.
agg_rules = {
    'Temp_celsius': 'mean',  # Average temperature over the month.
    'Precip_mm': 'sum',      # Total precipitation over the month.
    'Land_Cover_Description': 'first'  # Take the first non-null land cover (assuming consistency)
}

df_monthly = df_merged_final.groupby(['YearMonth', 'Latitude', 'Longitude'], as_index=False).agg(agg_rules)

# Optionally, convert YearMonth to string for easier viewing.
df_monthly['YearMonth'] = df_monthly['YearMonth'].astype(str)

# Check Block 6:
print("Monthly aggregated data shape:", df_monthly.shape)
display(df_monthly.head(10))

Monthly aggregated data shape: (3036868, 6)


Unnamed: 0,YearMonth,Latitude,Longitude,Temp_celsius,Precip_mm,Land_Cover_Description
0,1974-01,25.2868,-80.8986,19.98,63.3,Permanent Wetlands
1,1974-01,29.1354,-83.034,27.9,103.5,Water
2,1974-01,32.5013,-93.8096,15.76,24.5,Woody Savannas
3,1974-01,33.7386,-96.7528,5.1,36.4,Grasslands
4,1974-01,34.6154,-93.1843,7.27,257.8,Mixed Forests
5,1974-01,35.1398,-93.0542,6.94,358.8,Woody Savannas
6,1974-01,37.5822,-106.0947,12.1,11.3,Croplands
7,1974-01,39.9431,-104.7625,23.82,27.4,Grasslands
8,1974-01,41.5777,-74.7848,14.44,35.0,Deciduous Broadleaf Forest
9,1974-01,42.2825,-96.3355,15.32,101.2,Croplands


In [8]:
output_file = "/content/drive/MyDrive/Colab Notebooks/Capstone Project/harmonized_dataset_ProjectMilestone__01.csv"
df_monthly.to_csv(output_file, index=False)
print(f"Harmonized dataset saved to: {output_file}")

Harmonized dataset saved to: /content/drive/MyDrive/Colab Notebooks/Capstone Project/harmonized_dataset_ProjectMilestone__01.csv
