In [51]:
import os
import glob

import pandas as pd
import numpy as np

In [52]:
# # Load CSVs into DataFrames
# canopint_df = pd.read_csv("Dataset/CanopInt_inst_data.csv")
# esoil_df = pd.read_csv("Dataset/ESoil_tavg_data.csv")
# rainf_df = pd.read_csv("Dataset/Rainf_tavg_data.csv")
# snowf_df = pd.read_csv("Dataset/Snowf_tavg_data.csv")

# soil_moi_0_10_df = pd.read_csv("Dataset/SoilMoi0_10cm_inst_data.csv")
# soil_moi_10_40_df = pd.read_csv("Dataset/SoilMoi10_40cm_inst_data.csv")
# soil_moi_40_100_df = pd.read_csv("Dataset/SoilMoi40_100cm_inst_data.csv")
# soil_moi_100_200_df = pd.read_csv("Dataset/SoilMoi100_200cm_inst_data.csv")

# soil_tmp_0_10_df = pd.read_csv("Dataset/SoilTMP0_10cm_inst_data.csv")
# soil_tmp_10_40_df = pd.read_csv("Dataset/SoilTMP10_40cm_inst_data.csv")
# soil_tmp_40_100_df = pd.read_csv("Dataset/SoilTMP40_100cm_inst_data.csv")
# soil_tmp_100_200_df = pd.read_csv("Dataset/SoilTMP100_200cm_inst_data.csv")

# tveg_df = pd.read_csv("Dataset/TVeg_tavg_data.csv")
# tws_df = pd.read_csv("Dataset/TWS_inst_data.csv")

In [53]:
# Folder where your CSVs are stored
dataset_path = 'Dataset/'  # change to your actual folder path

# glob returns matches in an arbitrary, OS-dependent order; sort them so the
# order of `dfs` (and therefore of the merged columns) is the same on every run.
csv_files = sorted(glob.glob(os.path.join(dataset_path, '*.csv')))

# Tables that are not part of the per-variable stack built here: only their
# shape is reported, and they are read again by the cells that need them.
skipped_files = {
    "country_latitude_longitude_area_lookup",
    "Land_cover_percent_data",
    "Yield_and_Production_data",
}

# List to hold all DataFrames
dfs = []

# Load each remaining CSV
for file_path in csv_files:
    # basename() understands the separator of the current OS; splitting on a
    # literal backslash raised IndexError everywhere except Windows.
    file_name = os.path.basename(file_path).split('.')[0]

    if file_name in skipped_files:
        print(pd.read_csv(file_path).shape)
        continue

    df = pd.read_csv(file_path)

    # Print the shape
    print(f"{file_name}: {df.shape}")

    dfs.append(df)

CanopInt_inst_data: (194298, 15)
(245, 5)
ESoil_tavg_data: (194298, 15)
(644280, 20)
Rainf_tavg_data: (194298, 15)
Snowf_tavg_data: (194298, 15)
SoilMoi0_10cm_inst_data: (194298, 15)
SoilMoi100_200cm_inst_data: (194298, 15)
SoilMoi10_40cm_inst_data: (194298, 15)
SoilMoi40_100cm_inst_data: (194298, 15)
SoilTMP0_10cm_inst_data: (194298, 15)
SoilTMP100_200cm_inst_data: (194298, 15)
SoilTMP10_40cm_inst_data: (194298, 15)
SoilTMP40_100cm_inst_data: (194298, 15)
TVeg_tavg_data: (194298, 15)
TWS_inst_data: (194298, 15)
(158269, 8)


In [54]:
def _collapse_monthly_columns(frame):
    """Return a copy of ``frame`` with its 12 monthly columns replaced by their row mean.

    The new column is named ``<prefix>_mean_month`` where ``<prefix>`` is the
    text before the first underscore of the first monthly column, and it is
    placed after the remaining columns.

    A frame that has already been collapsed (its only 'month' column is the
    ``*_mean_month`` one) is returned unchanged, so re-running the cell is safe.

    Raises:
        ValueError: if the frame does not contain exactly 12 columns whose
            name includes 'month'.
    """
    month_cols = [col for col in frame.columns if 'month' in col.lower()]

    # Already processed by an earlier run of this cell: nothing left to do.
    if len(month_cols) == 1 and month_cols[0].endswith('_mean_month'):
        return frame

    if len(month_cols) != 12:
        raise ValueError(
            f"expected 12 monthly columns, found {len(month_cols)}: {month_cols}"
        )

    col_name = month_cols[0].split('_')[0]

    # Row-wise mean over the 12 months, computed before the raw columns go away
    monthly_mean = frame[month_cols].mean(axis=1)

    return frame.drop(columns=month_cols).assign(
        **{f'{col_name}_mean_month': monthly_mean}
    )


# Build new frames instead of mutating the loaded ones in place
dfs = [_collapse_monthly_columns(df) for df in dfs]

In [55]:
# Reuse the cached merge when it is on disk; otherwise outer-join every
# per-variable frame on the shared (longitude, latitude, year) key.
if not os.path.exists('merged_all.csv'):
    merged_df = dfs[0]

    for next_df in dfs[1:]:
        merged_df = merged_df.merge(
            next_df,
            how='outer',
            on=['longitude', 'latitude', 'year'],
        )

else:
    merged_df = pd.read_csv('merged_all.csv')

In [56]:
land_cover_df = pd.read_csv("Dataset/Land_cover_percent_data.csv")

# Inner join: only (longitude, latitude, year) keys present in both tables survive.
merged_df = merged_df.merge(
    land_cover_df,
    how='inner',
    on=['longitude', 'latitude', 'year'],
)

In [57]:
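# Save the result (disabled).
# NOTE: at this point merged_df already contains the land-cover columns. A
# 'merged_all.csv' written here would be picked up by the cache check in the
# merge cell and then merged with the land-cover table a second time.
# merged_df.to_csv('merged_all.csv', index=False)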

In [58]:
print("✅ Merged DataFrame shape:", merged_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
merged_df.info()

✅ Merged DataFrame shape: (194298, 34)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194298 entries, 0 to 194297
Data columns (total 34 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   longitude                    194298 non-null  float64
 1   latitude                     194298 non-null  float64
 2   year                         194298 non-null  int64  
 3   CanopInt_mean_month          194298 non-null  float64
 4   ESoil_mean_month             194298 non-null  float64
 5   Rainf_mean_month             194298 non-null  float64
 6   Snowf_mean_month             194298 non-null  float64
 7   SoilMoi0_mean_month          194298 non-null  float64
 8   SoilMoi100_mean_month        194298 non-null  float64
 9   SoilMoi10_mean_month         194298 non-null  float64
 10  SoilMoi40_mean_month         194298 non-null  float64
 11  SoilTMP0_mean_month          194298 non-null  float64
 12  SoilTMP100_mean_mon

In [59]:
# Load the country lookup table and discard its 'area' column in one step.
country_lookup_df = pd.read_csv(
    "Dataset/country_latitude_longitude_area_lookup.csv"
).drop(columns=['area'])

In [60]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle separation between two points, as an angle in degrees.

    Uses the haversine formula. The result is the central angle between the
    points (0 for identical points, 180 for antipodal ones), NOT a length:
    multiply ``np.radians(result)`` by a sphere radius to obtain a distance.

    Args:
        lat1, lon1: latitude and longitude of the first point(s), in degrees.
        lat2, lon2: latitude and longitude of the second point(s), in degrees.
            Scalars and arrays may be mixed; normal numpy broadcasting applies.

    Returns:
        The central angle in degrees, with the broadcast shape of the inputs.
    """
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Mathematically 0 <= a <= 1, but rounding can push it marginally outside
    # that range for (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return np.degrees(c)

In [61]:
def find_country_for_point(row, country_lookup_df):
    """Return the first country whose centroid circle contains the point in ``row``.

    The comparison against all countries is done with numpy arrays in one go;
    the function itself handles a single point per call.

    Args:
        row: mapping with 'latitude' and 'longitude' entries (e.g. a DataFrame row).
        country_lookup_df: table with 'country', 'centroid latitude',
            'centroid longitude' and 'centroid radius' columns.

    Returns:
        The 'country' value of the first matching lookup row, in table order,
        or None when no circle contains the point.
    """
    centre_lats = country_lookup_df['centroid latitude'].values
    centre_lons = country_lookup_df['centroid longitude'].values
    reach = country_lookup_df['centroid radius'].values

    # Separation between the point and every centroid at once.
    # NOTE(review): compared as-is with 'centroid radius', so both must be in
    # the same unit — confirm against haversine() and the lookup table.
    separation = haversine(row['latitude'], row['longitude'], centre_lats, centre_lons)

    candidates = country_lookup_df[separation <= reach]

    if candidates.empty:
        return None

    # NOTE(review): when circles overlap, the winner is simply the earliest row
    # of the lookup table, not the nearest centroid — confirm this is intended.
    return candidates.iloc[0]['country']

# Look up a country for every row (row-wise apply: one call per grid point and year)
merged_df['country'] = merged_df.apply(
    lambda point: find_country_for_point(point, country_lookup_df=country_lookup_df),
    axis=1,
)

# Display results
print(merged_df)

        longitude  latitude  year  CanopInt_mean_month  ESoil_mean_month  \
0          -176.5      66.5  2010             0.021199         17.154338   
1          -176.5      66.5  2011             0.020344         18.033296   
2          -176.5      66.5  2012             0.021931         19.889972   
3          -176.5      66.5  2013             0.022761         17.504125   
4          -176.5      66.5  2014             0.020658         19.887469   
...           ...       ...   ...                  ...               ...   
194293      176.5      69.5  2018             0.020494         18.554555   
194294      176.5      69.5  2019             0.022953         17.026149   
194295      176.5      69.5  2020             0.022220         17.149715   
194296      176.5      69.5  2021             0.022134         15.827755   
194297      176.5      69.5  2022             0.019612         15.948548   

        Rainf_mean_month  Snowf_mean_month  SoilMoi0_mean_month  \
0               0.00

In [62]:
# isna() flags both None and NaN, so the count stays correct even if the
# column has been through a CSV round trip (where missing values come back as NaN).
country_missing = merged_df['country'].isna()

count = int(country_missing.sum())
total = len(country_missing)

print(f'Number of points without a country: {count}, total = {total}')

Number of points without a country: 34515, total = 194298


In [63]:
# Load the yield and production table, one of the three CSVs that the
# dataset loader loop skips instead of adding to `dfs`.
yield_production_df = pd.read_csv("Dataset/Yield_and_Production_data.csv")

In [None]:
# Create temporal aggregations
def aggregate_environmental_data(df, agg_functions=None):
    """Aggregate every remaining column of ``df`` per (country, year).

    Args:
        df: frame containing 'country' and 'year' columns plus the columns
            to aggregate.
        agg_functions: aggregation spec forwarded to ``GroupBy.agg``.
            Defaults to ``['mean', 'min', 'max']``.

    Returns:
        The result of ``df.groupby(['country', 'year']).agg(agg_functions)``.
        With a list of functions this is indexed by (country, year) and has
        two-level columns: (original column, function name).
    """
    # Build the default per call rather than sharing one mutable list object
    # across all calls through the signature.
    if agg_functions is None:
        agg_functions = ['mean', 'min', 'max']

    # Group by country and year, then apply aggregations
    return df.groupby(['country', 'year']).agg(agg_functions)

# 4. Merge datasets
def create_feature_matrix(base_df, env_data_dict):
    """Left-join per-(country, year) aggregates of each variable onto ``base_df``.

    Args:
        base_df: frame with 'country' and 'year' columns; it is not modified.
        env_data_dict: mapping of variable name -> frame accepted by
            ``aggregate_environmental_data``.

    Returns:
        A copy of ``base_df`` with one extra column per aggregated feature,
        named ``<variable name>_<source column>_<function>``. Rows without a
        matching (country, year) get NaN in those columns.
    """
    result = base_df.copy()

    for var_name, data in env_data_dict.items():
        agg_data = aggregate_environmental_data(data)

        # The aggregate has (column, function) two-level columns and carries
        # country/year in its index. pandas refuses to merge frames whose
        # columns have a different number of levels, so flatten first. The
        # variable name is used as a prefix to keep features from different
        # variables apart.
        if isinstance(agg_data.columns, pd.MultiIndex):
            flat_names = [
                f"{var_name}_" + "_".join(str(part) for part in col)
                for col in agg_data.columns
            ]
        else:
            flat_names = [f"{var_name}_{col}" for col in agg_data.columns]

        agg_data.columns = flat_names
        agg_data = agg_data.reset_index()  # country/year back to ordinary columns

        result = result.merge(agg_data, on=['country', 'year'], how='left')

    return result

In [None]:
class Country:
    """A country approximated by a circle around its centroid.

    NOTE(review): ``radius`` is compared directly with the value returned by
    ``haversine``, while the diagnostic output prints ``radius * 100`` as
    kilometres. The unit of ``radius`` therefore needs confirming.
    """

    def __init__(self, name, centroid_lat, centroid_lon, radius):
        self.name, self.radius = name, radius
        self.centroid_lat, self.centroid_lon = centroid_lat, centroid_lon

    def is_point_inside(self, lat, lon):
        """Report whether (lat, lon) lies within ``radius`` of the centroid.

        Prints the point's distance and the country's radius as a side effect.
        Returns the result of ``distance <= radius``.
        """
        # Mean Earth radius in kilometres, used only for the printed distance
        R = 6371.0

        # Separation between the centroid and the query point
        distance = haversine(self.centroid_lat, self.centroid_lon, lat, lon)

        distance_km = round(R * np.radians(distance), 1)
        radius_km = round(self.radius * 100, 1)

        print(f"Distance: {distance_km} km")

        print(f"Radius: {radius_km} km")

        # Inside when the separation does not exceed the radius
        return distance <= self.radius

In [None]:
# Example usage: a point two degrees of latitude north of the centroid
point_lat = 35.93911   # Example point latitude
point_lon = 67.709953  # Example point longitude
country = Country("Afghanistan", 33.93911, 67.709953, 4.555948789)

# Check if the point is inside the country and report the verdict
point_is_inside = country.is_point_inside(point_lat, point_lon)
print(
    "The point is inside the country!"
    if point_is_inside
    else "The point is outside the country."
)