In [1]:
# To display full output in Notebook, instead of only the last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import libraries
import numpy as np
import pandas as pd

In [3]:
# Load the data frames
summary_nasa = pd.read_csv("NASA_summary_all.csv")
summary_bmd = pd.read_csv("BMD_summary.csv")

In [4]:
# Get columns excluding keys
nasa_cols = set(summary_nasa.columns) - {'location', 'season'}
bmd_cols = set(summary_bmd.columns) - {'location', 'season'}

# Check for overlapping columns
overlap = nasa_cols & bmd_cols
print(f"Overlapping columns: {overlap if overlap else 'None'}")

Overlapping columns: None


In [5]:
# Standardize location name in BMD data
summary_bmd['location'] = summary_bmd['location'].replace(
    {'Mymensingh': 'Muktagacha'}
)

In [6]:
# Full join
combined = pd.merge(
    summary_nasa,
    summary_bmd,
    on=['location', 'season'],
    how='outer',  # Full outer join
    indicator=True  # Show source of each row
)

In [7]:
# Check results
print(f"Combined shape: {combined.shape}")
print(combined['_merge'].value_counts())  # Show left/right/both counts
combined.head()

Combined shape: (24, 100)
_merge
both          24
left_only      0
right_only     0
Name: count, dtype: int64


Unnamed: 0,location,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,...,thi_moderate_pct,wind_speed_mean,wind_speed_std,calm_days_pct,S_SE_pct,N_NW_pct,W_NW_pct,S_SW_pct,N_NE_pct,_merge
0,Bogura,2021S1,508.03,5.291979,1.735873,58.64,0.610833,0.227738,1.26,0.013125,...,0.052083,1.843731,1.13255,0.677,0.135417,0.864583,0.0,0.0,0.0,both
1,Bogura,2021S2,689.25,7.658333,1.559412,79.65,0.885,0.207918,1.95,0.021667,...,0.088889,2.865687,1.52886,0.333,0.033333,0.522222,0.444444,0.0,0.0,both
2,Bogura,2021S3,771.49,7.953505,2.045928,97.7,1.007216,0.26201,2.75,0.028351,...,0.783505,3.980412,2.071062,0.134,0.0,0.0,0.350515,0.649485,0.0,both
3,Bogura,2021S4,618.66,7.544634,1.733994,81.01,0.987927,0.218958,2.26,0.027561,...,0.963415,3.450356,2.300726,0.232,0.0,0.0,0.0,1.0,0.0,both
4,Bogura,2022S1,554.94,5.780625,1.247179,65.33,0.680521,0.146362,1.29,0.013437,...,0.0,1.897917,1.514837,0.656,0.0,0.4375,0.0,0.09375,0.46875,both


In [8]:
# Drop the '_merge' column
combined = combined.drop(columns='_merge')

In [9]:
combined.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 99 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   location                      24 non-null     object 
 1   season                        24 non-null     object 
 2   PAR_total                     24 non-null     float64
 3   PAR_mean                      24 non-null     float64
 4   PAR_std                       24 non-null     float64
 5   UVA_total                     24 non-null     float64
 6   UVA_mean                      24 non-null     float64
 7   UVA_std                       24 non-null     float64
 8   UVB_total                     24 non-null     float64
 9   UVB_mean                      24 non-null     float64
 10  UVB_std                       24 non-null     float64
 11  UV_idx_mean                   24 non-null     float64
 12  UV_idx_median                 24 non-null     float64
 13  UV_idx_

In [10]:
# Export to CSV
combined.to_csv('environmental_features.csv', index=False)

In [11]:
# Generate a data dictionary
def create_data_dictionary(df, filename="environmental_features_data_dictionary.xlsx"):
    """
    Generate and export detailed data dictionary to Excel
    
    Parameters:
    df: Input DataFrame
    filename: Output Excel filename
    """
    # Create base dictionary
    data_dict = pd.DataFrame({
        'Variable Name': df.columns,
        'Data Type': df.dtypes.values,
        'Non-Null Count': df.count().values,
        'Description': [''] * len(df.columns),  # Placeholder for explanations
        'Source': ['NASA/BMD'] * len(df.columns)
    })
    
    # Add metadata
    data_dict['Unique Values'] = df.nunique().values
    
    # Initialize additional columns
    data_dict['Min Value'] = np.nan
    data_dict['Max Value'] = np.nan
    data_dict['Range'] = ''
    data_dict['Sample Values'] = ''
    
    # Process columns
    for idx, col in enumerate(df.columns):
        if pd.api.types.is_numeric_dtype(df[col]):
            min_val = df[col].min()
            max_val = df[col].max()
            data_dict.at[idx, 'Min Value'] = min_val
            data_dict.at[idx, 'Max Value'] = max_val
            data_dict.at[idx, 'Range'] = f"{min_val:.4f} to {max_val:.4f}"
        else:
            unique_vals = df[col].unique()
            sample = ', '.join(map(str, unique_vals[:min(3, len(unique_vals))]))
            if len(unique_vals) > 3:
                sample += ', ...'
            data_dict.at[idx, 'Sample Values'] = sample
    
    # Add automatic categorization
    data_dict['Category'] = '—'
    categories = {
        'radiation': ['PAR', 'UV'],
        'gwet': ['gwet'],
        'temperature': ['temp'],
        'gdd': ['gdd'],
        'thi': ['thi'],
        'humidity': ['rh', 'humidity'],
        'vpd': ['vpd'],
        'rainfall': ['rain', 'dry_days', 'wet_days'],
        'wind': ['wind', 'calm', 'SE', 'NW', 'SW', 'NE']
    }
    
    for idx, col_name in enumerate(df.columns):
        col_lower = col_name.lower()
        for category, keywords in categories.items():
            if any(kw.lower() in col_lower for kw in keywords):
                data_dict.at[idx, 'Category'] = category
                break
    
    # Reorder columns
    column_order = [
        'Variable Name', 'Category', 'Description', 'Data Type', 
        'Non-Null Count', 'Source', 'Unique Values', 
        'Min Value', 'Max Value', 'Range', 'Sample Values'
    ]
    data_dict = data_dict[column_order]
    
    # Export to Excel
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        data_dict.to_excel(writer, index=False, sheet_name='Data Dictionary')
        
        # Add category legend to second sheet
        legend = pd.DataFrame({
            'Category': list(categories.keys()),
            'Keywords': [', '.join(kws) for kws in categories.values()]
        })
        legend.to_excel(writer, index=False, sheet_name='Category Legend')
    
    print(f"Data dictionary exported to: {filename}")
    return data_dict

# Usage
create_data_dictionary(combined)

Data dictionary exported to: environmental_features_data_dictionary.xlsx


Unnamed: 0,Variable Name,Category,Description,Data Type,Non-Null Count,Source,Unique Values,Min Value,Max Value,Range,Sample Values
0,location,—,,object,24,NASA/BMD,3,,,,"Bogura, Cumilla, Muktagacha"
1,season,wind,,object,24,NASA/BMD,8,,,,"2021S1, 2021S2, 2021S3, ..."
2,PAR_total,radiation,,float64,24,NASA/BMD,24,508.030000,810.570000,508.0300 to 810.5700,
3,PAR_mean,radiation,,float64,24,NASA/BMD,24,5.291979,8.690222,5.2920 to 8.6902,
4,PAR_std,radiation,,float64,24,NASA/BMD,24,1.246466,2.446593,1.2465 to 2.4466,
...,...,...,...,...,...,...,...,...,...,...,...
94,S_SE_pct,wind,,float64,24,NASA/BMD,3,0.000000,0.135417,0.0000 to 0.1354,
95,N_NW_pct,wind,,float64,24,NASA/BMD,6,0.000000,1.000000,0.0000 to 1.0000,
96,W_NW_pct,wind,,float64,24,NASA/BMD,4,0.000000,0.444444,0.0000 to 0.4444,
97,S_SW_pct,wind,,float64,24,NASA/BMD,6,0.000000,1.000000,0.0000 to 1.0000,
