In [1]:
# Show the result of every top-level expression in a cell, not only the last one.
# Later cells depend on this (e.g. the bare `summary` placed between two pandas option calls).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

# Preprocessing

In [4]:
# Read the Bogura NASA data file and preview its first five rows
df = pd.read_csv(filepath_or_buffer='NASA_data_bogura.csv')
df.head(n=5)

Unnamed: 0,YEAR,DOY,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB,ALLSKY_SFC_UV_INDEX,GWETTOP,GWETROOT,GWETPROF
0,2020,296,4.7,0.66,0.02,1.05,0.89,0.93,0.89
1,2020,297,2.61,0.39,0.01,0.63,0.93,0.96,0.92
2,2020,298,2.37,0.34,0.01,0.54,0.93,0.99,0.95
3,2020,299,5.56,0.7,0.02,1.09,0.92,0.98,0.95
4,2020,300,6.82,0.82,0.02,1.18,0.92,0.97,0.95


In [5]:
# Replace the long NASA parameter codes with short analysis names (in place)
df.rename(
    mapper={
        'ALLSKY_SFC_PAR_TOT': 'PAR',
        'ALLSKY_SFC_UVA': 'UVA',
        'ALLSKY_SFC_UVB': 'UVB',
        'ALLSKY_SFC_UV_INDEX': 'UV_idx',
        'GWETTOP': 'gwet_top',
        'GWETROOT': 'gwet_root',
        'GWETPROF': 'gwet_prof',
    },
    axis='columns',
    inplace=True,
)

In [6]:
# Inspect column dtypes and non-null counts to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   YEAR       730 non-null    int64  
 1   DOY        730 non-null    int64  
 2   PAR        730 non-null    float64
 3   UVA        730 non-null    float64
 4   UVB        730 non-null    float64
 5   UV_idx     730 non-null    float64
 6   gwet_top   730 non-null    float64
 7   gwet_root  730 non-null    float64
 8   gwet_prof  730 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 51.5 KB


In [7]:
# Combine YEAR and day-of-year (DOY) into "<year> <doy>" text and parse it as a date
df['DATE'] = pd.to_datetime(
    df['YEAR'].astype(str).str.cat(df['DOY'].astype(str), sep=' '),
    format='%Y %j',
)

# Preview the parsed dates next to their inputs, then re-check the frame layout
print(df.loc[:, ['YEAR', 'DOY', 'DATE']].head())
df.info()

   YEAR  DOY       DATE
0  2020  296 2020-10-22
1  2020  297 2020-10-23
2  2020  298 2020-10-24
3  2020  299 2020-10-25
4  2020  300 2020-10-26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   YEAR       730 non-null    int64         
 1   DOY        730 non-null    int64         
 2   PAR        730 non-null    float64       
 3   UVA        730 non-null    float64       
 4   UVB        730 non-null    float64       
 5   UV_idx     730 non-null    float64       
 6   gwet_top   730 non-null    float64       
 7   gwet_root  730 non-null    float64       
 8   gwet_prof  730 non-null    float64       
 9   DATE       730 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(7), int64(2)
memory usage: 57.2 KB


In [8]:
# Create season variable
def assign_season(date: pd.Timestamp) -> str:
    """
    Return the custom season label (e.g. '2021S1', '2021S2') for a date.

    Season windows (both ends inclusive):
        S1: 22 Oct (previous year) – 25 Jan (label year)
        S2: 26 Jan – 25 Apr
        S3: 26 Apr – 31 Jul
        S4: 01 Aug – 21 Oct

    Notes
    -----
    * Dates from 22 Oct to 31 Dec carry the *following* year's S1 label
      (2020-10-22 → '2021S1', 2022-10-22 → '2023S1'); filter such rows out
      later if only data up to 2022S4 is wanted.
    * 21 Oct is the last day of S4 of its own calendar year.
    * If no window matches (not expected for real dates), ``pd.NA`` is returned.
    """
    year = date.year
    month_day = (date.month, date.day)

    # Tail of the calendar year: opening stretch of next year's S1
    if month_day >= (10, 22):
        return f"{year + 1}S1"

    # The remaining windows are contiguous, so checking each window's
    # closing date in calendar order is enough to classify the date.
    if month_day <= (1, 25):
        return f"{year}S1"
    if month_day <= (4, 25):
        return f"{year}S2"
    if month_day <= (7, 31):
        return f"{year}S3"
    if month_day <= (10, 21):
        return f"{year}S4"

    # Nothing matched → missing label
    return pd.NA

# Label every row with the season its DATE falls in
df["season"] = df["DATE"].map(assign_season)

In [9]:
# Quick sanity check: preview the labels, then count days per season (missing labels included)
print(df.loc[:, ['YEAR', 'DOY', 'DATE', 'season']].head())
print(df["season"].value_counts(dropna=False).sort_index())

   YEAR  DOY       DATE  season
0  2020  296 2020-10-22  2021S1
1  2020  297 2020-10-23  2021S1
2  2020  298 2020-10-24  2021S1
3  2020  299 2020-10-25  2021S1
4  2020  300 2020-10-26  2021S1
season
2021S1    96
2021S2    90
2021S3    97
2021S4    82
2022S1    96
2022S2    90
2022S3    97
2022S4    82
Name: count, dtype: int64


# Feature engineering

In [11]:
# Daily derived features (column creation order is kept as-is).
# The two ratios that can divide by zero have ±inf mapped to NaN straight away.
df['PAR_x_gwet_root'] = df['PAR'].mul(df['gwet_root'])
df['UVB_div_gwet_top'] = (
    df['UVB'].div(df['gwet_top']).replace([np.inf, -np.inf], np.nan)
)
df['gwet_gradient'] = df['gwet_top'].sub(df['gwet_root'])

# 0/1 flags for days whose PAR exceeds the crop-type threshold
df['PAR_active_day_C3'] = df['PAR'].gt(3.48).astype(int)   # C3 crops, e.g. potatoes and rice
df['PAR_active_day_C4'] = df['PAR'].gt(4.35).astype(int)   # C4 crops, e.g. maize

df['UVA_UVB_ratio'] = (
    df['UVA'].div(df['UVB']).replace([np.inf, -np.inf], np.nan)
)

# Share of PAR in the combined PAR + UVA + UVB total
df['PAR_fraction'] = df['PAR'].div(df['PAR'].add(df['UVA']).add(df['UVB']))



In [12]:
# Cut-offs used to flag extreme days (provisional; revisit against domain literature)
dry_threshold = 0.3      # top-layer wetness below this marks a dry day
high_UV_threshold = 2    # UV index above this marks a high-UV day
# NOTE(review): 2 is not the WHO "high" category (that band starts at UV index 6;
# 0–2 is "low") — confirm the intended cut-off for this dataset.

# 0/1 indicators for extreme days
df['dry_day'] = df['gwet_top'].lt(dry_threshold).astype(int)
df['high_UV_day'] = df['UV_idx'].gt(high_UV_threshold).astype(int)

In [13]:
# Function to calculate maximum consecutive dry days
def max_consecutive_ones(s):
    """
    Return the length of the longest unbroken run of values equal to 1 in ``s``.

    ``s`` is iterated in order, so it must already be in the order the runs
    should be measured in. An empty input, or one without any 1, gives 0.
    """
    longest = 0
    run_length = 0
    for flag in s:
        # Extend the current run on a 1; any other value breaks it
        run_length = run_length + 1 if flag == 1 else 0
        longest = max(longest, run_length)
    return longest


In [14]:
# Define aggregation functions
def percentile(n):
    """
    Build an aggregation function that computes the n-th percentile, ignoring NaNs.

    The returned callable has its ``__name__`` set to 'p<n>' (e.g. 'p95').
    """
    def percentile_(values):
        return np.nanpercentile(values, n)

    setattr(percentile_, '__name__', 'p%s' % n)
    return percentile_

In [15]:
# Sort chronologically: max_consecutive_ones walks each season's rows in order,
# so the rows must be in date order for streak lengths to be meaningful
df = df.sort_values('DATE')

# Group by season and aggregate; each keyword below becomes one output column,
# defined as (source column, aggregation)
summary = df.groupby('season').agg(
    # Aggregations of the original (renamed) variables
    PAR_total=('PAR', 'sum'),
    PAR_mean=('PAR', 'mean'),
    PAR_std=('PAR', 'std'),
    UVA_total=('UVA', 'sum'),
    UVA_mean=('UVA', 'mean'),
    UVA_std=('UVA', 'std'),
    UVB_total=('UVB', 'sum'),
    UVB_mean=('UVB', 'mean'),
    UVB_std=('UVB', 'std'),
    UV_idx_mean=('UV_idx', 'mean'),
    UV_idx_median=('UV_idx', 'median'),
    UV_idx_std=('UV_idx', 'std'),
    gwet_top_mean=('gwet_top', 'mean'),
    gwet_top_median=('gwet_top', 'median'),
    gwet_top_std=('gwet_top', 'std'),
    gwet_top_95p=('gwet_top', percentile(95)),
    gwet_top_5p=('gwet_top', percentile(5)),
    gwet_root_mean=('gwet_root', 'mean'),
    gwet_root_median=('gwet_root', 'median'),
    gwet_root_std=('gwet_root', 'std'),
    gwet_root_95p=('gwet_root', percentile(95)),
    gwet_root_5p=('gwet_root', percentile(5)),
    gwet_prof_mean=('gwet_prof', 'mean'),
    gwet_prof_median=('gwet_prof', 'median'),
    gwet_prof_std=('gwet_prof', 'std'),
    gwet_prof_95p=('gwet_prof', percentile(95)),
    gwet_prof_5p=('gwet_prof', percentile(5)),
    
    # Aggregations of the engineered daily features
    # Interaction metrics
    PAR_x_gwet_root_mean=('PAR_x_gwet_root', 'mean'),
    PAR_x_gwet_root_std=('PAR_x_gwet_root', 'std'),
    UVB_div_gwet_top_mean=('UVB_div_gwet_top', 'mean'),
    UVB_div_gwet_top_std=('UVB_div_gwet_top', 'std'),
    
    # Extreme events (the *_count columns and total_days are intermediate:
    # they are turned into fractions and dropped below)
    dry_days_count=('dry_day', 'sum'),
    max_consecutive_dry_days=('dry_day', max_consecutive_ones),
    high_UV_days_count=('high_UV_day', 'sum'),
    total_days=('DATE', 'count'),
    
    # Soil moisture metrics
    gwet_gradient_mean=('gwet_gradient', 'mean'),
    gwet_gradient_std=('gwet_gradient', 'std'),
    
    # Radiation metrics
    C3_PAR_active_days_count=('PAR_active_day_C3', 'sum'),
    C4_PAR_active_days_count=('PAR_active_day_C4', 'sum'),
    UVA_UVB_ratio_mean=('UVA_UVB_ratio', 'mean'),
    UVA_UVB_ratio_median=('UVA_UVB_ratio', 'median'),
    UVA_UVB_ratio_std=('UVA_UVB_ratio', 'std'),
    PAR_fraction_mean=('PAR_fraction', 'mean'),
    PAR_fraction_median=('PAR_fraction', 'median'),
    PAR_fraction_std=('PAR_fraction', 'std')
).reset_index()

# Express the day counts as a share of each season's days.
# Despite the '_pct' suffix these are fractions in [0, 1], not values out of 100.
summary['dry_days_pct'] = summary['dry_days_count'] / summary['total_days']
summary['high_UV_days_pct'] = summary['high_UV_days_count'] / summary['total_days']
summary['C3_PAR_active_days_pct'] = summary['C3_PAR_active_days_count'] / summary['total_days']
summary['C4_PAR_active_days_pct'] = summary['C4_PAR_active_days_count'] / summary['total_days']
# Drop the raw counts now that the fractions have been derived
summary = summary.drop(columns=['dry_days_count', 'high_UV_days_count', 
                                'C3_PAR_active_days_count', 'C4_PAR_active_days_count', 'total_days'])

In [16]:
# Display the enhanced summary with every column visible.
# The bare `summary` in the middle of the cell is only rendered because
# InteractiveShell.ast_node_interactivity is set to "all" in the first cell.
pd.set_option('display.max_columns', None)
summary
pd.reset_option('display.max_columns')  # restore the pandas default afterwards

Unnamed: 0,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,UVB_std,UV_idx_mean,UV_idx_median,UV_idx_std,gwet_top_mean,gwet_top_median,gwet_top_std,gwet_top_95p,gwet_top_5p,gwet_root_mean,gwet_root_median,gwet_root_std,gwet_root_95p,gwet_root_5p,gwet_prof_mean,gwet_prof_median,gwet_prof_std,gwet_prof_95p,gwet_prof_5p,PAR_x_gwet_root_mean,PAR_x_gwet_root_std,UVB_div_gwet_top_mean,UVB_div_gwet_top_std,max_consecutive_dry_days,gwet_gradient_mean,gwet_gradient_std,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,2021S1,508.03,5.291979,1.735873,58.64,0.610833,0.227738,1.26,0.013125,0.005678,0.756562,0.745,0.316041,0.769479,0.77,0.088191,0.91,0.6275,0.767917,0.755,0.114652,0.96,0.61,0.743438,0.73,0.108646,0.93,0.59,4.165724,1.786171,0.016798,0.006299,0,0.001563,0.029605,47.071429,44.5,11.987162,0.896114,0.895662,0.008837,0.0,0.0,0.822917,0.677083
1,2021S2,689.25,7.658333,1.559412,79.65,0.885,0.207918,1.95,0.021667,0.00768,1.252111,1.215,0.397977,0.425111,0.42,0.089997,0.58,0.29,0.463222,0.44,0.058977,0.57,0.4,0.459333,0.44,0.055522,0.56,0.4,3.488213,0.539134,0.055517,0.02672,4,-0.038111,0.037861,43.540741,40.916667,9.783058,0.894801,0.894044,0.004172,0.066667,0.033333,1.0,1.0
2,2021S3,771.49,7.953505,2.045928,97.7,1.007216,0.26201,2.75,0.028351,0.0085,1.659175,1.69,0.470692,0.760515,0.83,0.157754,0.92,0.404,0.759485,0.82,0.176437,0.97,0.42,0.733814,0.79,0.166967,0.932,0.42,5.946802,1.98576,0.039784,0.016542,0,0.001031,0.044942,36.393471,36.0,5.338929,0.884663,0.884679,0.006845,0.0,0.226804,0.969072,0.917526
3,2021S4,618.66,7.544634,1.733994,81.01,0.987927,0.218958,2.26,0.027561,0.00695,1.630244,1.68,0.384971,0.921463,0.92,0.016187,0.94,0.8905,0.970976,0.97,0.015761,0.99,0.9405,0.942317,0.945,0.017729,0.9695,0.91,7.319278,1.659889,0.029938,0.007606,0,-0.049512,0.005859,36.468496,36.0,4.994864,0.881024,0.880804,0.004596,0.0,0.146341,0.97561,0.939024
4,2022S1,554.94,5.780625,1.247179,65.33,0.680521,0.146362,1.29,0.013437,0.004775,0.832292,0.8,0.22232,0.78125,0.78,0.075117,0.91,0.6775,0.778125,0.77,0.104088,0.9625,0.65,0.753021,0.74,0.099696,0.9325,0.63,4.579983,1.459168,0.016967,0.004961,0,0.003125,0.030308,53.619792,54.5,11.962385,0.892593,0.89295,0.004081,0.0,0.0,0.947917,0.90625
5,2022S2,659.16,7.324,1.758683,76.76,0.852889,0.220771,1.84,0.020444,0.00733,1.165,1.2,0.366598,0.518444,0.485,0.102573,0.6755,0.4,0.523889,0.505,0.075218,0.64,0.44,0.516222,0.5,0.069243,0.62,0.4345,3.781238,0.887659,0.042477,0.019278,0,-0.005444,0.03247,43.985019,42.0,9.995095,0.893799,0.89256,0.004385,0.0,0.011111,0.966667,0.944444
6,2022S3,782.21,8.064021,2.142439,98.5,1.015464,0.290861,2.77,0.028557,0.009241,1.659175,1.58,0.543184,0.755876,0.8,0.134387,0.89,0.468,0.750928,0.78,0.143702,0.92,0.478,0.725258,0.76,0.135448,0.89,0.47,6.062856,2.083633,0.03922,0.014724,0,0.004948,0.033139,36.292955,35.25,5.393949,0.885608,0.88563,0.007635,0.0,0.268041,0.979381,0.938144
7,2022S4,641.72,7.825854,1.827892,84.17,1.026463,0.225412,2.38,0.029024,0.007134,1.69378,1.68,0.405945,0.865854,0.87,0.039751,0.92,0.8,0.890854,0.895,0.060332,0.9695,0.8,0.859024,0.86,0.060708,0.9395,0.77,6.945784,1.588848,0.033766,0.009108,0,-0.025,0.023054,35.767276,35.333333,4.603179,0.880437,0.880679,0.004883,0.0,0.231707,0.963415,0.963415


In [17]:
# Label district information for the summary set.
# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell safe to re-run.
if 'location' not in summary.columns:
    summary.insert(0, 'location', 'Bogura')

In [18]:
# Check insert result: 'location' should now be the first column
summary.head()

Unnamed: 0,location,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,...,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,Bogura,2021S1,508.03,5.291979,1.735873,58.64,0.610833,0.227738,1.26,0.013125,...,47.071429,44.5,11.987162,0.896114,0.895662,0.008837,0.0,0.0,0.822917,0.677083
1,Bogura,2021S2,689.25,7.658333,1.559412,79.65,0.885,0.207918,1.95,0.021667,...,43.540741,40.916667,9.783058,0.894801,0.894044,0.004172,0.066667,0.033333,1.0,1.0
2,Bogura,2021S3,771.49,7.953505,2.045928,97.7,1.007216,0.26201,2.75,0.028351,...,36.393471,36.0,5.338929,0.884663,0.884679,0.006845,0.0,0.226804,0.969072,0.917526
3,Bogura,2021S4,618.66,7.544634,1.733994,81.01,0.987927,0.218958,2.26,0.027561,...,36.468496,36.0,4.994864,0.881024,0.880804,0.004596,0.0,0.146341,0.97561,0.939024
4,Bogura,2022S1,554.94,5.780625,1.247179,65.33,0.680521,0.146362,1.29,0.013437,...,53.619792,54.5,11.962385,0.892593,0.89295,0.004081,0.0,0.0,0.947917,0.90625


In [19]:
# Export the per-season summary to CSV (row index omitted)
summary.to_csv('NASA_summary_bogura.csv', index=False)

In [20]:
# Generate a data dictionary
def create_data_dictionary(df, filename="NASA_data_data_dictionary.xlsx", source="NASA"):
    """
    Build a data dictionary for a DataFrame and optionally export it to Excel.

    One row is produced per column of ``df`` holding its dtype, non-null
    count, number of unique values, min/max/range (numeric columns) or a few
    sample values (all other columns), and a keyword-based category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It is not modified.
    filename : str, path-like or None, default "NASA_data_data_dictionary.xlsx"
        Output workbook (written with the openpyxl engine). Two sheets are
        created: 'Data Dictionary' and 'Category Legend'. Pass ``None`` to
        skip the export and only return the dictionary.
    source : str, default "NASA"
        Value written to the 'Source' column for every variable.

    Returns
    -------
    pandas.DataFrame
        The data dictionary, one row per column of ``df``.
    """
    n_columns = len(df.columns)

    # Create base dictionary
    data_dict = pd.DataFrame({
        'Variable Name': df.columns,
        'Data Type': df.dtypes.values,
        'Non-Null Count': df.count().values,
        'Description': [''] * n_columns,  # Placeholder for explanations
        'Source': [source] * n_columns
    })

    # Add metadata
    data_dict['Unique Values'] = df.nunique().values

    # Initialize additional columns
    data_dict['Min Value'] = np.nan
    data_dict['Max Value'] = np.nan
    data_dict['Range'] = ''
    data_dict['Sample Values'] = ''

    # Keyword lists used for automatic categorization. Matching is a
    # case-insensitive substring test and the first matching category wins,
    # so the order of this mapping matters.
    categories = {
        'radiation': ['PAR', 'UV'],
        'gwet': ['gwet'],
        'temperature': ['temp'],
        'gdd': ['gdd'],
        'thi': ['thi'],
        'humidity': ['rh', 'humidity'],
        'vpd': ['vpd'],
        'rainfall': ['rain', 'dry_days', 'wet_days'],
        'wind': ['wind', 'calm', '/']
    }
    data_dict['Category'] = '—'

    for idx, col_name in enumerate(df.columns):
        # Positional access keeps this working when column labels are duplicated
        column = df.iloc[:, idx]

        if pd.api.types.is_numeric_dtype(column):
            min_val = column.min()
            max_val = column.max()
            data_dict.at[idx, 'Min Value'] = min_val
            data_dict.at[idx, 'Max Value'] = max_val
            data_dict.at[idx, 'Range'] = f"{min_val:.4f} to {max_val:.4f}"
        else:
            # Show up to three distinct values, with an ellipsis if there are more
            unique_vals = column.unique()
            sample = ', '.join(map(str, unique_vals[:3]))
            if len(unique_vals) > 3:
                sample += ', ...'
            data_dict.at[idx, 'Sample Values'] = sample

        # str() so that non-string labels (e.g. integer column names) do not crash
        col_lower = str(col_name).lower()
        for category, keywords in categories.items():
            if any(kw.lower() in col_lower for kw in keywords):
                data_dict.at[idx, 'Category'] = category
                break

    # Reorder columns
    column_order = [
        'Variable Name', 'Category', 'Description', 'Data Type',
        'Non-Null Count', 'Source', 'Unique Values',
        'Min Value', 'Max Value', 'Range', 'Sample Values'
    ]
    data_dict = data_dict[column_order]

    # Export to Excel (skipped when no filename is given)
    if filename is not None:
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            data_dict.to_excel(writer, index=False, sheet_name='Data Dictionary')

            # Add category legend to second sheet
            legend = pd.DataFrame({
                'Category': list(categories.keys()),
                'Keywords': [', '.join(kws) for kws in categories.values()]
            })
            legend.to_excel(writer, index=False, sheet_name='Category Legend')

        print(f"Data dictionary exported to: {filename}")

    return data_dict

# Usage: document the daily-level frame `df`; the Excel workbook is written as a side effect.
# NOTE(review): the saved output below lists only the 10 original columns, so it predates
# the season/feature columns added earlier in the notebook — re-run to refresh it.
create_data_dictionary(df)

Data dictionary exported to: NASA_data_data_dictionary.xlsx


Unnamed: 0,Variable Name,Category,Description,Data Type,Non-Null Count,Source,Unique Values,Min Value,Max Value,Range,Sample Values
0,YEAR,—,,int64,730,NASA,3,2020.0,2022.0,2020.0000 to 2022.0000,
1,DOY,—,,int64,730,NASA,366,1.0,366.0,1.0000 to 366.0000,
2,PAR,radiation,,float64,730,NASA,460,1.41,12.39,1.4100 to 12.3900,
3,UVA,radiation,,float64,730,NASA,130,0.15,1.62,0.1500 to 1.6200,
4,UVB,radiation,,float64,730,NASA,6,0.0,0.05,0.0000 to 0.0500,
5,UV_idx,radiation,,float64,730,NASA,213,0.17,2.89,0.1700 to 2.8900,
6,gwet_top,gwet,,float64,730,NASA,68,0.28,0.95,0.2800 to 0.9500,
7,gwet_root,gwet,,float64,730,NASA,60,0.4,0.99,0.4000 to 0.9900,
8,gwet_prof,gwet,,float64,730,NASA,58,0.4,0.97,0.4000 to 0.9700,
9,DATE,—,,datetime64[ns],730,NASA,730,,,,"2020-10-22 00:00:00, 2020-10-23 00:00:00, 2020..."
