In [1]:
# Make the notebook display the value of every expression statement in a cell,
# instead of only the last one (IPython's default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
from IPython.display import display

# Preprocessing

In [4]:
# Read the Muktagacha CSV into a DataFrame and preview its first five rows
df = pd.read_csv('NASA_data_muktagacha.csv')
df.head()

Unnamed: 0,YEAR,DOY,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB,ALLSKY_SFC_UV_INDEX,GWETTOP,GWETROOT,GWETPROF
0,2020,296,2.97,0.44,0.01,0.73,0.9,0.92,0.88
1,2020,297,1.46,0.22,0.01,0.35,0.95,0.95,0.9
2,2020,298,3.07,0.43,0.01,0.7,0.94,0.99,0.95
3,2020,299,5.38,0.68,0.02,1.05,0.94,0.98,0.95
4,2020,300,6.81,0.82,0.02,1.18,0.93,0.97,0.94


In [5]:
# Rename columns to the short working names used in the rest of the notebook.
# The result is rebound to `df` rather than mutated with inplace=True, so the
# cell produces a new frame and stays chain-friendly; keys that are already
# renamed are ignored by `rename`, which keeps the cell safe to re-run.
df = df.rename(columns={
    'ALLSKY_SFC_PAR_TOT': 'PAR',
    'ALLSKY_SFC_UVA': 'UVA',
    'ALLSKY_SFC_UVB': 'UVB',
    'ALLSKY_SFC_UV_INDEX': 'UV_idx',
    'GWETTOP': 'gwet_top',
    'GWETROOT': 'gwet_root',
    'GWETPROF': 'gwet_prof',
})

In [6]:
# Check data types and missing values (non-null count per column)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   YEAR       730 non-null    int64  
 1   DOY        730 non-null    int64  
 2   PAR        730 non-null    float64
 3   UVA        730 non-null    float64
 4   UVB        730 non-null    float64
 5   UV_idx     730 non-null    float64
 6   gwet_top   730 non-null    float64
 7   gwet_root  730 non-null    float64
 8   gwet_prof  730 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 51.5 KB


In [7]:
# Build a calendar date from YEAR and day-of-year (DOY): the two columns are
# joined as "<year> <doy>" text and parsed with the matching strptime pattern.
df['DATE'] = pd.to_datetime(
    df['YEAR'].astype(str).str.cat(df['DOY'].astype(str), sep=' '),
    format='%Y %j',
)

# Show the source columns next to the parsed date, then the updated schema
print(df[['YEAR', 'DOY', 'DATE']].head())
df.info()

   YEAR  DOY       DATE
0  2020  296 2020-10-22
1  2020  297 2020-10-23
2  2020  298 2020-10-24
3  2020  299 2020-10-25
4  2020  300 2020-10-26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   YEAR       730 non-null    int64         
 1   DOY        730 non-null    int64         
 2   PAR        730 non-null    float64       
 3   UVA        730 non-null    float64       
 4   UVB        730 non-null    float64       
 5   UV_idx     730 non-null    float64       
 6   gwet_top   730 non-null    float64       
 7   gwet_root  730 non-null    float64       
 8   gwet_prof  730 non-null    float64       
 9   DATE       730 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(7), int64(2)
memory usage: 57.2 KB


In [8]:
# Create season variable
def assign_season(date: pd.Timestamp) -> str:
    """
    Return the custom season label ('<year>S<1-4>') that a date falls into.

    Season windows (both ends inclusive):
        S1: 22 Oct of the previous year – 25 Jan
        S2: 26 Jan – 25 Apr
        S3: 26 Apr – 31 Jul
        S4: 01 Aug – 21 Oct

    Notes
    -----
    * S1 straddles the year boundary and is named after the year in which it
      ends, so 2020-10-22 → '2021S1' while 2020-10-21 → '2020S4'.
    * Consequently 2022-10-22 is labelled '2023S1'; drop it afterwards if the
      analysis should stop at 2022S4.
    * A value that matches none of the windows (e.g. a missing date, whose
      month/day compare as false everywhere) yields ``pd.NA``.
    """
    year = date.year
    month_day = (date.month, date.day)

    # Walk the (month, day) cut-offs in calendar order; tuples compare
    # lexicographically, so each test reads as "on or before/after this day".
    if month_day >= (10, 22):
        # Tail of the year opens next year's S1
        return f"{year + 1}S1"
    if month_day <= (1, 25):
        return f"{year}S1"
    if month_day <= (4, 25):
        return f"{year}S2"
    if month_day <= (7, 31):
        return f"{year}S3"
    if month_day <= (10, 21):
        return f"{year}S4"

    # No window matched → missing label
    return pd.NA

# Label every row with its season by mapping assign_season over the dates
df["season"] = df["DATE"].map(assign_season)

In [9]:
# Quick sanity check: first labelled rows, then the number of rows per season
# (dropna=False so that unlabelled rows would show up as their own entry)
print(df.loc[:, ['YEAR', 'DOY', 'DATE', 'season']].head())
print(
    df["season"]
    .value_counts(dropna=False)
    .sort_index()
)

   YEAR  DOY       DATE  season
0  2020  296 2020-10-22  2021S1
1  2020  297 2020-10-23  2021S1
2  2020  298 2020-10-24  2021S1
3  2020  299 2020-10-25  2021S1
4  2020  300 2020-10-26  2021S1
season
2021S1    96
2021S2    90
2021S3    97
2021S4    82
2022S1    96
2022S2    90
2022S3    97
2022S4    82
Name: count, dtype: int64


# Feature engineering

In [11]:
# Daily derived features.
# The two ratio columns can divide by zero, which pandas reports as ±inf;
# those are converted to NaN right where the ratio is computed.
_INF = [np.inf, -np.inf]

df['PAR_x_gwet_root'] = df['PAR'].mul(df['gwet_root'])
df['UVB_div_gwet_top'] = df['UVB'].div(df['gwet_top']).replace(_INF, np.nan)
df['gwet_gradient'] = df['gwet_top'].sub(df['gwet_root'])

# PAR "active day" flags (1 = PAR strictly above the crop-type threshold)
df['PAR_active_day_C3'] = df['PAR'].gt(3.48).astype(int)   # Threshold for C3 crops, e.g., potatoes and rice
df['PAR_active_day_C4'] = df['PAR'].gt(4.35).astype(int)   # Threshold for C4 crops, e.g., maize

df['UVA_UVB_ratio'] = df['UVA'].div(df['UVB']).replace(_INF, np.nan)

# Share of PAR in the PAR + UVA + UVB sum (added left to right, NaN propagates)
df['PAR_fraction'] = df['PAR'].div(df['PAR'].add(df['UVA']).add(df['UVB']))

del _INF  # keep the notebook namespace as it was


In [12]:
# Cut-offs used to flag extreme days
dry_threshold = 0.3      # gwet_top below this marks a dry day; may adjust later according to domain and literature
high_UV_threshold = 2    # UV_idx above this marks a high-UV day; may be adjusted later
# NOTE(review): the WHO UV index scale labels 0–2 as "low" and 6–7 as "high";
# confirm that 2 is the intended cut-off for a "high UV" day.

# 0/1 indicator columns (strict comparisons, as integers so they can be summed)
df['dry_day'] = df['gwet_top'].lt(dry_threshold).astype(int)
df['high_UV_day'] = df['UV_idx'].gt(high_UV_threshold).astype(int)

In [13]:
# Longest run of consecutive 1s (used for the maximum consecutive dry days)
def max_consecutive_ones(s):
    """
    Return the length of the longest unbroken run of values equal to 1 in ``s``.

    ``s`` is any iterable, walked in its own order; every value that is not
    equal to 1 ends the current run. An empty iterable gives 0.
    """
    longest = 0
    run = 0
    for flag in s:
        # Extend the run on a 1, otherwise start over
        run = run + 1 if flag == 1 else 0
        if run > longest:
            longest = run
    return longest


In [14]:
# Define aggregation functions
def percentile(n):
    """
    Build a NaN-ignoring percentile aggregator, e.g. for ``groupby().agg``.

    Parameters
    ----------
    n : percentile to compute; forwarded unchanged to ``np.nanpercentile``.

    Returns
    -------
    callable
        A one-argument function ``f(x)`` returning ``np.nanpercentile(x, n)``.
        Its ``__name__`` is set to ``'p<n>'`` (e.g. ``'p95'``) so that
        aggregators built for different percentiles carry different names.
    """
    def percentile_(x):
        return np.nanpercentile(x, n)

    # f-string instead of ``'p%s' % n``: %-formatting treats a tuple ``n`` as
    # an argument list and raises TypeError for more than one element.
    percentile_.__name__ = f'p{n}'
    return percentile_

In [15]:
# Put the rows in chronological order before grouping: max_consecutive_ones
# walks each season's rows in order, so streak lengths depend on row order.
df = df.sort_values('DATE')

# One row per season. The named-aggregation spec is assembled from small
# (column, statistic) families; dict insertion order fixes the column order.
summary = (
    df.groupby('season')
    .agg(**{
        # Radiation: seasonal total, mean and spread
        **{
            f'{col}_{label}': (col, func)
            for col in ('PAR', 'UVA', 'UVB')
            for label, func in (('total', 'sum'), ('mean', 'mean'), ('std', 'std'))
        },
        # UV index
        **{f'UV_idx_{stat}': ('UV_idx', stat) for stat in ('mean', 'median', 'std')},
        # Soil wetness layers: centre, spread and 95th/5th percentiles
        **{
            f'{col}_{label}': (col, func)
            for col in ('gwet_top', 'gwet_root', 'gwet_prof')
            for label, func in (
                ('mean', 'mean'), ('median', 'median'), ('std', 'std'),
                ('95p', percentile(95)), ('5p', percentile(5)),
            )
        },
        # Interaction metrics
        **{
            f'{col}_{stat}': (col, stat)
            for col in ('PAR_x_gwet_root', 'UVB_div_gwet_top')
            for stat in ('mean', 'std')
        },
        # Extreme events (the *_count and total_days columns are temporary)
        'dry_days_count': ('dry_day', 'sum'),
        'max_consecutive_dry_days': ('dry_day', max_consecutive_ones),
        'high_UV_days_count': ('high_UV_day', 'sum'),
        'total_days': ('DATE', 'count'),
        # Soil moisture gradient
        **{f'gwet_gradient_{stat}': ('gwet_gradient', stat) for stat in ('mean', 'std')},
        # Radiation-derived metrics
        'C3_PAR_active_days_count': ('PAR_active_day_C3', 'sum'),
        'C4_PAR_active_days_count': ('PAR_active_day_C4', 'sum'),
        **{
            f'{col}_{stat}': (col, stat)
            for col in ('UVA_UVB_ratio', 'PAR_fraction')
            for stat in ('mean', 'median', 'std')
        },
    })
    .reset_index()
    # Express each day count as a fraction of the season's days ...
    .assign(
        dry_days_pct=lambda t: t['dry_days_count'] / t['total_days'],
        high_UV_days_pct=lambda t: t['high_UV_days_count'] / t['total_days'],
        C3_PAR_active_days_pct=lambda t: t['C3_PAR_active_days_count'] / t['total_days'],
        C4_PAR_active_days_pct=lambda t: t['C4_PAR_active_days_count'] / t['total_days'],
    )
    # ... and keep only the fractions
    .drop(columns=[
        'dry_days_count', 'high_UV_days_count',
        'C3_PAR_active_days_count', 'C4_PAR_active_days_count',
        'total_days',
    ])
)

In [16]:
# Display the enhanced summary with every column visible.
# option_context lifts the column limit for this display only and restores
# whatever value was set before (reset_option would fall back to the pandas
# default instead), even if rendering fails. The explicit display() call
# does not depend on the notebook-wide ast_node_interactivity setting.
with pd.option_context('display.max_columns', None):
    display(summary)

Unnamed: 0,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,UVB_std,UV_idx_mean,UV_idx_median,UV_idx_std,gwet_top_mean,gwet_top_median,gwet_top_std,gwet_top_95p,gwet_top_5p,gwet_root_mean,gwet_root_median,gwet_root_std,gwet_root_95p,gwet_root_5p,gwet_prof_mean,gwet_prof_median,gwet_prof_std,gwet_prof_95p,gwet_prof_5p,PAR_x_gwet_root_mean,PAR_x_gwet_root_std,UVB_div_gwet_top_mean,UVB_div_gwet_top_std,max_consecutive_dry_days,gwet_gradient_mean,gwet_gradient_std,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,2021S1,510.44,5.317083,1.710719,58.78,0.612292,0.22311,1.25,0.013021,0.005642,0.753854,0.765,0.30536,0.764583,0.78,0.10852,0.9225,0.5975,0.759479,0.745,0.124091,0.9525,0.5875,0.722708,0.71,0.11788,0.9225,0.56,4.135831,1.774591,0.016801,0.006178,0,0.005104,0.023306,47.78022,44.0,12.499824,0.896137,0.896084,0.009046,0.0,0.0,0.833333,0.677083
1,2021S2,687.11,7.634556,1.59652,79.62,0.884667,0.213448,1.94,0.021556,0.00792,1.259222,1.23,0.403449,0.418444,0.4,0.06744,0.5455,0.3345,0.442222,0.43,0.057588,0.55,0.37,0.431889,0.42,0.052164,0.53,0.37,3.319234,0.5577,0.054549,0.024667,0,-0.023778,0.023539,44.05,41.333333,10.108531,0.894588,0.893884,0.004444,0.0,0.033333,1.0,0.988889
2,2021S3,729.15,7.51701,2.092512,93.08,0.959588,0.268308,2.64,0.027216,0.008384,1.580309,1.66,0.476319,0.795567,0.88,0.154785,0.93,0.414,0.792784,0.89,0.172341,0.962,0.42,0.753093,0.84,0.161525,0.93,0.418,5.838834,1.901438,0.036447,0.015493,0,0.002784,0.039784,35.71875,35.0,4.640044,0.883828,0.883746,0.007021,0.0,0.185567,0.948454,0.886598
3,2021S4,608.16,7.416585,1.725666,79.93,0.974756,0.218017,2.22,0.027073,0.006937,1.591707,1.61,0.378417,0.916829,0.92,0.017908,0.94,0.8805,0.950122,0.96,0.020215,0.98,0.91,0.908293,0.91,0.024885,0.9495,0.86,7.034694,1.596614,0.029571,0.007686,0,-0.033293,0.006489,36.660569,36.0,5.131247,0.880706,0.879931,0.004415,0.0,0.170732,0.987805,0.95122
4,2022S1,555.76,5.789167,1.246466,65.26,0.679792,0.144594,1.29,0.013437,0.004775,0.830625,0.79,0.220348,0.736458,0.74,0.102526,0.9,0.5975,0.730521,0.715,0.113364,0.9325,0.59,0.695417,0.68,0.105421,0.8925,0.5675,4.313649,1.443694,0.017964,0.004696,0,0.005938,0.018557,53.677083,52.0,12.334372,0.892767,0.89335,0.004488,0.0,0.0,0.947917,0.875
5,2022S2,664.65,7.385,1.738253,77.87,0.865222,0.22072,1.85,0.020556,0.006928,1.196444,1.23,0.387041,0.483111,0.475,0.072772,0.5955,0.3745,0.494778,0.49,0.056553,0.58,0.41,0.479556,0.48,0.051949,0.56,0.4,3.615152,0.827837,0.044672,0.018665,0,-0.011667,0.024276,44.068519,41.75,9.638662,0.8933,0.892334,0.00396,0.0,0.0,0.966667,0.955556
6,2022S3,736.19,7.589588,2.446593,94.13,0.970412,0.319315,2.6,0.026804,0.009953,1.563918,1.58,0.572157,0.805979,0.85,0.1426,0.942,0.488,0.806907,0.84,0.154072,0.97,0.5,0.767216,0.79,0.147477,0.942,0.49,6.095657,2.322216,0.034707,0.014792,0,-0.000928,0.033388,37.164062,36.333333,5.711298,0.883787,0.882527,0.006784,0.0,0.237113,0.927835,0.85567
7,2022S4,629.8,7.680488,1.90093,83.01,1.012317,0.237634,2.35,0.028659,0.008129,1.671829,1.605,0.434915,0.874878,0.88,0.033194,0.92,0.82,0.888659,0.89,0.051009,0.95,0.8005,0.840732,0.84,0.05173,0.91,0.76,6.805579,1.636169,0.032957,0.009989,0,-0.01378,0.020406,36.271341,35.166667,5.539186,0.880053,0.880004,0.004494,0.0,0.256098,0.963415,0.939024


In [17]:
# Label district information for the summary set.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if 'location' not in summary.columns:
    summary.insert(0, 'location', 'Muktagacha')

In [18]:
# Check insert result: 'location' should now be the first column
summary.head()

Unnamed: 0,location,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,...,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,Muktagacha,2021S1,510.44,5.317083,1.710719,58.78,0.612292,0.22311,1.25,0.013021,...,47.78022,44.0,12.499824,0.896137,0.896084,0.009046,0.0,0.0,0.833333,0.677083
1,Muktagacha,2021S2,687.11,7.634556,1.59652,79.62,0.884667,0.213448,1.94,0.021556,...,44.05,41.333333,10.108531,0.894588,0.893884,0.004444,0.0,0.033333,1.0,0.988889
2,Muktagacha,2021S3,729.15,7.51701,2.092512,93.08,0.959588,0.268308,2.64,0.027216,...,35.71875,35.0,4.640044,0.883828,0.883746,0.007021,0.0,0.185567,0.948454,0.886598
3,Muktagacha,2021S4,608.16,7.416585,1.725666,79.93,0.974756,0.218017,2.22,0.027073,...,36.660569,36.0,5.131247,0.880706,0.879931,0.004415,0.0,0.170732,0.987805,0.95122
4,Muktagacha,2022S1,555.76,5.789167,1.246466,65.26,0.679792,0.144594,1.29,0.013437,...,53.677083,52.0,12.334372,0.892767,0.89335,0.004488,0.0,0.0,0.947917,0.875


In [19]:
# Export the seasonal summary to CSV; index=False leaves the row index out of the file
summary.to_csv('NASA_summary_muktagacha.csv', index=False)

In [20]:
# Generate a data dictionary
def create_data_dictionary(df, filename="NASA_data_data_dictionary.xlsx", source="NASA"):
    """
    Build a per-column data dictionary for ``df`` and export it to Excel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe; it is only read, never modified.
    filename : str
        Output workbook path. Written with the ``openpyxl`` engine.
    source : str
        Value written to the 'Source' column for every variable.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with: Variable Name, Category,
        Description (left blank for manual entry), Data Type, Non-Null Count,
        Source, Unique Values, Min Value, Max Value, Range, Sample Values.

    Side effects
    ------------
    Writes ``filename`` with two sheets ('Data Dictionary' and
    'Category Legend') and prints a confirmation line.

    Notes
    -----
    * Min/Max/Range are filled for numeric columns only. Boolean columns are
      treated as non-numeric (they get sample values), because numpy booleans
      cannot be formatted with ``.4f``.
    * A numeric column with no non-missing values keeps Min/Max/Range blank.
    * Category is the first entry of the keyword table whose keyword occurs
      (case-insensitively) in the column name; '—' when nothing matches.
    """
    n_cols = len(df.columns)

    # Base table: one row per column of df
    data_dict = pd.DataFrame({
        'Variable Name': df.columns,
        'Data Type': df.dtypes.values,
        'Non-Null Count': df.count().values,
        'Description': [''] * n_cols,  # Placeholder for explanations
        'Source': [source] * n_cols
    })

    # Metadata columns, filled per variable below
    data_dict['Unique Values'] = df.nunique().values
    data_dict['Min Value'] = np.nan
    data_dict['Max Value'] = np.nan
    data_dict['Range'] = ''
    data_dict['Sample Values'] = ''

    for idx, col in enumerate(df.columns):
        series = df[col]
        summarise_as_number = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if summarise_as_number:
            min_val = series.min()
            max_val = series.max()
            if pd.isna(min_val) or pd.isna(max_val):
                # Nothing to summarise (all values missing); pd.NA would also
                # fail the float formatting below.
                continue
            data_dict.at[idx, 'Min Value'] = min_val
            data_dict.at[idx, 'Max Value'] = max_val
            data_dict.at[idx, 'Range'] = f"{min_val:.4f} to {max_val:.4f}"
        else:
            # Show up to three distinct values, with an ellipsis if there are more
            unique_vals = series.unique()
            sample = ', '.join(map(str, unique_vals[:3]))
            if len(unique_vals) > 3:
                sample += ', ...'
            data_dict.at[idx, 'Sample Values'] = sample

    # Automatic categorization by keyword; earlier entries win on multiple matches
    data_dict['Category'] = '—'
    categories = {
        'radiation': ['PAR', 'UV'],
        'gwet': ['gwet'],
        'temperature': ['temp'],
        'gdd': ['gdd'],
        'thi': ['thi'],
        'humidity': ['rh', 'humidity'],
        'vpd': ['vpd'],
        'rainfall': ['rain', 'dry_days', 'wet_days'],
        # NOTE(review): '/' matches any column name containing a slash — confirm intended
        'wind': ['wind', 'calm', '/']
    }

    for idx, col_name in enumerate(df.columns):
        # str() so that non-string column labels (e.g. integers) do not break .lower()
        col_lower = str(col_name).lower()
        matched = next(
            (category for category, keywords in categories.items()
             if any(kw.lower() in col_lower for kw in keywords)),
            None,
        )
        if matched is not None:
            data_dict.at[idx, 'Category'] = matched

    # Reorder columns for the exported sheet
    column_order = [
        'Variable Name', 'Category', 'Description', 'Data Type',
        'Non-Null Count', 'Source', 'Unique Values',
        'Min Value', 'Max Value', 'Range', 'Sample Values'
    ]
    data_dict = data_dict[column_order]

    # Export to Excel: dictionary on the first sheet, keyword legend on the second
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        data_dict.to_excel(writer, index=False, sheet_name='Data Dictionary')

        legend = pd.DataFrame({
            'Category': list(categories.keys()),
            'Keywords': [', '.join(kws) for kws in categories.values()]
        })
        legend.to_excel(writer, index=False, sheet_name='Category Legend')

    print(f"Data dictionary exported to: {filename}")
    return data_dict

# Usage: writes the workbook and returns the dictionary frame for display.
# NOTE(review): the saved output of this cell lists only YEAR … DATE (10 rows),
# without 'season' or the engineered columns that `df` has by this point — it
# looks stale; re-run after Restart Kernel → Run All.
create_data_dictionary(df)

Data dictionary exported to: NASA_data_data_dictionary.xlsx


Unnamed: 0,Variable Name,Category,Description,Data Type,Non-Null Count,Source,Unique Values,Min Value,Max Value,Range,Sample Values
0,YEAR,—,,int64,730,NASA,3,2020.0,2022.0,2020.0000 to 2022.0000,
1,DOY,—,,int64,730,NASA,366,1.0,366.0,1.0000 to 366.0000,
2,PAR,radiation,,float64,730,NASA,460,0.97,12.04,0.9700 to 12.0400,
3,UVA,radiation,,float64,730,NASA,125,0.14,1.58,0.1400 to 1.5800,
4,UVB,radiation,,float64,730,NASA,6,0.0,0.05,0.0000 to 0.0500,
5,UV_idx,radiation,,float64,730,NASA,213,0.19,2.78,0.1900 to 2.7800,
6,gwet_top,gwet,,float64,730,NASA,63,0.33,0.95,0.3300 to 0.9500,
7,gwet_root,gwet,,float64,730,NASA,63,0.37,0.99,0.3700 to 0.9900,
8,gwet_prof,gwet,,float64,730,NASA,60,0.37,0.96,0.3700 to 0.9600,
9,DATE,—,,datetime64[ns],730,NASA,730,,,,"2020-10-22 00:00:00, 2020-10-23 00:00:00, 2020..."
