In [1]:
# Render the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

# Preprocessing

In [4]:
# Read the raw daily CSV into `df` and preview its first five rows
df = pd.read_csv('NASA_data_cumilla.csv')
df.head()

Unnamed: 0,YEAR,DOY,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB,ALLSKY_SFC_UV_INDEX,GWETTOP,GWETROOT,GWETPROF
0,2020,296,2.32,0.42,0.01,0.62,0.93,0.92,0.91
1,2020,297,1.14,0.21,0.01,0.31,0.97,0.96,0.94
2,2020,298,4.71,0.72,0.02,1.14,0.94,0.97,0.95
3,2020,299,7.02,0.98,0.03,1.58,0.94,0.96,0.95
4,2020,300,8.42,1.13,0.03,1.7,0.93,0.95,0.95


In [5]:
# Replace the long NASA parameter codes with the short labels used downstream
df = df.rename(
    columns=dict(
        ALLSKY_SFC_PAR_TOT='PAR',
        ALLSKY_SFC_UVA='UVA',
        ALLSKY_SFC_UVB='UVB',
        ALLSKY_SFC_UV_INDEX='UV_idx',
        GWETTOP='gwet_top',
        GWETROOT='gwet_root',
        GWETPROF='gwet_prof',
    )
)

In [6]:
# Check column dtypes and non-null counts (a count below the row total means missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   YEAR       730 non-null    int64  
 1   DOY        730 non-null    int64  
 2   PAR        730 non-null    float64
 3   UVA        730 non-null    float64
 4   UVB        730 non-null    float64
 5   UV_idx     730 non-null    float64
 6   gwet_top   730 non-null    float64
 7   gwet_root  730 non-null    float64
 8   gwet_prof  730 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 51.5 KB


In [7]:
# Build a calendar date from the year and day-of-year columns
# ('%Y' = four-digit year, '%j' = day of the year)
df['DATE'] = pd.to_datetime(
    df['YEAR'].astype(str).str.cat(df['DOY'].astype(str), sep=' '),
    format='%Y %j',
)

# Show the first converted rows, then the updated frame summary
print(df[['YEAR', 'DOY', 'DATE']].head())
df.info()

   YEAR  DOY       DATE
0  2020  296 2020-10-22
1  2020  297 2020-10-23
2  2020  298 2020-10-24
3  2020  299 2020-10-25
4  2020  300 2020-10-26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   YEAR       730 non-null    int64         
 1   DOY        730 non-null    int64         
 2   PAR        730 non-null    float64       
 3   UVA        730 non-null    float64       
 4   UVB        730 non-null    float64       
 5   UV_idx     730 non-null    float64       
 6   gwet_top   730 non-null    float64       
 7   gwet_root  730 non-null    float64       
 8   gwet_prof  730 non-null    float64       
 9   DATE       730 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(7), int64(2)
memory usage: 57.2 KB


In [8]:
# Create season variable
def assign_season(date: pd.Timestamp) -> str:
    """
    Return the custom season label ('<year>S<n>') that a date falls into.

    Season windows (both ends inclusive):
        S1: 22 Oct of the previous year – 25 Jan of the label year
        S2: 26 Jan – 25 Apr
        S3: 26 Apr – 31 Jul
        S4: 01 Aug – 21 Oct

    Notes
    -----
    * Dates from 22 Oct to 31 Dec are labelled with the *following* calendar
      year (e.g. 2020-10-22 → '2021S1').
    * 21 Oct is the last day of S4 of its own calendar year.
    * Consequently 2022-10-22 is labelled '2023S1'; filter it out later if
      only data up to 2022S4 is wanted.
    * A value whose month/day match no window (e.g. a missing timestamp,
      whose month and day are NaN) yields pd.NA.
    """
    year = date.year
    month_day = (date.month, date.day)

    # The comparisons below are ordered through the calendar year, so each
    # check only needs the upper bound of its window.

    # S1, late part: 22 Oct onwards belongs to next year's first season
    if month_day >= (10, 22):
        return f"{year + 1}S1"

    # S1, early part: 1 Jan – 25 Jan
    if month_day <= (1, 25):
        return f"{year}S1"

    # S2: 26 Jan – 25 Apr
    if month_day <= (4, 25):
        return f"{year}S2"

    # S3: 26 Apr – 31 Jul
    if month_day <= (7, 31):
        return f"{year}S3"

    # S4: 01 Aug – 21 Oct
    if month_day <= (10, 21):
        return f"{year}S4"

    # Nothing matched (every comparison is False for NaN month/day) → missing
    return pd.NA

# Label every row with its season, one date at a time
df["season"] = df["DATE"].map(assign_season)

In [9]:
# Quick sanity check: first labelled rows, then the number of days per season
# (missing labels are counted too)
preview_columns = ['YEAR', 'DOY', 'DATE', 'season']
print(df[preview_columns].head())

days_per_season = df["season"].value_counts(dropna=False)
print(days_per_season.sort_index())

   YEAR  DOY       DATE  season
0  2020  296 2020-10-22  2021S1
1  2020  297 2020-10-23  2021S1
2  2020  298 2020-10-24  2021S1
3  2020  299 2020-10-25  2021S1
4  2020  300 2020-10-26  2021S1
season
2021S1    96
2021S2    90
2021S3    97
2021S4    82
2022S1    96
2022S2    90
2022S3    97
2022S4    82
Name: count, dtype: int64


# Feature engineering

In [11]:
# Daily derived features.
# The two plain ratios can divide by zero, so their infinite results are
# turned into NaN as soon as the column is created.
df['PAR_x_gwet_root'] = df['PAR'].mul(df['gwet_root'])
df['UVB_div_gwet_top'] = (
    df['UVB'].div(df['gwet_top']).replace([np.inf, -np.inf], np.nan)
)
df['gwet_gradient'] = df['gwet_top'].sub(df['gwet_root'])

# 0/1 flags for days whose PAR exceeds the crop-type threshold
df['PAR_active_day_C3'] = df['PAR'].gt(3.48).astype(int)   # Threshold for C3 crops, e.g., potatoes and rice
df['PAR_active_day_C4'] = df['PAR'].gt(4.35).astype(int)   # Threshold for C4 crops, e.g., maize

df['UVA_UVB_ratio'] = (
    df['UVA'].div(df['UVB']).replace([np.inf, -np.inf], np.nan)
)

# Share of PAR in the combined PAR + UVA + UVB total
df['PAR_fraction'] = df['PAR'].div(df['PAR'].add(df['UVA']).add(df['UVB']))


In [12]:
# Cut-offs used to flag extreme days
# NOTE(review): the data dictionary output in this notebook reports a gwet_top
# minimum of 0.36, so with 0.3 no day is flagged as dry — revisit against
# domain literature.
dry_threshold = 0.3
# NOTE(review): WHO's UV index scale treats 0-2 as "low" and 6-7 as "high"; a
# cut-off of 2 therefore flags days above the "low" band — confirm this is the
# intended meaning of a "high UV" day.
high_UV_threshold = 2

# 0/1 indicators: topsoil wetness strictly below / UV index strictly above the cut-off
df['dry_day'] = df['gwet_top'].lt(dry_threshold).astype(int)
df['high_UV_day'] = df['UV_idx'].gt(high_UV_threshold).astype(int)

In [13]:
# Function to calculate maximum consecutive dry days
def max_consecutive_ones(s):
    """
    Return the length of the longest unbroken run of values equal to 1 in `s`.

    `s` is iterated in order; any value that is not equal to 1 ends the
    current run. An empty input, or one without any 1, gives 0.
    """
    longest = run_length = 0
    for flag in s:
        # Extend the run on a 1, otherwise start counting again from zero
        run_length = run_length + 1 if flag == 1 else 0
        longest = max(longest, run_length)
    return longest


In [14]:
# Define aggregation functions
def percentile(n):
    """
    Build an aggregation function that returns the n-th percentile, ignoring NaN.

    The returned function is named 'p<n>' (e.g. 'p95').
    """
    def _nan_percentile(values):
        return np.nanpercentile(values, n)

    _nan_percentile.__name__ = f'p{n}'
    return _nan_percentile

In [15]:
# Order rows chronologically; the consecutive-dry-day streak computed below
# depends on the row order inside each season
df = df.sort_values('DATE')

# Named-aggregation spec: output column -> (source column, reducer).
# Insertion order defines the column order of the result.
season_aggs = {}

# Radiation: seasonal total, mean and spread
for _col in ('PAR', 'UVA', 'UVB'):
    season_aggs[f'{_col}_total'] = (_col, 'sum')
    season_aggs[f'{_col}_mean'] = (_col, 'mean')
    season_aggs[f'{_col}_std'] = (_col, 'std')

# UV index
for _stat in ('mean', 'median', 'std'):
    season_aggs[f'UV_idx_{_stat}'] = ('UV_idx', _stat)

# Soil wetness layers: centre, spread and 95th / 5th percentiles
for _col in ('gwet_top', 'gwet_root', 'gwet_prof'):
    for _stat in ('mean', 'median', 'std'):
        season_aggs[f'{_col}_{_stat}'] = (_col, _stat)
    season_aggs[f'{_col}_95p'] = (_col, percentile(95))
    season_aggs[f'{_col}_5p'] = (_col, percentile(5))

# Interaction metrics
for _col in ('PAR_x_gwet_root', 'UVB_div_gwet_top'):
    for _stat in ('mean', 'std'):
        season_aggs[f'{_col}_{_stat}'] = (_col, _stat)

# Extreme events (the counts are converted to shares further down)
season_aggs['dry_days_count'] = ('dry_day', 'sum')
season_aggs['max_consecutive_dry_days'] = ('dry_day', max_consecutive_ones)
season_aggs['high_UV_days_count'] = ('high_UV_day', 'sum')
season_aggs['total_days'] = ('DATE', 'count')

# Soil moisture metrics
for _stat in ('mean', 'std'):
    season_aggs[f'gwet_gradient_{_stat}'] = ('gwet_gradient', _stat)

# Radiation metrics
season_aggs['C3_PAR_active_days_count'] = ('PAR_active_day_C3', 'sum')
season_aggs['C4_PAR_active_days_count'] = ('PAR_active_day_C4', 'sum')
for _col in ('UVA_UVB_ratio', 'PAR_fraction'):
    for _stat in ('mean', 'median', 'std'):
        season_aggs[f'{_col}_{_stat}'] = (_col, _stat)

# One row per season
summary = df.groupby('season').agg(**season_aggs).reset_index()

# Turn each day count into a share of the season's days, then drop the raw counts
count_prefixes = ('dry_days', 'high_UV_days', 'C3_PAR_active_days', 'C4_PAR_active_days')
for _prefix in count_prefixes:
    summary[f'{_prefix}_pct'] = summary[f'{_prefix}_count'] / summary['total_days']

summary = summary.drop(
    columns=[f'{_prefix}_count' for _prefix in count_prefixes] + ['total_days']
)

In [16]:
# Display the enhanced summary with every column visible.
# option_context puts the previous 'display.max_columns' value back when the
# block exits (even on error); set_option/reset_option would instead force the
# pandas default and skip the reset entirely if the display step failed.
# `display` is the rich-output helper the IPython kernel makes available in notebooks.
with pd.option_context('display.max_columns', None):
    display(summary)

Unnamed: 0,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,UVB_std,UV_idx_mean,UV_idx_median,UV_idx_std,gwet_top_mean,gwet_top_median,gwet_top_std,gwet_top_95p,gwet_top_5p,gwet_root_mean,gwet_root_median,gwet_root_std,gwet_root_95p,gwet_root_5p,gwet_prof_mean,gwet_prof_median,gwet_prof_std,gwet_prof_95p,gwet_prof_5p,PAR_x_gwet_root_mean,PAR_x_gwet_root_std,UVB_div_gwet_top_mean,UVB_div_gwet_top_std,max_consecutive_dry_days,gwet_gradient_mean,gwet_gradient_std,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,2021S1,649.22,6.762708,1.540493,86.12,0.897083,0.218954,1.94,0.020208,0.006152,1.191458,1.21,0.329606,0.825521,0.82,0.070753,0.9325,0.72,0.809167,0.8,0.093962,0.96,0.67,0.806979,0.8,0.092191,0.9425,0.67,5.508978,1.58912,0.024351,0.00664,0,0.016354,0.024414,45.800347,47.0,8.989553,0.880423,0.880881,0.009936,0.0,0.0,0.958333,0.947917
1,2021S2,782.12,8.690222,1.377415,99.55,1.106111,0.183993,2.53,0.028111,0.006516,1.662667,1.67,0.396419,0.497111,0.49,0.10425,0.68,0.37,0.507889,0.5,0.072739,0.6355,0.42,0.512,0.505,0.07188,0.64,0.4245,4.351663,0.570614,0.060559,0.023205,0,-0.010778,0.035703,40.363333,39.5,6.397432,0.8846,0.884462,0.00467,0.0,0.211111,1.0,1.0
2,2021S3,803.63,8.284845,2.079833,107.52,1.108454,0.277989,3.06,0.031546,0.008458,1.858969,2.0,0.504836,0.709381,0.77,0.178908,0.91,0.37,0.677732,0.71,0.169537,0.892,0.41,0.666082,0.7,0.166955,0.88,0.42,5.520645,1.786025,0.048607,0.021123,0,0.031649,0.047777,35.475773,35.0,4.160313,0.878879,0.877748,0.007155,0.0,0.484536,0.969072,0.948454
3,2021S4,677.79,8.265732,1.28178,95.38,1.163171,0.161152,2.78,0.033902,0.005612,1.962683,2.02,0.28132,0.911829,0.91,0.010199,0.93,0.9,0.922805,0.92,0.010456,0.94,0.91,0.91561,0.92,0.00904,0.93,0.9,7.623816,1.163924,0.037198,0.006238,0,-0.010976,0.006958,34.635163,34.0,3.762055,0.873123,0.874807,0.005174,0.0,0.512195,1.0,1.0
4,2022S1,647.0,6.739583,1.566806,87.68,0.913333,0.192221,1.99,0.020729,0.005281,1.208854,1.185,0.2962,0.819167,0.82,0.043569,0.9,0.75,0.797917,0.8,0.057782,0.9025,0.7075,0.795625,0.79,0.058756,0.9025,0.7,5.402579,1.42079,0.025263,0.005966,0,0.02125,0.018079,44.838542,45.5,6.122886,0.876964,0.879301,0.008214,0.0,0.0,0.927083,0.90625
5,2022S2,764.15,8.490556,1.695359,98.18,1.090889,0.236052,2.34,0.026,0.007465,1.583889,1.685,0.422501,0.558222,0.525,0.124066,0.74,0.4,0.552778,0.53,0.091074,0.69,0.44,0.555556,0.54,0.088598,0.69,0.4445,4.628088,0.936619,0.05042,0.02045,0,0.005444,0.038399,43.630556,42.416667,7.923647,0.88402,0.884075,0.004415,0.0,0.122222,0.988889,0.966667
6,2022S3,810.57,8.356392,2.192011,108.25,1.115979,0.295366,3.02,0.031134,0.008765,1.834433,1.91,0.542056,0.777526,0.84,0.150817,0.914,0.43,0.747938,0.82,0.152288,0.912,0.458,0.737216,0.8,0.152867,0.902,0.46,6.226572,2.108108,0.042287,0.017017,0,0.029588,0.044556,36.202749,36.0,4.262643,0.879051,0.87934,0.00649,0.0,0.42268,0.969072,0.969072
7,2022S4,711.13,8.672317,1.360579,100.16,1.221463,0.165359,2.89,0.035244,0.006131,2.084756,2.135,0.309708,0.875976,0.88,0.022491,0.91,0.84,0.867195,0.87,0.030481,0.91,0.8105,0.861829,0.86,0.030108,0.9,0.81,7.513655,1.176957,0.040316,0.007315,0,0.00878,0.014605,35.181911,34.0,4.28703,0.87287,0.874084,0.004921,0.0,0.634146,0.987805,0.987805


In [17]:
# Label district information for the summary set.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run.
if 'location' not in summary.columns:
    summary.insert(0, 'location', 'Cumilla')

In [18]:
# Check insert result: 'location' should now be the first column
summary.head()

Unnamed: 0,location,season,PAR_total,PAR_mean,PAR_std,UVA_total,UVA_mean,UVA_std,UVB_total,UVB_mean,...,UVA_UVB_ratio_mean,UVA_UVB_ratio_median,UVA_UVB_ratio_std,PAR_fraction_mean,PAR_fraction_median,PAR_fraction_std,dry_days_pct,high_UV_days_pct,C3_PAR_active_days_pct,C4_PAR_active_days_pct
0,Cumilla,2021S1,649.22,6.762708,1.540493,86.12,0.897083,0.218954,1.94,0.020208,...,45.800347,47.0,8.989553,0.880423,0.880881,0.009936,0.0,0.0,0.958333,0.947917
1,Cumilla,2021S2,782.12,8.690222,1.377415,99.55,1.106111,0.183993,2.53,0.028111,...,40.363333,39.5,6.397432,0.8846,0.884462,0.00467,0.0,0.211111,1.0,1.0
2,Cumilla,2021S3,803.63,8.284845,2.079833,107.52,1.108454,0.277989,3.06,0.031546,...,35.475773,35.0,4.160313,0.878879,0.877748,0.007155,0.0,0.484536,0.969072,0.948454
3,Cumilla,2021S4,677.79,8.265732,1.28178,95.38,1.163171,0.161152,2.78,0.033902,...,34.635163,34.0,3.762055,0.873123,0.874807,0.005174,0.0,0.512195,1.0,1.0
4,Cumilla,2022S1,647.0,6.739583,1.566806,87.68,0.913333,0.192221,1.99,0.020729,...,44.838542,45.5,6.122886,0.876964,0.879301,0.008214,0.0,0.0,0.927083,0.90625


In [19]:
# Export the seasonal summary to CSV (index=False leaves the row index out of the file)
summary.to_csv('NASA_summary_cumilla.csv', index=False)

In [20]:
# Generate a data dictionary
def create_data_dictionary(df, filename="NASA_data_data_dictionary.xlsx", source="NASA"):
    """
    Generate a detailed data dictionary for `df` and export it to Excel

    One row is produced per column of `df`, holding its name, keyword-based
    category, dtype, non-null count, number of unique values and either its
    min/max/range (numeric columns) or up to three sample values (all other
    columns).

    Parameters:
    df: Input DataFrame
    filename: Output Excel filename. Pass None to skip the export and only
        build and return the dictionary.
    source: Value written to the 'Source' column for every variable

    Returns:
    The data dictionary as a DataFrame. 'Description' is left empty as a
    placeholder to be filled in by hand.
    """
    n_columns = len(df.columns)

    # Create base dictionary
    data_dict = pd.DataFrame({
        'Variable Name': df.columns,
        'Data Type': df.dtypes.values,
        'Non-Null Count': df.count().values,
        'Description': [''] * n_columns,  # Placeholder for explanations
        'Source': [source] * n_columns
    })

    # Add metadata
    data_dict['Unique Values'] = df.nunique().values

    # Initialize additional columns
    data_dict['Min Value'] = np.nan
    data_dict['Max Value'] = np.nan
    data_dict['Range'] = ''
    data_dict['Sample Values'] = ''

    # Numeric columns get min / max / range; everything else gets sample values
    for idx, col in enumerate(df.columns):
        if pd.api.types.is_numeric_dtype(df[col]):
            min_val = df[col].min()
            max_val = df[col].max()
            data_dict.at[idx, 'Min Value'] = min_val
            data_dict.at[idx, 'Max Value'] = max_val
            data_dict.at[idx, 'Range'] = f"{min_val:.4f} to {max_val:.4f}"
        else:
            unique_vals = df[col].unique()
            sample = ', '.join(map(str, unique_vals[:3]))
            if len(unique_vals) > 3:
                sample += ', ...'
            data_dict.at[idx, 'Sample Values'] = sample

    # Add automatic categorization: a column gets the first category that has
    # a keyword occurring (case-insensitively) anywhere in the column name
    data_dict['Category'] = '—'
    categories = {
        'radiation': ['PAR', 'UV'],
        'gwet': ['gwet'],
        'temperature': ['temp'],
        'gdd': ['gdd'],
        'thi': ['thi'],
        'humidity': ['rh', 'humidity'],
        'vpd': ['vpd'],
        'rainfall': ['rain', 'dry_days', 'wet_days'],
        'wind': ['wind', 'calm', '/']
    }

    for idx, col_name in enumerate(df.columns):
        # str() so that non-string column labels (e.g. integers) do not crash
        col_lower = str(col_name).lower()
        for category, keywords in categories.items():
            if any(kw.lower() in col_lower for kw in keywords):
                data_dict.at[idx, 'Category'] = category
                break

    # Reorder columns
    column_order = [
        'Variable Name', 'Category', 'Description', 'Data Type',
        'Non-Null Count', 'Source', 'Unique Values',
        'Min Value', 'Max Value', 'Range', 'Sample Values'
    ]
    data_dict = data_dict[column_order]

    # Export to Excel (skipped when no filename is given)
    if filename is not None:
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            data_dict.to_excel(writer, index=False, sheet_name='Data Dictionary')

            # Add category legend to second sheet
            legend = pd.DataFrame({
                'Category': list(categories.keys()),
                'Keywords': [', '.join(kws) for kws in categories.values()]
            })
            legend.to_excel(writer, index=False, sheet_name='Category Legend')

        print(f"Data dictionary exported to: {filename}")

    return data_dict

# Usage: document the daily frame `df`; the returned dictionary is shown as the cell result
create_data_dictionary(df)

Data dictionary exported to: NASA_data_data_dictionary.xlsx


Unnamed: 0,Variable Name,Category,Description,Data Type,Non-Null Count,Source,Unique Values,Min Value,Max Value,Range,Sample Values
0,YEAR,—,,int64,730,NASA,3,2020.0,2022.0,2020.0000 to 2022.0000,
1,DOY,—,,int64,730,NASA,366,1.0,366.0,1.0000 to 366.0000,
2,PAR,radiation,,float64,730,NASA,437,1.14,12.22,1.1400 to 12.2200,
3,UVA,radiation,,float64,730,NASA,118,0.21,1.62,0.2100 to 1.6200,
4,UVB,radiation,,float64,730,NASA,5,0.01,0.05,0.0100 to 0.0500,
5,UV_idx,radiation,,float64,730,NASA,203,0.29,2.86,0.2900 to 2.8600,
6,gwet_top,gwet,,float64,730,NASA,61,0.36,0.97,0.3600 to 0.9700,
7,gwet_root,gwet,,float64,730,NASA,57,0.41,0.97,0.4100 to 0.9700,
8,gwet_prof,gwet,,float64,730,NASA,55,0.41,0.95,0.4100 to 0.9500,
9,DATE,—,,datetime64[ns],730,NASA,730,,,,"2020-10-22 00:00:00, 2020-10-23 00:00:00, 2020..."
