# 01_data_processing

This notebook contains comprehensive data ingestion and preprocessing examples for biomass data analysis.

## 1. Import Dependencies and Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from pathlib import Path

# Setup plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 2. Enhanced Data Loader Class

In [None]:
class EnhancedBiomassLoader:
    """Create, read and combine small biomass datasets kept in one directory.

    Every ``filename`` argument is resolved relative to ``data_dir``.

    Args:
        data_dir: Directory used for all reads and writes. It is created,
            together with any missing parent directories, on construction.
    """

    # Column order of the DataFrame built from the JSON readings; also used so
    # that an empty readings list still yields a frame with the expected schema.
    _JSON_COLUMNS = ['site_id', 'biomass', 'latitude', 'longitude', 'quality_flag']

    def __init__(self, data_dir="data"):
        self.data_dir = Path(data_dir)
        # parents=True lets callers pass a nested path (e.g. "runs/demo/data")
        # without creating the intermediate directories themselves.
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def load_sample(self):
        """Returns a small list representing simplified pixel biomass values."""
        return [1.0, 2.5, 3.2, 0.8, 2.1]

    def create_sample_csv(self, filename="sample_biomass.csv"):
        """Write a 100-row synthetic CSV and return its path.

        Columns: pixel_id (1..100), biomass_value (normal(2.5, 1.0) clipped to
        [0.1, 5.0]), latitude, longitude and vegetation_type. Values are drawn
        from NumPy's global random generator, so they are reproducible only
        if the caller seeds it.

        Args:
            filename: File name inside ``data_dir``; an existing file is
                overwritten.

        Returns:
            pathlib.Path of the written file.
        """
        data = {
            'pixel_id': range(1, 101),
            'biomass_value': np.random.normal(2.5, 1.0, 100).clip(0.1, 5.0),
            'latitude': np.random.uniform(40.0, 45.0, 100),
            'longitude': np.random.uniform(-75.0, -70.0, 100),
            'vegetation_type': np.random.choice(['Forest', 'Grassland', 'Shrubland', 'Wetland'], 100)
        }
        filepath = self.data_dir / filename
        pd.DataFrame(data).to_csv(filepath, index=False)
        return filepath

    def create_sample_json(self, filename="sample_biomass.json"):
        """Write a synthetic JSON file with 50 site readings and return its path.

        The document has a "metadata" object and a "biomass_readings" list whose
        items carry site_id, biomass, coordinates (lat/lon) and quality_flag.
        Biomass is drawn from normal(3.0, 1.5) without clipping, so values
        outside the CSV sample's [0.1, 5.0] range (including negatives) occur.

        Args:
            filename: File name inside ``data_dir``; an existing file is
                overwritten.

        Returns:
            pathlib.Path of the written file.
        """
        data = {
            "metadata": {
                "dataset_name": "Biomass Sample Data",
                "collection_date": "2024-01-15",
                # "\u00b2" is the superscript-two character; written as an
                # escape so the literal cannot be mangled by a wrong file encoding.
                "units": "kg/m\u00b2"
            },
            "biomass_readings": [
                {
                    "site_id": f"S{i:03d}",
                    # Cast NumPy scalars to built-in types so the payload is
                    # plain JSON-serialisable data.
                    "biomass": float(round(np.random.normal(3.0, 1.5), 2)),
                    "coordinates": {
                        "lat": float(round(np.random.uniform(40.0, 45.0), 4)),
                        "lon": float(round(np.random.uniform(-75.0, -70.0), 4))
                    },
                    "quality_flag": str(np.random.choice(["good", "fair", "poor"], p=[0.8, 0.15, 0.05]))
                } for i in range(50)
            ]
        }
        filepath = self.data_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        return filepath

    def load_csv(self, filename):
        """Load biomass data from a CSV file in ``data_dir``.

        Args:
            filename: File name inside ``data_dir``.

        Returns:
            pandas.DataFrame with the file contents.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        filepath = self.data_dir / filename
        if not filepath.exists():
            raise FileNotFoundError(f"CSV file not found: {filepath}")
        return pd.read_csv(filepath)

    def load_json(self, filename):
        """Load biomass data from a JSON file in ``data_dir``.

        Args:
            filename: File name inside ``data_dir``.

        Returns:
            The decoded JSON document (a dict for files written by
            ``create_sample_json``).

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        filepath = self.data_dir / filename
        if not filepath.exists():
            raise FileNotFoundError(f"JSON file not found: {filepath}")
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def load_multiple_formats(self, csv_file=None, json_file=None):
        """Load the requested files and return them as DataFrames keyed by format.

        Args:
            csv_file: Optional CSV file name; loaded under key 'csv'.
            json_file: Optional JSON file name; its "biomass_readings" list is
                flattened into one row per reading and stored under key 'json'.

        Returns:
            dict mapping 'csv' and/or 'json' to a pandas.DataFrame. Formats
            whose argument is omitted (or empty) are absent from the dict.

        Raises:
            FileNotFoundError: If a requested file does not exist.
            KeyError: If the JSON document lacks "biomass_readings" or a
                reading lacks one of the expected fields.
        """
        datasets = {}

        if csv_file:
            datasets['csv'] = self.load_csv(csv_file)

        if json_file:
            json_data = self.load_json(json_file)
            # Flatten the nested "coordinates" object so each reading becomes
            # one flat row.
            records = [
                {
                    'site_id': reading['site_id'],
                    'biomass': reading['biomass'],
                    'latitude': reading['coordinates']['lat'],
                    'longitude': reading['coordinates']['lon'],
                    'quality_flag': reading['quality_flag'],
                }
                for reading in json_data['biomass_readings']
            ]
            datasets['json'] = pd.DataFrame(records, columns=self._JSON_COLUMNS)

        return datasets

## 3. Initialize Loader and Create Sample Data

In [None]:
# Build a loader with its default data directory, then write each
# demonstration file and report where it was saved.
loader = EnhancedBiomassLoader()

csv_file = loader.create_sample_csv()
print(f"Sample CSV created: {csv_file}")

json_file = loader.create_sample_json()
print(f"Sample JSON created: {json_file}")

## 4. Data Loading and Basic Exploration

In [None]:
# Read both sample files back through the loader; the result is a dict of
# DataFrames keyed by source format.
datasets = loader.load_multiple_formats(
    csv_file="sample_biomass.csv",
    json_file="sample_biomass.json",
)
csv_data, json_data = datasets['csv'], datasets['json']

# Size and leading rows of the CSV-derived frame.
print("=== CSV Data Overview ===")
print(f"Shape: {csv_data.shape}")
print("\nFirst 5 rows:")
print(csv_data.head())

# Same overview for the JSON-derived frame.
print("\n=== JSON Data Overview ===")
print(f"Shape: {json_data.shape}")
print("\nFirst 5 rows:")
print(json_data.head())

## 5. Comprehensive Data Exploration

In [None]:
def explore_biomass_data(df, dataset_name):
    """Print a plain-text report describing a biomass DataFrame.

    The report contains, in order: a banner, the frame's shape and dtypes,
    ``describe()`` statistics, per-column missing-value counts, summary
    statistics of the biomass column (only when one is present) and the value
    counts of every object-dtype column.

    Args:
        df: DataFrame to describe. The biomass column is looked up as
            'biomass_value' first and as 'biomass' otherwise.
        dataset_name: Label shown upper-cased in the banner.

    Returns:
        None; everything is written to stdout.
    """
    # Banner
    print(f"\n{'='*50}")
    print(f"EXPLORING {dataset_name.upper()} DATASET")
    print(f"{'='*50}")

    # Shape and column types
    print(f"Dataset Shape: {df.shape}")
    print(f"\nData Types:")
    print(df.dtypes)

    # Descriptive statistics
    print(f"\nStatistical Summary:")
    print(df.describe())

    # Null count per column
    print(f"\nMissing Values:")
    print(df.isnull().sum())

    # Prefer 'biomass_value'; fall back to 'biomass' when it is absent.
    if 'biomass_value' in df.columns:
        biomass_col = 'biomass_value'
    else:
        biomass_col = 'biomass'

    if biomass_col in df.columns:
        values = df[biomass_col]
        print(f"\nBiomass-Specific Analysis:")
        print(f"Sample size: {len(values)}")
        print(f"Data range: {values.min():.2f} - {values.max():.2f}")
        mean_value = values.mean()
        print(f"Mean biomass: {mean_value:.2f}")
        print(f"Median biomass: {values.median():.2f}")
        std_value = values.std()
        print(f"Standard deviation: {std_value:.2f}")
        # Relative spread: standard deviation as a percentage of the mean.
        print(f"Coefficient of variation: {(std_value / mean_value * 100):.2f}%")

    # Frequency table for each object-dtype column
    for col in df.select_dtypes(include=['object']).columns:
        print(f"\n{col} value counts:")
        print(df[col].value_counts())

# Print the exploration report once per source, CSV first.
explore_biomass_data(df=csv_data, dataset_name="CSV")
explore_biomass_data(df=json_data, dataset_name="JSON")

## 6. Data Visualization for Biomass Distribution

In [None]:
def visualize_biomass_distribution(csv_df, json_df):
    """Draw distribution plots for the CSV- and JSON-derived biomass data.

    Figure 1 is a 2x3 grid: one histogram per dataset, a box-plot comparison,
    a violin-plot comparison, the empirical CDF of each dataset and, when
    ``csv_df`` has a 'vegetation_type' column, the mean biomass per vegetation
    type (the panel is hidden otherwise).

    Figure 2 is drawn only when ``json_df`` has a 'quality_flag' column. It
    shows the share of each quality flag and the biomass spread per flag.

    Args:
        csv_df: DataFrame with a 'biomass_value' column.
        json_df: DataFrame with a 'biomass' column.

    Returns:
        None; both figures are rendered with ``plt.show()``.
    """
    source_labels = ['CSV', 'JSON']

    # Set up the plotting figure
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Biomass Data Distribution Analysis', fontsize=16, fontweight='bold')

    # Extract biomass columns
    csv_biomass = csv_df['biomass_value']
    json_biomass = json_df['biomass']
    biomass_data = [csv_biomass, json_biomass]

    # 1. Histograms (same styling for both sources, different colour/title)
    histogram_specs = [
        (axes[0, 0], csv_biomass, 'skyblue', 'CSV Data - Biomass Distribution'),
        (axes[0, 1], json_biomass, 'lightcoral', 'JSON Data - Biomass Distribution'),
    ]
    for ax, values, color, title in histogram_specs:
        ax.hist(values, bins=15, alpha=0.7, color=color, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel('Biomass Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # 2. Box plots
    # Tick labels are set on the axis instead of through boxplot(labels=...),
    # which newer matplotlib releases deprecate in favour of tick_labels.
    box_ax = axes[0, 2]
    box_ax.boxplot(biomass_data)
    box_ax.set_xticks([1, 2])  # boxplot places the boxes at positions 1..N
    box_ax.set_xticklabels(source_labels)
    box_ax.set_title('Biomass Distribution Comparison')
    box_ax.set_ylabel('Biomass Value')
    box_ax.grid(True, alpha=0.3)

    # 3. Violin plots
    violin_ax = axes[1, 0]
    sns.violinplot(data=biomass_data, ax=violin_ax)
    violin_ax.set_xticks([0, 1])  # seaborn places categories at 0..N-1
    violin_ax.set_xticklabels(source_labels)
    violin_ax.set_title('Biomass Density Distribution')
    violin_ax.set_ylabel('Biomass Value')

    # 4. Cumulative distribution function
    # The i-th smallest value (1-based) is assigned probability i/n, so each
    # curve ends at exactly 1.0.
    cdf_ax = axes[1, 1]
    for values, label in zip(biomass_data, ['CSV Data', 'JSON Data']):
        ordered = np.sort(values)
        n_values = len(ordered)
        cdf_ax.plot(ordered, np.arange(1, n_values + 1) / n_values,
                    label=label, linewidth=2)
    cdf_ax.set_title('Cumulative Distribution Function')
    cdf_ax.set_xlabel('Biomass Value')
    cdf_ax.set_ylabel('Cumulative Probability')
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    # 5. Categorical analysis (for CSV data)
    veg_ax = axes[1, 2]
    if 'vegetation_type' in csv_df.columns:
        vegetation_means = csv_df.groupby('vegetation_type')['biomass_value'].mean()
        veg_ax.bar(vegetation_means.index, vegetation_means.values,
                   color=['lightgreen', 'lightblue', 'lightyellow', 'lightpink'])
        veg_ax.set_title('Average Biomass by Vegetation Type')
        veg_ax.set_xlabel('Vegetation Type')
        veg_ax.set_ylabel('Average Biomass')
        veg_ax.tick_params(axis='x', rotation=45)
    else:
        # Nothing to draw here: hide the panel instead of leaving an empty frame.
        veg_ax.axis('off')

    fig.tight_layout()
    plt.show()

    # Additional visualization: Quality flag analysis for JSON data
    if 'quality_flag' in json_df.columns:
        quality_fig, (pie_ax, flag_box_ax) = plt.subplots(1, 2, figsize=(10, 6))

        # Colours are looked up per flag so a flag keeps its colour whatever
        # order value_counts() returns; unknown flags fall back to grey.
        flag_colors = {'good': 'lightgreen', 'fair': 'lightyellow', 'poor': 'lightcoral'}
        flag_counts = json_df['quality_flag'].value_counts()
        flag_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%',
                         colors=[flag_colors.get(flag, 'lightgray') for flag in flag_counts.index])
        pie_ax.set_title('Data Quality Distribution')

        sns.boxplot(data=json_df, x='quality_flag', y='biomass', ax=flag_box_ax)
        flag_box_ax.set_title('Biomass Distribution by Quality Flag')
        flag_box_ax.tick_params(axis='x', rotation=45)

        quality_fig.tight_layout()
        plt.show()

# Render the distribution figures for the two loaded datasets.
visualize_biomass_distribution(csv_df=csv_data, json_df=json_data)

## 7. Data Quality and Preprocessing

In [None]:
def preprocess_biomass_data(df, biomass_column):
    """Clean a biomass column and add derived features, printing a short log.

    Steps, in order:
      1. Fill missing values in ``biomass_column`` with the column median.
      2. Drop rows whose value lies outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
      3. Add 'biomass_category' (Very Low / Low / Medium / High, split at 1, 2
         and 3) and 'log_biomass' (natural log of the value plus 0.1).

    Args:
        df: Input DataFrame; it is not modified.
        biomass_column: Name of the numeric column to clean.

    Returns:
        A new DataFrame without the outlier rows and with the two extra columns.

    Raises:
        KeyError: If ``biomass_column`` is not a column of ``df``.
    """
    print("=== DATA PREPROCESSING ===")

    # Create a copy to avoid modifying original data
    processed_df = df.copy()

    # 1. Handle missing values
    missing_before = processed_df[biomass_column].isnull().sum()
    if missing_before > 0:
        print(f"Found {missing_before} missing values in {biomass_column}")
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the selected column: under pandas
        # Copy-on-Write that call updates a temporary object and leaves the
        # frame unchanged.
        median_value = processed_df[biomass_column].median()
        processed_df[biomass_column] = processed_df[biomass_column].fillna(median_value)
        print("Missing values filled with median")

    # 2. Remove outliers using IQR method
    values = processed_df[biomass_column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    rows_before = len(processed_df)
    # between() is inclusive on both ends. The explicit copy makes the filtered
    # frame independent, so adding columns below cannot trigger
    # SettingWithCopyWarning.
    processed_df = processed_df[values.between(lower_bound, upper_bound)].copy()
    outliers_removed = rows_before - len(processed_df)

    print(f"Removed {outliers_removed} outliers using IQR method")
    print(f"Data range after preprocessing: {processed_df[biomass_column].min():.2f} - {processed_df[biomass_column].max():.2f}")

    # 3. Add derived features
    # The outer bin edges are open-ended: values that pass the IQR filter but
    # are <= 0 or > 5 would otherwise fall outside every bin and be labelled NaN.
    processed_df['biomass_category'] = pd.cut(
        processed_df[biomass_column],
        bins=[-np.inf, 1, 2, 3, np.inf],
        labels=['Very Low', 'Low', 'Medium', 'High'],
    )

    # Negative readings are treated as zero before the transform so the
    # logarithm is always defined; the 0.1 offset avoids log(0).
    processed_df['log_biomass'] = np.log(processed_df[biomass_column].clip(lower=0) + 0.1)

    print("\nAdded derived features:")
    print("- biomass_category: Categorical classification")
    print("- log_biomass: Log-transformed values for normalization")

    return processed_df

# Clean each dataset with the shared preprocessing routine; a ruler line
# separates the two logs and another one precedes the completion message.
print("Processing CSV data:")
processed_csv = preprocess_biomass_data(df=csv_data, biomass_column='biomass_value')

print("\n" + "="*50 + "\n")

print("Processing JSON data:")
processed_json = preprocess_biomass_data(df=json_data, biomass_column='biomass')

print("\n" + "="*50)
print("PREPROCESSING COMPLETED SUCCESSFULLY!")

## 8. Summary and Export Processed Data

In [None]:
# Display final summary: row/column counts before and after preprocessing
print("=== FINAL SUMMARY ===")
print(f"Original CSV data shape: {csv_data.shape}")
print(f"Processed CSV data shape: {processed_csv.shape}")
print(f"Original JSON data shape: {json_data.shape}")
print(f"Processed JSON data shape: {processed_json.shape}")

# Export processed data
# Paths are built from loader.data_dir rather than a hard-coded "data/" prefix,
# so the export lands next to the sample files (in a directory the loader has
# already created) and the messages below always report the real location.
csv_export_path = loader.data_dir / 'processed_biomass_csv.csv'
json_export_path = loader.data_dir / 'processed_biomass_json.csv'
processed_csv.to_csv(csv_export_path, index=False)
processed_json.to_csv(json_export_path, index=False)

print("\nProcessed data exported to:")
# as_posix() keeps forward slashes in the message on every platform.
print(f"- {csv_export_path.as_posix()}")
print(f"- {json_export_path.as_posix()}")

print("\n" + "="*50)
print("DATA PROCESSING PIPELINE COMPLETED!")