In [None]:
# ============================================
# 📊 DATA EXPLORATION
# Climate Forecast Dataset - Initial Analysis
# ============================================

import pandas as pd
import numpy as np
import os

print("=" * 60)
print("üåç CLIMATE FORECAST DATASET - EXPLORATION")
print("=" * 60)

# ============================================
# Load Dataset
# ============================================
print("\nüìÇ LOADING DATA")
print("-" * 60)


def find_csv_files(directory):
    """Return the sorted names of the ``.csv`` files directly inside *directory*.

    A missing or unreadable directory yields an empty list instead of raising,
    so callers can fall through to another data source. Sorting makes the
    "first file" choice reproducible, because ``os.listdir`` returns entries
    in arbitrary order.
    """
    try:
        entries = os.listdir(directory)
    except OSError:
        return []
    return sorted(name for name in entries if name.endswith('.csv'))


# Data source priority: local data directory -> Kaggle download -> synthetic
# sample data. Whichever succeeds first defines `df` and `csv_file`.
data_dir = '/home/jovyan/data'
files = find_csv_files(data_dir)

if files:
    print(f"CSV files found in {data_dir}:")
    for f in files:
        file_path = os.path.join(data_dir, f)
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"  ‚úÖ {f} ({size_mb:.2f} MB)")

    # Only the first CSV (alphabetically) is loaded; the others are just listed.
    csv_file = files[0]
    df = pd.read_csv(os.path.join(data_dir, csv_file))
    print(f"\n‚úÖ Loaded: {csv_file} from local directory")
else:
    print("No CSV files found in data directory. Downloading from Kaggle...")
    try:
        # Imported lazily: kagglehub is only needed when no local copy exists.
        import kagglehub

        path = kagglehub.dataset_download("tarunrm09/climate-change-indicators")
        print(f"‚úÖ Dataset downloaded to: {path}")

        # Find the CSV file
        csv_files = find_csv_files(path)
        if not csv_files:
            raise FileNotFoundError("No CSV files found in downloaded dataset")

        csv_file = csv_files[0]
        csv_path = os.path.join(path, csv_file)
        df = pd.read_csv(csv_path)
        print(f"‚úÖ Loaded: {csv_file} from Kaggle")

    except Exception as e:
        # Deliberate best-effort: any failure (missing package, no network,
        # unreadable file) falls back to generated data so the rest of the
        # exploration can still run.
        print(f"‚ö†Ô∏è Failed to download/load data: {e}")
        print("Creating sample data for demonstration...")

        # Generate sample data; the fixed seed keeps the values reproducible.
        np.random.seed(42)
        countries = ['Brazil', 'USA', 'China', 'India', 'Germany']
        years = range(1960, 2024)
        df = pd.DataFrame([
            {
                'Country': country,
                'Year': year,
                'Temperature Change': np.random.normal(0.5, 0.2),
            }
            for country in countries
            for year in years
        ])
        csv_file = 'sample_climate_data.csv'
        print(f"‚úÖ Generated sample data: {csv_file}")

print(f"   Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

# ============================================
# Dataset Structure
# ============================================
# Each section prints its banner in a single call; sep="\n" puts every
# argument on its own line.
print("\nüìã DATASET STRUCTURE", "-" * 60, "\nColumn names and types:", sep="\n")
print(df.dtypes)

# Preview of the first rows.
print("\n" + "=" * 60, "üîç FIRST 10 ROWS", "=" * 60, sep="\n")
print(df.head(10))

# Summary statistics as produced by DataFrame.describe().
print("\n" + "=" * 60, "üìä BASIC STATISTICS", "=" * 60, sep="\n")
print(df.describe())

# Per-column count and percentage of missing cells, largest count first.
print("\n" + "=" * 60, "‚ùì MISSING VALUES", "=" * 60, sep="\n")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Percentage': missing_pct}
).sort_values(by='Missing Count', ascending=False)

# Show only the columns that actually contain missing values.
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

print("\n" + "=" * 60)
print("üó∫Ô∏è  UNIQUE VALUES (categorical columns)")
print("=" * 60)

# Columns with at most MAX_VALUES_LISTED distinct values are printed in full;
# larger ones only show the first SAMPLE_SIZE values.
MAX_VALUES_LISTED = 20
SAMPLE_SIZE = 5

# Check for categorical columns
for col in df.columns:
    column = df[col]
    # Treat object, pandas string and category dtypes as categorical; a bare
    # `dtype == 'object'` test would skip the latter two.
    is_categorical = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )
    if not is_categorical:
        continue

    # Drop missing values first so the listing matches the count
    # (nunique() ignores NaN by default, unique() does not).
    distinct = column.dropna().unique()
    unique_count = len(distinct)
    print(f"\n{col}:")
    print(f"  Unique values: {unique_count}")
    if unique_count <= MAX_VALUES_LISTED:
        # Small enough to show every value, not just the first few.
        print(f"  Values: {distinct}")
    else:
        print(f"  Sample values: {distinct[:SAMPLE_SIZE]}")

print("\n" + "=" * 60, "üìÖ DATE RANGE", "=" * 60, sep="\n")

# Heuristic: a column is treated as date-like when its name contains
# "year" or "date" (case-insensitive).
date_cols = [
    name for name in df.columns
    if any(token in name.lower() for token in ('year', 'date'))
]

if not date_cols:
    # Nothing matched: list every column so the reader can look for one manually.
    print("No obvious date columns found")
    print("\nAll columns:", df.columns.tolist())
else:
    # Report the smallest and largest value of each candidate column.
    for col in date_cols:
        print(f"\n{col}:")
        print(f"  Min: {df[col].min()}")
        print(f"  Max: {df[col].max()}")

# ============================================
# Save Metadata
# ============================================
print("\nüíæ SAVING METADATA")
print("-" * 60)

# Create data dictionary: file name, dimensions, column names, dtypes
# (as strings) and the per-column missing-value counts computed earlier.
metadata = {
    'filename': csv_file,
    'rows': df.shape[0],
    'columns': df.shape[1],
    'column_names': df.columns.tolist(),
    'column_types': df.dtypes.astype(str).to_dict(),
    'missing_values': missing.to_dict()
}

# Save as JSON
import json
metadata_path = '/home/jovyan/reports/dataset_metadata.json'
# Create the reports directory on first run; open() would otherwise fail
# with FileNotFoundError when it does not exist yet.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"‚úÖ Metadata saved to: {metadata_path}")

# Closing banner and the follow-up checklist, printed one line at a time.
for line in (
    "\n" + "=" * 60,
    "‚úÖ EXPLORATION COMPLETE",
    "=" * 60,
    "\nüìã NEXT STEPS:",
    "   1. Review column names and types",
    "   2. Design PostgreSQL table schema",
    "   3. Create init.sql script",
    "   4. Load data into database",
    "=" * 60,
):
    print(line)

üåç CLIMATE FORECAST DATASET - EXPLORATION

üìÇ LOADING DATA
------------------------------------------------------------
CSV files found in /home/jovyan/data:
  ‚úÖ climate_change_indicators.csv (0.18 MB)

‚úÖ Loaded: climate_change_indicators.csv
   Shape: 225 rows √ó 72 columns

üìã DATASET STRUCTURE
------------------------------------------------------------

Column names and types:
ObjectId       int64
Country       object
ISO2          object
ISO3          object
Indicator     object
              ...   
F2018        float64
F2019        float64
F2020        float64
F2021        float64
F2022        float64
Length: 72, dtype: object

üîç FIRST 10 ROWS
   ObjectId                       Country ISO2 ISO3  \
0         1  Afghanistan, Islamic Rep. of   AF  AFG   
1         2                       Albania   AL  ALB   
2         3                       Algeria   DZ  DZA   
3         4                American Samoa   AS  ASM   
4         5      Andorra, Principality of   AD  AND   