In [4]:
import pandas as pd
import numpy as np
from typing import Dict, Any

In [9]:
class DataCollector:
    """
    Load the temperature and CO2 emission CSVs and join them on ``Year``.

    Loading is best-effort: the ``collect_*`` methods print the error and
    return an empty DataFrame when a source cannot be read or cleaned, and in
    that case the corresponding attribute on the instance is left untouched.
    ``merge_datasets`` raises a ``ValueError`` if either dataset is unavailable.
    """

    def __init__(self, temperature_source: str = None, co2_source: str = None):
        """
        Initialize data collector with optional custom data sources

        Args:
            temperature_source (str): Path or URL to temperature data.
                Falls back to the repository-hosted CSV when omitted.
            co2_source (str): Path or URL to CO2 emissions data.
                Falls back to the repository-hosted CSV when omitted.
        """
        self.temperature_data = None
        self.co2_data = None

        # Effective sources: the caller's value if given, otherwise the default URL.
        self.default_temp_source = temperature_source or "https://raw.githubusercontent.com/Desmondonam/climate_change/refs/heads/main/data/observed-annual-average.csv"
        self.default_co2_source = co2_source or "https://raw.githubusercontent.com/Desmondonam/climate_change/refs/heads/main/data/annual-co2-emissions-per-country.csv"

    @staticmethod
    def _load_yearly_csv(source: str, year_column: str) -> pd.DataFrame:
        """Read a CSV, parse ``year_column`` into a datetime ``Year`` column, drop incomplete rows."""
        frame = pd.read_csv(source)
        frame['Year'] = pd.to_datetime(frame[year_column], format='%Y')
        # Non-inplace dropna so a partially processed frame is never exposed.
        return frame.dropna()

    def collect_temperature_data(self) -> pd.DataFrame:
        """
        Collect global temperature data

        The year is read from the ``Category`` column of the source file.

        Returns:
            pd.DataFrame: Cleaned temperature dataset, or an empty DataFrame
            if the source could not be read or cleaned.
        """
        try:
            cleaned = self._load_yearly_csv(self.default_temp_source, 'Category')
        except Exception as e:
            print(f"Error collecting temperature data: {e}")
            return pd.DataFrame()

        # Only publish the frame once it has been fully cleaned.
        self.temperature_data = cleaned
        return cleaned

    def collect_co2_emissions(self) -> pd.DataFrame:
        """
        Collect CO2 emissions data

        The year is read from (and written back to) the ``Year`` column.

        Returns:
            pd.DataFrame: Cleaned CO2 emissions dataset, or an empty DataFrame
            if the source could not be read or cleaned.
        """
        try:
            cleaned = self._load_yearly_csv(self.default_co2_source, 'Year')
        except Exception as e:
            print(f"Error collecting CO2 emissions data: {e}")
            return pd.DataFrame()

        # Only publish the frame once it has been fully cleaned.
        self.co2_data = cleaned
        return cleaned

    def merge_datasets(self) -> pd.DataFrame:
        """
        Merge temperature and CO2 emissions datasets

        Any dataset that has not been loaded yet is collected first. The join
        is an inner join on ``Year``, so only years present in both sources
        are kept.

        Returns:
            pd.DataFrame: Combined dataset

        Raises:
            ValueError: If either dataset could not be loaded.
        """
        if self.temperature_data is None:
            self.collect_temperature_data()

        if self.co2_data is None:
            self.collect_co2_emissions()

        # A failed collection leaves the attribute as None; fail with a clear
        # message instead of an unrelated error from inside pd.merge.
        unavailable = [
            label
            for label, frame in (
                ("temperature", self.temperature_data),
                ("CO2 emissions", self.co2_data),
            )
            if frame is None
        ]
        if unavailable:
            raise ValueError(
                f"Cannot merge datasets: failed to load {' and '.join(unavailable)} data"
            )

        return pd.merge(
            self.temperature_data,
            self.co2_data,
            on='Year',
            how='inner'
        )

In [10]:
# Build a collector that uses the default GitHub-hosted CSV sources, then join
# the temperature and CO2 frames on 'Year' (merge_datasets loads whichever
# dataset has not been collected yet).
collector = DataCollector()
climate_data = collector.merge_datasets()

In [11]:
# Display the merged frame; pandas elides the middle rows in the rendered output.
climate_data

Unnamed: 0,Category,Annual Mean,5-yr smooth,Year,Entity,Code,Annual CO₂ emissions
0,1950,21.98,22.34,1950-01-01,Tanzania,TZA,498304.0
1,1951,22.29,22.36,1951-01-01,Tanzania,TZA,447008.0
2,1952,22.57,22.37,1952-01-01,Tanzania,TZA,538608.0
3,1953,22.61,22.39,1953-01-01,Tanzania,TZA,527616.0
4,1954,22.45,22.40,1954-01-01,Tanzania,TZA,538608.0
...,...,...,...,...,...,...,...
69,2019,23.09,23.02,2019-01-01,Tanzania,TZA,16377151.0
70,2020,23.00,23.02,2020-01-01,Tanzania,TZA,16250422.0
71,2021,22.94,23.03,2021-01-01,Tanzania,TZA,18266576.0
72,2022,22.86,23.03,2022-01-01,Tanzania,TZA,17735866.0


In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class DataPreprocessor:
    """
    Turn the merged climate frame into a scaled train/test split.

    After ``prepare_ml_dataset`` has run, the instance exposes ``X_train``,
    ``X_test`` (scaled feature matrices), ``y_train``, ``y_test`` (target
    values), ``scaler`` (the scaler fitted on the training rows) and
    ``feature_names`` (column order of the feature matrices).
    """

    # Candidate model inputs; the prediction target is removed from this list
    # when the ML dataset is prepared.
    FEATURE_COLUMNS = (
        'Annual Mean', 'Annual CO₂ emissions',
        'temp_5yr_avg', 'co2_5yr_avg',
        'temp_last_year', 'co2_last_year',
    )

    def __init__(self, data: pd.DataFrame):
        """
        Initialize data preprocessor

        Args:
            data (pd.DataFrame): Input dataset. It is never modified; all
                processing happens on a copy.
        """
        self.original_data = data
        self.preprocessed_data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Populated by prepare_ml_dataset().
        self.scaler = None
        self.feature_names = None

    def feature_engineering(self) -> pd.DataFrame:
        """
        Perform feature engineering

        Adds 5-row rolling means and 1-row lags of 'Annual Mean' and
        'Annual CO₂ emissions', then drops every row containing a missing
        value (which includes the leading rows where the rolling window and
        the lag are not yet defined).

        Returns:
            pd.DataFrame: Engineered features
        """
        df = self.original_data.copy()

        # Create rolling averages.
        # NOTE(review): the rolling window includes the current row, so
        # 'temp_5yr_avg' still partially encodes 'Annual Mean' for the same
        # year — consider lagging it if 'Annual Mean' is the target.
        df['temp_5yr_avg'] = df['Annual Mean'].rolling(window=5).mean()
        df['co2_5yr_avg'] = df['Annual CO₂ emissions'].rolling(window=5).mean()

        # Lag features
        df['temp_last_year'] = df['Annual Mean'].shift(1)
        df['co2_last_year'] = df['Annual CO₂ emissions'].shift(1)

        return df.dropna()

    def scale_features(self, features: list) -> np.ndarray:
        """
        Scale numerical features

        Standardises the requested columns of the engineered dataset using
        statistics computed over all of its rows. Feature engineering is run
        on demand if it has not been run yet.

        Args:
            features (list): List of feature column names

        Returns:
            np.ndarray: Scaled feature matrix
        """
        if self.preprocessed_data is None:
            # Allow calling this method before prepare_ml_dataset().
            self.preprocessed_data = self.feature_engineering()

        scaler = StandardScaler()
        return scaler.fit_transform(self.preprocessed_data[features])

    def prepare_ml_dataset(self, target_column: str, test_size: float = 0.2):
        """
        Prepare dataset for machine learning

        The target column is excluded from the feature matrix, and the scaler
        is fitted on the training rows only and then applied to the test
        rows, so that neither the target nor test-set statistics leak into
        training.

        Args:
            target_column (str): Column to predict
            test_size (float): Proportion of test dataset

        Raises:
            ValueError: If no rows remain after feature engineering.
        """
        self.preprocessed_data = self.feature_engineering()

        if self.preprocessed_data.empty:
            raise ValueError(
                "No rows left after feature engineering; the 5-row rolling "
                "window and the lag features need more complete rows than "
                "the input provides"
            )

        # Select features and target; never feed the target to the model as an input.
        features = [col for col in self.FEATURE_COLUMNS if col != target_column]

        X = self.preprocessed_data[features]
        y = self.preprocessed_data[target_column]

        # Split data before scaling so the scaler only ever sees training rows.
        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Keep the fitted scaler so new data can be transformed consistently later.
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(X_train_raw)
        self.X_test = self.scaler.transform(X_test_raw)
        self.feature_names = features

In [19]:
# Wrap the merged frame in a preprocessor; nothing is computed until
# prepare_ml_dataset() is called.
# NOTE(review): the next cell repeats this assignment, so this cell is redundant.
preprocessor = DataPreprocessor(climate_data)

In [21]:
# Re-create the preprocessor and build the train/test split with 'Annual Mean'
# as the prediction target; the results are stored on
# preprocessor.X_train / X_test / y_train / y_test.
preprocessor = DataPreprocessor(climate_data)
preprocessor.prepare_ml_dataset(target_column='Annual Mean')