In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## **Functions**

In [None]:
# Function to load data
def load_data(file_path):
    """Read the CSV at ``file_path`` and hand back its contents as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

In [None]:
# Function to summarize data
def summarize_data(df):
    """Print the dataset's shape, column info, and summary statistics.

    Args:
        df: DataFrame to describe.

    Returns:
        None. Everything is written to standard output.
    """
    print("Shape:", df.shape)
    print("\nInfo:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())

In [None]:
# Function to check missing values
def check_missing_values(df):
    """Tabulate, per column, how many values are missing and what share of rows that is.

    The result has one row per column of ``df`` with the columns
    "Missing Values" (null count) and "Percent" (null count / row count * 100),
    ordered from the most to the fewest missing values.
    """
    null_counts = df.isnull().sum()
    report = null_counts.to_frame("Missing Values")
    report["Percent"] = null_counts.div(len(df)).mul(100)
    return report.sort_values("Missing Values", ascending=False)

In [None]:
# Function to build a preprocessing pipeline
def build_pipeline(numeric_features, categorical_features):
    """
    Assemble the column-wise preprocessing step used ahead of a model.

    - Numeric columns: median imputation, then standard scaling.
    - Categorical columns: most-frequent imputation, then one-hot encoding
      with handle_unknown='ignore'.

    Returns the (unfitted) ColumnTransformer combining both branches.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Each branch is applied only to its own list of columns.
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])

In [None]:
# Function to train a model using the pipeline
def train_model(df, target_column, model_type="random_forest", param_grid=None):
    """
    Trains a machine learning model with preprocessing pipeline.

    Rows whose target value is missing are dropped before splitting, because
    the regressors cannot be fitted or scored against NaN targets. The
    remaining rows are split 80/20 (random_state=42); the model is fitted on
    the training part and evaluated on the held-out part.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column to predict.
        model_type: One of "random_forest", "gradient_boosting", "xgboost",
            "linear_regression".
        param_grid: Optional GridSearchCV parameter grid for the full pipeline
            (keys are pipeline parameter names such as "model__<param>"). When
            given, a 5-fold grid search scored by R2 selects the pipeline.

    Returns:
        (fitted_pipeline, metrics) where metrics is a dict with the keys
        "MAE", "MSE" and "R2 Score" computed on the test split.

    Raises:
        ValueError: If model_type is not one of the supported names.
    """
    # Define model choices
    models = {
        "random_forest": RandomForestRegressor(random_state=42),
        "gradient_boosting": GradientBoostingRegressor(random_state=42),
        "xgboost": XGBRegressor(random_state=42),
        "linear_regression": LinearRegression(),
    }

    model = models.get(model_type)
    # Compare against None instead of truth-testing: sklearn ensembles define
    # __len__ over their fitted sub-estimators, so `not model` on an unfitted
    # estimator is not a reliable "missing" check.
    if model is None:
        raise ValueError("Unsupported model type")

    # Rows without a target cannot be used for training or evaluation.
    labelled = df.dropna(subset=[target_column])
    X = labelled.drop(columns=[target_column])
    y = labelled[target_column]

    # Identify feature columns from X (target already removed), so the target
    # can never leak into the feature lists and its dtype does not matter.
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Build the preprocessing pipeline
    preprocessor = build_pipeline(numeric_features, categorical_features)

    # Create a full pipeline with preprocessing and model
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Hyperparameter tuning (if needed)
    if param_grid:
        grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring="r2", n_jobs=-1)
        grid_search.fit(X_train, y_train)
        # GridSearchCV fits clones and leaves the pipeline passed in unfitted;
        # the refitted winner is what must be used for prediction and returned.
        full_pipeline = grid_search.best_estimator_
    else:
        full_pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = full_pipeline.predict(X_test)

    # Model evaluation
    metrics = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

    return full_pipeline, metrics

In [None]:
# Example Usage (After Uploading the Dataset)
file_path = "your_dataset.csv"  # Replace with actual file name

# The placeholder path above does not exist until it is replaced, so report
# that instead of letting a FileNotFoundError abort a "Run All".
try:
    # Load dataset
    df = load_data(file_path)
except FileNotFoundError:
    print(f"Dataset '{file_path}' not found - set file_path to a real CSV file to run this example.")
else:
    # Summarize data
    summarize_data(df)

    # Check missing values
    missing_summary = check_missing_values(df)
    print(missing_summary)

    # Define target column (Replace with actual column name)
    target_column = "target_variable"

    # Train model (example: Random Forest)
    rf_pipeline, rf_metrics = train_model(df, target_column, model_type="random_forest")

    # Display results
    print("Model Performance:", rf_metrics)

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

## **Climate Merged Dataset**

In [None]:
# Load the merged climate dataset (path is relative to the working directory)
file_path = "climate_data_final_df.csv"
df_climate = load_data(file_path)

# Print shape, per-column info and summary statistics.
# NOTE(review): the recorded output lists an "Unnamed: 0" int64 column -
# presumably a row index written out when the CSV was saved; confirm whether
# it should be kept, since it would otherwise be treated as a numeric feature.
summarize_data(df_climate)

Shape: (6323, 29)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 29 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   Unnamed: 0                                                          6323 non-null   int64  
 1   Entity                                                              6323 non-null   object 
 2   Year                                                                6323 non-null   int64  
 3   Carbon dioxide emissions from buildings                             6323 non-null   float64
 4   Carbon dioxide emissions from industry                              6323 non-null   float64
 5   Carbon dioxide emissions from land use change and forestry          6323 non-null   float64
 6   Carbon dioxide emissions from other fuel combustion                 6323 non-null   flo

In [None]:
# Check missing values per column (count and percent of rows), most-missing first.
# In the recorded output "Average Temperature" has the most gaps (796 rows,
# ~12.6%), followed by forest area and renewable energy consumption.
missing_summary = check_missing_values(df_climate)
print(missing_summary)

                                                    Missing Values    Percent
Average Temperature                                            796  12.588961
Forest area (% of land area)                                   130   2.055986
Renewable energy consumption (% of total final ...              59   0.933101
Entity                                                           0   0.000000
Unnamed: 0                                                       0   0.000000
Carbon dioxide emissions from land use change a...               0   0.000000
Carbon dioxide emissions from other fuel combus...               0   0.000000
Carbon dioxide emissions from transport                          0   0.000000
Carbon dioxide emissions from manufacturing and...               0   0.000000
Fugitive emissions of carbon dioxide from energ...               0   0.000000
Year                                                             0   0.000000
Carbon dioxide emissions from buildings                         

### **Temperature Prediction**

In [None]:
target_column = "Average Temperature"