In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import seaborn as sns

In [1]:
# Colab-only module that exposes Google Drive to the notebook runtime
from google.colab import drive

# Mount Drive at /content/drive; the dataset and output paths used later live under this mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the full dataset from the mounted Drive folder.
# The file is semicolon-delimited and uses a comma as the decimal mark.
df = pd.read_csv(
    "/content/drive/MyDrive/DATA_SCIENCE_PROJECT/data_v2_full.csv",
    sep=';',
    decimal=",",
)

In [4]:
# Set style for better-looking plots.
# The seaborn style sheets were renamed across matplotlib releases, so try the
# newer name first, then the older one, and fall back to the built-in 'ggplot'.
# matplotlib raises OSError for an unknown style name; catching only that keeps
# unrelated failures (and KeyboardInterrupt) visible instead of swallowing them.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    try:
        plt.style.use(_style_name)
    except OSError:
        continue
    break
else:
    # Neither seaborn style sheet is available in this matplotlib version
    plt.style.use('ggplot')
sns.set_palette("husl")

In [8]:
# Features kept for the final model; used when the caller passes no `selected_features`
_DEFAULT_SELECTED_FEATURES = (
    'day_of_year', 'apparent_temperature_mean',
    'temperature_2m_mean', 'apparent_temperature_max',
    'temperature_2m_max', 'sunrise',
)


def _save_and_close(fig, output_dir, filename):
    """Write ``fig`` to ``output_dir/filename`` as a 300-dpi PNG, close it, and return the path."""
    path = f'{output_dir}/{filename}'
    fig.tight_layout()
    fig.savefig(path, dpi=300, bbox_inches='tight')
    # Close this specific figure so repeated calls do not accumulate open figures
    plt.close(fig)
    return path


def generate_all_plots(df, output_dir='/content/drive/MyDrive/DATA_SCIENCE_PROJECT',
                       selected_features=None, top_n=10):
    """
    Generate all plots needed for the presentation.

    Fits a RandomForestRegressor that predicts ``slp`` from every column except
    ``entry``, ``rlm`` and ``slp`` (with ``date`` replaced by its year), then
    saves five PNG figures: MDI importances, permutation importances, a
    side-by-side top-N comparison, the importances of the selected features,
    and a feature-count reduction summary.

    The input DataFrame is NOT modified; preprocessing happens on a derived copy.

    Parameters:
    df: pandas DataFrame with your data. Must contain the columns ``date``,
        ``day_of_year``, ``entry``, ``rlm`` and ``slp``.
    output_dir: directory to save plots (created if it does not exist)
    selected_features: iterable of feature names shown in plots 4 and 5.
        Defaults to the six features chosen for the final model.
    top_n: number of top-ranked features shown per method in plot 3.

    Returns:
    None. Side effects: writes five PNG files to ``output_dir`` and prints a
    two-line summary.

    Raises:
    KeyError: if any name in ``selected_features`` is not a feature column.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    if selected_features is None:
        selected_features = _DEFAULT_SELECTED_FEATURES
    selected_features = list(selected_features)

    # Data preprocessing on a derived frame: `assign` returns a new DataFrame, so
    # the caller's `date` column keeps its original values. Overwriting `date`
    # and then renaming it keeps the column in its original position.
    data = (
        df.assign(date=pd.to_datetime(df['date']).dt.year)
          .rename(columns={"date": "year"})
          .sort_values(by=["year", "day_of_year"])
    )

    X = data.drop(columns=["entry", "rlm", "slp"])
    y = data["slp"]
    feature_names = X.columns

    # Fail fast, before the expensive fit, if a requested feature does not exist
    missing = [name for name in selected_features if name not in feature_names]
    if missing:
        raise KeyError(f"Selected features not found in the data: {missing}")

    # Train-test split: with shuffle=False the rows keep their sorted order, so
    # the last 20% (by year, day_of_year) form the test set and no seed is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train Random Forest
    forest = RandomForestRegressor(random_state=1)
    forest.fit(X_train, y_train)

    saved_paths = []

    # ===== PLOT 1: MDI Feature Importance with Error Bars =====
    important_features = pd.Series(data=forest.feature_importances_, index=feature_names)
    important_features = important_features.sort_values(ascending=False)

    # Spread of each feature's importance across the individual trees,
    # re-ordered to match the sorted importances
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    std_series = pd.Series(std, index=feature_names).reindex(important_features.index)

    fig, ax = plt.subplots(figsize=(10, 8))
    important_features.plot.barh(xerr=std_series, ax=ax, color='steelblue', capsize=3)
    ax.set_title("Feature Importances using Mean Decrease in Impurity (MDI)",
                 fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel("Mean Decrease in Impurity", fontsize=12)
    ax.set_ylabel("Features", fontsize=12)
    ax.grid(axis='x', alpha=0.3)
    saved_paths.append(_save_and_close(fig, output_dir, 'plot1_mdi_importance.png'))

    # ===== PLOT 2: Permutation Feature Importance =====
    result = permutation_importance(forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    forest_importances = pd.Series(result.importances_mean, index=feature_names)
    forest_importances = forest_importances.sort_values(ascending=False)
    std_perm = pd.Series(result.importances_std, index=feature_names)
    std_perm = std_perm.reindex(forest_importances.index)

    fig, ax = plt.subplots(figsize=(10, 8))
    forest_importances.plot.barh(xerr=std_perm, ax=ax, color='coral', capsize=3)
    ax.set_title("Feature Importances using Permutation Importance",
                 fontsize=16, fontweight='bold', pad=20)
    # No `scoring` is passed, so the regressor's own score (R^2) is what gets
    # permuted against; "accuracy" would be the wrong label for a regressor.
    ax.set_xlabel("Mean Decrease in $R^2$ Score", fontsize=12)
    ax.set_ylabel("Features", fontsize=12)
    ax.grid(axis='x', alpha=0.3)
    saved_paths.append(_save_and_close(fig, output_dir, 'plot2_permutation_importance.png'))

    # ===== PLOT 3: Side-by-Side Comparison of MDI vs Permutation =====
    # Top-N features from each method, each shown on its own scale
    top_mdi = important_features.head(top_n)
    top_perm = forest_importances.head(top_n)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # MDI plot
    top_mdi.plot.barh(ax=ax1, color='steelblue')
    ax1.set_title(f"MDI Importance (Top {top_n})", fontsize=14, fontweight='bold')
    ax1.set_xlabel("Importance Score", fontsize=11)
    ax1.grid(axis='x', alpha=0.3)

    # Permutation plot
    top_perm.plot.barh(ax=ax2, color='coral')
    ax2.set_title(f"Permutation Importance (Top {top_n})", fontsize=14, fontweight='bold')
    ax2.set_xlabel("Importance Score", fontsize=11)
    ax2.grid(axis='x', alpha=0.3)

    fig.suptitle("Comparison: MDI vs Permutation Feature Importance",
                 fontsize=16, fontweight='bold', y=1.02)
    saved_paths.append(_save_and_close(fig, output_dir, 'plot3_comparison.png'))

    # ===== PLOT 4: Feature Selection Summary (Selected Features) =====
    # Importance values for the selected features, in the order they were given
    selected_mdi = important_features[selected_features]
    selected_perm = forest_importances[selected_features]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    selected_mdi.plot.bar(ax=ax1, color='steelblue', rot=45)
    ax1.set_title("Selected Features - MDI Importance", fontsize=13, fontweight='bold')
    ax1.set_ylabel("Importance Score", fontsize=11)
    ax1.grid(axis='y', alpha=0.3)

    selected_perm.plot.bar(ax=ax2, color='coral', rot=45)
    ax2.set_title("Selected Features - Permutation Importance", fontsize=13, fontweight='bold')
    ax2.set_ylabel("Importance Score", fontsize=11)
    ax2.grid(axis='y', alpha=0.3)

    fig.suptitle("Final Selected Features for Model", fontsize=15, fontweight='bold')
    saved_paths.append(_save_and_close(fig, output_dir, 'plot4_selected_features.png'))

    # ===== PLOT 5: Feature Reduction Impact =====
    original_count = len(feature_names)
    selected_count = len(selected_features)
    reduction_pct = ((original_count - selected_count) / original_count) * 100

    fig, ax = plt.subplots(figsize=(8, 6))
    categories = ['Original Features', 'Selected Features']
    counts = [original_count, selected_count]
    colors = ['lightcoral', 'lightgreen']

    bars = ax.bar(categories, counts, color=colors, edgecolor='black', linewidth=2)
    ax.set_ylabel('Number of Features', fontsize=12)
    # \u2192 is a right arrow; written as an escape so the title survives
    # re-encoding of the source file
    ax.set_title(f'Feature Reduction: {reduction_pct:.1f}% Reduction\n'
                 f'({original_count} \u2192 {selected_count} features)',
                 fontsize=14, fontweight='bold', pad=20)
    ax.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom', fontsize=12, fontweight='bold')

    saved_paths.append(_save_and_close(fig, output_dir, 'plot5_feature_reduction.png'))

    print(f"All plots saved to '{output_dir}' directory!")
    print(f"Generated {len(saved_paths)} plots for presentation.")


In [9]:
# Fit the model on the loaded data and write the presentation plots to the function's default output_dir
generate_all_plots(df)

All plots saved to '/content/drive/MyDrive/DATA_SCIENCE_PROJECT' directory!
Generated 5 plots for presentation.


In [10]:
# Columns carried into the reduced dataset: the date, the `slp` column,
# and the features retained after the importance analysis.
cols = [
    'date', 'day_of_year', 'slp',
    'apparent_temperature_mean', 'temperature_2m_mean',
    'apparent_temperature_max', 'temperature_2m_max',
    'sunrise',
]

# Select exactly these columns, in this order
df_reduced = df.loc[:, cols]

In [None]:
import os

# Destination for the reduced dataset on the mounted Drive
folder_name = '/content/drive/MyDrive/DATA_SCIENCE_PROJECT'
file_name = 'df_reduced.csv'

os.makedirs(folder_name, exist_ok=True)
full_path = os.path.join(folder_name, file_name)
# index=True also writes the DataFrame index as the first column.
# NOTE(review): the source CSV was read with decimal=","; this export uses the
# default "." decimal mark - confirm downstream readers expect that.
df_reduced.to_csv(full_path, index=True, sep=';')
# Build the message from file_name so it always names the file actually written
print(f"File '{file_name}' saved in: {full_path}")