In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

# Read the climate-risk CSV (path is relative to the notebook's working
# directory) into the frame used by every later cell, then preview it.
csv_path = "climate_risk_dataset.csv"
df = pd.read_csv(csv_path)
df.head()


ModuleNotFoundError: No module named 'pandas'

In [2]:

# Basic Info
print("Shape of dataset:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Distribution of numerical columns
df.hist(figsize=(12, 8), bins=30)
plt.tight_layout()
plt.show()

# Correlation Heatmap
# This cell runs before any categorical encoding, so the frame may still hold
# string columns. Recent pandas versions no longer drop those silently in
# DataFrame.corr() and raise instead, so correlate numeric/bool columns only.
numeric_df = df.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(10,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


NameError: name 'df' is not defined

In [None]:

# Label column consumed by the feature-selection cell. It is named here so the
# preprocessing below can leave it untouched; keep it in sync with the
# `target_col` assignment in the feature-selection cell.
target_col = "Fire"

# Rows without a label cannot be used for supervised learning, and imputing a
# class label with a mean/mode would invent labels, so drop them up front.
# .copy() gives an independent frame so the column assignments below do not
# trigger chained-assignment warnings.
if target_col in df.columns:
    df = df.dropna(subset=[target_col]).copy()

# Handle missing values (fill with mean for numerical, mode for categorical)
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].mean())

# Encode categorical variables (each column is refit, so after the loop
# `label_enc` only holds the mapping of the last encoded column)
label_enc = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = label_enc.fit_transform(df[col])

# Standardize numerical features, but never the target: scaling would turn the
# class labels into continuous floats, which classifiers reject as a
# regression-style target.
scaler = StandardScaler()
num_cols = df.select_dtypes(include=np.number).columns.drop([target_col], errors="ignore")
if len(num_cols) > 0:  # StandardScaler cannot be fit on zero columns
    df[num_cols] = scaler.fit_transform(df[num_cols])

df.head()


In [3]:

# Assume target variable is 'Fire' (change if dataset column differs)
target_col = "Fire"
X = df.drop(columns=[target_col], errors="ignore")
y = df[target_col] if target_col in df.columns else None

if y is not None:
    # Method 1: Correlation check — correlation of every column with the target
    corr = df.corr()[target_col].sort_values(ascending=False)
    print("Correlation with target:\n", corr)

    # Method 2: SelectKBest (ANOVA F-score)
    # Clamp k to the number of available features so that datasets with fewer
    # than five feature columns are still handled.
    k = min(5, X.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    print(f"\nTop {k} Features (SelectKBest):", selected_features.tolist())

    # Method 3: Feature Importance using RandomForest
    # (fixed random_state keeps the ranking reproducible between runs)
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X, y)
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    # Sort once and reuse the same top-10 slice for both the printout and the plot
    top_importances = importances.sort_values(ascending=False).head(10)
    print("\nRandomForest Feature Importances:\n", top_importances)

    # Plot feature importance
    top_importances.plot(kind="barh", figsize=(8,5))
    plt.title("Top 10 Important Features")
    plt.show()
else:
    # Report the column name that was actually looked up, not a hard-coded one
    print(f"⚠️ Target column '{target_col}' not found. Please adjust target_col variable.")


NameError: name 'df' is not defined