In [None]:
# ESG Analysis Notebook
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load Data
# Read the collected dataset from its CSV file into a DataFrame.
esg_data = pd.read_csv(filepath_or_buffer="../data/collected_data.csv")
print("Data loaded successfully.")

# EDA
# Each report prints a header line followed by the table on the next line.
print("Data Overview:", esg_data.head(), sep="\n")

print("\nSummary Statistics:", esg_data.describe(), sep="\n")

# Check for Missing Values
# Per-column count of null entries.
print("\nMissing Values:", esg_data.isna().sum(), sep="\n")

# Correlation Analysis
# Only numeric columns are correlated. The raw frame may still contain
# categorical (string) columns at this point -- they are one-hot encoded
# further down -- and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently dropping it.
correlation = esg_data.select_dtypes(include="number").corr()
plt.figure(figsize=(10, 8))
# annot=True writes each coefficient into its heatmap cell.
sns.heatmap(correlation, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# Feature Engineering
print("\nFeature Engineering: One-hot encoding categorical features.")
# drop_first=True omits the first level of each encoded column.
esg_data_encoded = pd.get_dummies(data=esg_data, drop_first=True)

# Define Features and Target
# The target is the "ESG Score" column; every other encoded column is a feature.
y = esg_data_encoded.loc[:, "ESG Score"]
X = esg_data_encoded.drop(columns="ESG Score")

# Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"\nTraining Data: {X_train.shape}, Testing Data: {X_test.shape}")

# Model Training
print("\nTraining Random Forest Regressor...")
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Model Evaluation
# Score the fitted model on the held-out test rows.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

# Save the Model
# open(..., "wb") does not create missing parent directories, so make sure
# the output directory exists first; otherwise the trained model would be
# lost to a FileNotFoundError when ../models is absent.
model_path = "../models/esg_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(rf_model, f)
print("Model saved as 'esg_model.pkl'")
