## 1 Import Libraries

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
import matplotlib.pyplot as plt

In [9]:
df = pd.read_excel('GreenFinanceData.xlsx', sheet_name = 'Sheet1')

## 2 Initial Model Training process

In [33]:
# 1. Load the Dataset
# Adjust this path if needed for your environment
df = pd.read_excel('GreenFinanceData.xlsx', sheet_name = 'Sheet1')

# 2. Separate Features & Target
target_col = "ESG_Score"

# Numeric Features (excluding target and text)
numeric_cols = [
    "Budget(USD_millions)",
    "EstimatedROI(%)",
    "EmissionsReductionPotential(tonsCO2eq/year)",
    "WaterConservationPotential(millionLiters/year)",
    "RiskFactor(1-5)",
    "LocalRainfall(mm/year)",
    "LocalAvgTemperature(C)",
    "GDPGrowthRate(%)"
]

# Text Column to apply NLP on
text_col = "ProjectDescription"

X = df[numeric_cols + [text_col]]
y = df[target_col]

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=42
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("text", TfidfVectorizer(), text_col)
    ]
)

# 4. Build pipeline: preprocessing + LGBM Regressor
model = make_pipeline(
    preprocessor,
    lgb.LGBMRegressor(random_state=42)
)

# 5. Train the model
model.fit(X_train, y_train)

# 6. Evaluate on test set
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("LightGBM Regressor (with NLP on ProjectDescription) Results:")
print(f" - MAE (Mean Absolute Error): {mae:.3f}")
print(f" - R^2 Score: {r2:.3f}")

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 7, number of used features: 0
[LightGBM] [Info] Start training from score 7.571429
LightGBM Regressor (with NLP on ProjectDescription) Results:
 - MAE (Mean Absolute Error): 1.190
 - R^2 Score: -0.036


In [None]:
lgbm_step = model.named_steps["lgbmregressor"]

importances = lgbm_step.feature_importances_
print(f"\nTotal number of features: {importances.shape[0]}")

# -------------------------------------------------------------------
#   OPTIONAL: MAPPING FEATURE IMPORTANCES BACK TO NAMES
# -------------------------------------------------------------------
# The final feature set is a concatenation of:
#   - Scaled numeric columns
#   - TF-IDF columns
#
# We'll collect their names so we can see which numeric columns
# or TF-IDF tokens are deemed most important.

# 1) Numeric feature names come directly from numeric_cols
numeric_feature_names = numeric_cols

# 2) TF-IDF feature names:
fitted_ct = model.named_steps["columntransformer"]      # The ColumnTransformer
tfidf     = fitted_ct.named_transformers_["text"]       # The TfidfVectorizer
text_feature_names = tfidf.get_feature_names_out()      # e.g., array(['affordable', 'agricultural', 'aimed', ...])

# Combine them in the same order the ColumnTransformer produces
all_feature_names = list(numeric_feature_names) + list(text_feature_names)

# Now, create a mapping of feature -> importance
feature_importance_pairs = sorted(
    zip(all_feature_names, importances),
    key=lambda x: x[1],
    reverse=True
)

# Show top 10 for illustration
print("\nTop 10 Most Important Features (Numeric + TF-IDF):")
for feat, imp in feature_importance_pairs[:10]:
    print(f"  {feat}: {imp}")

# -------------------------------------------------------------------
#   OPTIONAL: FEATURE SELECTION
# -------------------------------------------------------------------
# Suppose we only keep features with importance > 0. This is simplistic.
selected_pairs = [pair for pair in feature_importance_pairs if pair[1] > 0]
selected_features = [pair[0] for pair in selected_pairs]

# print(f"\nNumber of selected features with importance > 0: {len(selected_features)}")

## Train Model With Selected Features

In [None]:
X_train_fs = X_train[selected_features]
X_test_fs = X_test[selected_features]

lgbm_regressor_fs = lgb.LGBMRegressor(random_state=42)
lgbm_regressor_fs.fit(X_train_fs, y_train)

y_pred_fs = lgbm_regressor_fs.predict(X_test_fs)
mae_fs = mean_absolute_error(y_test, y_pred_fs)
r2_fs = r2_score(y_test, y_pred_fs)

In [None]:
print("\nModel Performance After Feature Selection:")
print(f"  - MAE: {mae_fs:.3f}")
print(f"  - R^2 : {r2_fs:.3f}")