# Ensemble Learning

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Load the raw energy table; the relative path is resolved against the
# kernel's current working directory.
csv_path = "World Energy Consumption.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
1,ASEAN (Ember),2001,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
2,ASEAN (Ember),2002,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
3,ASEAN (Ember),2003,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
4,ASEAN (Ember),2004,,,,,,,,,...,0.000,,,,,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22007,Zimbabwe,2018,ZWE,15052191.0,2.271535e+10,,,,,25.910,...,0.218,,,,,0.0,0.0,,0.0,
22008,Zimbabwe,2019,ZWE,15354606.0,,,,,,24.748,...,0.364,,,,,0.0,0.0,,0.0,
22009,Zimbabwe,2020,ZWE,15669663.0,,,,,,22.336,...,0.395,,,,,0.0,0.0,,0.0,
22010,Zimbabwe,2021,ZWE,15993525.0,,,,,,23.760,...,0.498,,,,,0.0,0.0,,0.0,


In [3]:
# Build the modelling table: rows for the year 2000 only, restricted to
# population, GDP and a summed electricity figure, with incomplete rows dropped.
#
# NOTE(review): `electricity_generation` may already be a total that includes
# the biofuel/fossil/gas/hydro columns, in which case this sum double-counts.
# Confirm against the dataset codebook before interpreting the magnitude.
#
# This cell rebinds `df` to the reduced table, so the load cell must be re-run
# before this cell can be run again (the source columns are gone afterwards).
df["Total Electricity"] = (
    df["biofuel_electricity"] +
    df["electricity_generation"] +
    df["fossil_electricity"] +
    df["gas_electricity"] +
    df["hydro_electricity"]
)

# Select rows and columns in one `.loc` call and take an explicit copy, so the
# result is an independent frame. The previous in-place `dropna` on a chained
# selection is the pattern that triggers pandas' SettingWithCopyWarning, both
# here and when a column is assigned to `df` later on.
df = (
    df.loc[df["year"] == 2000, ["population", "gdp", "Total Electricity"]]
    .dropna()
    .copy()
)
df

Unnamed: 0,population,gdp,Total Electricity
123,1.954299e+07,1.128379e+10,0.94
566,3.182027e+06,1.521426e+10,9.38
688,3.077462e+07,2.085541e+11,69.45
853,1.639407e+07,2.351202e+10,2.80
1060,3.707077e+07,5.364810e+11,210.45
...,...,...,...
21296,7.900114e+07,2.196489e+11,57.48
21609,6.148899e+09,5.986666e+13,30032.65
21652,1.862870e+07,7.775042e+10,6.42
21867,9.891140e+06,1.496412e+10,15.52


In [4]:
# Binary target: 1 when a row's total electricity is at or above the sample
# mean, 0 when it is below.
# The threshold is the arithmetic mean, so the variable is named accordingly
# (it was previously called "median" while holding the mean). A mean split does
# not guarantee equally sized classes.
mean_electricity = df["Total Electricity"].mean()

# Vectorised comparison; gives the same 0/1 labels as the former row-wise
# `apply`, because no NaNs remain in this column after the earlier `dropna`.
df["class"] = (df["Total Electricity"] >= mean_electricity).astype("int64")
df

Unnamed: 0,population,gdp,Total Electricity,class
123,1.954299e+07,1.128379e+10,0.94,0
566,3.182027e+06,1.521426e+10,9.38,0
688,3.077462e+07,2.085541e+11,69.45,0
853,1.639407e+07,2.351202e+10,2.80,0
1060,3.707077e+07,5.364810e+11,210.45,0
...,...,...,...,...
21296,7.900114e+07,2.196489e+11,57.48,0
21609,6.148899e+09,5.986666e+13,30032.65,1
21652,1.862870e+07,7.775042e+10,6.42,0
21867,9.891140e+06,1.496412e+10,15.52,0


In [5]:
from sklearn.model_selection import train_test_split

# Features are the two raw columns; the target is the 0/1 label column.
X, y = df[["population", "gdp"]], df["class"]

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: feature/label shapes for the train split, then the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(130, 2) (130,)
(33, 2) (33,)


In [7]:
from sklearn.ensemble import RandomForestClassifier as RFS

# Random forest with 100 trees and a fixed seed, fitted on the training split.
forest_settings = {"n_estimators": 100, "random_state": 42}
model = RFS(**forest_settings)

model.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Score the current `model` on the held-out split: headline accuracy first,
# then the per-class precision/recall table on the following lines.
y_pred = model.predict(X_test)
print(
    f"Bagging Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%",
    classification_report(y_test, y_pred),
    sep="\n",
)

Bagging Accuracy: 84.85%
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        23
           1       1.00      0.50      0.67        10

    accuracy                           0.85        33
   macro avg       0.91      0.75      0.78        33
weighted avg       0.88      0.85      0.83        33



In [13]:
# Boosting
from xgboost import XGBClassifier

# Fit an XGBoost classifier on the training split (rebinds `model`).
booster_settings = dict(n_estimators=100, learning_rate=0.1, random_state=42)
model = XGBClassifier(**booster_settings)
model.fit(X_train, y_train)

# Evaluate on the held-out split: accuracy line, then the per-class report.
y_pred = model.predict(X_test)
print(
    f"Boosting Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%",
    classification_report(y_test, y_pred),
    sep="\n",
)

Boosting Accuracy: 93.94%
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        23
           1       1.00      0.80      0.89        10

    accuracy                           0.94        33
   macro avg       0.96      0.90      0.92        33
weighted avg       0.94      0.94      0.94        33



In [15]:
# Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier

# Base learners whose outputs are combined by the final estimator.
base_models = [
    ('knn', KNN(n_neighbors=5)),
    ('svm', SVC(kernel='linear')),
    # Seeded like the other stochastic estimators in this notebook: an unseeded
    # tree can break ties between equally good splits differently from run to
    # run, which would make the stacked result non-reproducible.
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=42))
]

# Final estimator that learns how to combine the base learners' outputs.
meta_model = LogisticRegression()

# Fit the stacked ensemble on the training split (rebinds `model`).
model = StackingClassifier(estimators=base_models, final_estimator=meta_model)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print(f"Stacking Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(classification_report(y_test, y_pred))

Stacking Accuracy: 84.85%
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        23
           1       1.00      0.50      0.67        10

    accuracy                           0.85        33
   macro avg       0.91      0.75      0.78        33
weighted avg       0.88      0.85      0.83        33

