# Steel Industry Energy Consumption Prediction
## AI-Powered Energy Analytics for Sustainable Manufacturing

**Author:** [Your Name]  
**Student ID:** [Your Student ID]  
**Module:** M516 Business Project in Big Data & AI  
**Date:** December 2025

---

### Project Overview

This project develops an AI-based system for predicting energy consumption in steel manufacturing, supporting sustainability goals through:

1. **Energy Consumption Prediction** - Regression models to forecast kWh usage
2. **Load Type Classification** - Classify operational load (Light/Medium/Maximum)
3. **Sustainability Insights** - CO2 emission analysis and optimization recommendations

---
## Step 1: Import Libraries and Setup

In [None]:
# Standard Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             accuracy_score, classification_report, confusion_matrix)

# Create the output folders up front. Later cells write figures to
# results/figures/ and CSVs to results/; neither plt.savefig nor
# DataFrame.to_csv creates missing parent directories, so without this a
# fresh checkout fails at the first savefig call.
import os
os.makedirs('results/figures', exist_ok=True)

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("All libraries imported successfully!")

---
## Step 2: Load Dataset

In [None]:
# Read the raw steel-industry CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='data/Steel_industry_data.csv')

# Report the (rows, columns) shape, then let the cell display the first rows
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
df.head(5)

In [None]:
# Column names, dtypes and non-null counts, under a heading and a rule line
print("Dataset Information:", "="*50, sep="\n")
df.info()

In [None]:
# Count nulls per column; list only the affected columns, or say there are none
print("Missing Values:", "="*50, sep="\n")
missing = df.isna().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values found!")

---
## Step 3: Exploratory Data Analysis (EDA)

In [None]:
# Descriptive statistics, transposed so each feature is a row
print("Statistical Summary:", "="*70, sep="\n")
df.describe().transpose()

In [None]:
# Target variable overview: histogram and box plot of Usage_kWh side by side
usage = df['Usage_kWh']
usage_mean = usage.mean()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram, with the mean marked by a dashed red line
hist_ax.hist(usage, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
hist_ax.set_title('Distribution of Energy Consumption', fontsize=14)
hist_ax.set_xlabel('Energy Consumption (kWh)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.axvline(usage_mean, label=f'Mean: {usage_mean:.2f}', linestyle='--', color='red')
hist_ax.legend()

# Right panel: vertical box plot of the same series
box_ax.boxplot(usage, vert=True)
box_ax.set_title('Box Plot of Energy Consumption', fontsize=14)
box_ax.set_ylabel('Energy Consumption (kWh)', fontsize=12)

plt.tight_layout()
plt.savefig('results/figures/energy_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# How many records fall into each Load_Type category?
load_counts = df['Load_Type'].value_counts()
colors = ['#2ecc71', '#f39c12', '#e74c3c']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(load_counts.index, load_counts.values, color=colors, edgecolor='black')

ax.set_title('Distribution of Load Types', fontsize=14)
ax.set_xlabel('Load Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Write each count just above its bar (fixed offset of 200 in data units)
for bar, val in zip(bars, load_counts.values):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 200
    ax.text(label_x, label_y, f'{val:,}', ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.savefig('results/figures/load_type_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Same counts expressed as a share of all rows, in percent
load_pct = (load_counts / len(df) * 100).round(2)
print("\nLoad Type Percentages:")
print(load_pct)

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation = df[numeric_cols].corr()

# Boolean mask over the upper triangle (diagonal included) so each pair
# is shown only once
mask = np.triu(np.ones(correlation.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    ax=ax,
    square=True,
    linewidths=0.5,
)
ax.set_title('Correlation Matrix - Steel Industry Energy Data', fontsize=14)

plt.tight_layout()
plt.savefig('results/figures/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Usage_kWh split by Load_Type: box plot (left) and violin plot (right)
load_order = ['Light_Load', 'Medium_Load', 'Maximum_Load']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both panels share data, ordering, palette and axis labels; only the
# seaborn plotting function and the title differ.
panels = [
    (sns.boxplot, 'Energy Consumption by Load Type'),
    (sns.violinplot, 'Energy Consumption Distribution by Load Type'),
]
for panel_ax, (draw, panel_title) in zip(axes, panels):
    draw(x='Load_Type', y='Usage_kWh', data=df, order=load_order, ax=panel_ax, palette='Set2')
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Load Type', fontsize=12)
    panel_ax.set_ylabel('Energy Consumption (kWh)', fontsize=12)

plt.tight_layout()
plt.savefig('results/figures/energy_by_load_type.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Temporal patterns. Adds an integer 'Hour' column to df (NSM / 3600,
# truncated), then draws a 2x2 grid: usage by hour, by weekday, by week
# status, and usage against CO2.
df['Hour'] = (df['NSM'] / 3600).astype(int)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hour, ax_day), (ax_week, ax_co2) = axes

# Top-left: mean usage for each hour of the day
hourly_avg = df.groupby('Hour')['Usage_kWh'].mean()
ax_hour.plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2, color='steelblue')
ax_hour.fill_between(hourly_avg.index, hourly_avg.values, alpha=0.3)
ax_hour.set_title('Average Energy Consumption by Hour', fontsize=14)
ax_hour.set_xlabel('Hour of Day', fontsize=12)
ax_hour.set_ylabel('Average Energy (kWh)', fontsize=12)
ax_hour.set_xticks(range(0, 24, 2))

# Top-right: mean usage per weekday, reindexed into calendar order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_avg = df.groupby('Day_of_week')['Usage_kWh'].mean().reindex(day_order)
ax_day.bar(daily_avg.index, daily_avg.values, color='steelblue', edgecolor='black')
ax_day.set_title('Average Energy Consumption by Day', fontsize=14)
ax_day.set_xlabel('Day of Week', fontsize=12)
ax_day.set_ylabel('Average Energy (kWh)', fontsize=12)
ax_day.tick_params(axis='x', rotation=45)

# Bottom-left: mean usage per WeekStatus group
week_status_avg = df.groupby('WeekStatus')['Usage_kWh'].mean()
ax_week.bar(
    week_status_avg.index,
    week_status_avg.values,
    color=['coral', 'lightgreen'],
    edgecolor='black',
)
ax_week.set_title('Energy Consumption: Weekday vs Weekend', fontsize=14)
ax_week.set_xlabel('Week Status', fontsize=12)
ax_week.set_ylabel('Average Energy (kWh)', fontsize=12)

# Bottom-right: usage vs CO2 on a reproducible sample of at most 2000 rows
sample = df.sample(min(2000, len(df)), random_state=42)
ax_co2.scatter(sample['Usage_kWh'], sample['CO2(tCO2)'], alpha=0.5, s=20, c='green')
ax_co2.set_title('Energy Consumption vs CO2 Emissions', fontsize=14)
ax_co2.set_xlabel('Energy Consumption (kWh)', fontsize=12)
ax_co2.set_ylabel('CO2 Emissions (tCO2)', fontsize=12)

plt.tight_layout()
plt.savefig('results/figures/time_series_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Step 4: Data Preprocessing

In [None]:
# Feature Engineering
def preprocess_data(df):
    """Preprocess the dataset for modeling.

    Works on a copy of ``df``; the caller's frame is not modified.

    Columns added to the copy:
        Hour                 -- ``NSM / 3600`` truncated to an integer.
        Time_Period          -- 'Night' (0-5), 'Morning' (6-11),
                                'Afternoon' (12-17), otherwise 'Evening'.
        Is_Peak_Hour         -- 1 when 8 <= Hour < 18 and WeekStatus is
                                'Weekday', else 0.
        WeekStatus_encoded, Day_of_week_encoded, Time_Period_encoded,
        Load_Type_encoded    -- integer codes from a LabelEncoder fitted on
                                the corresponding column.

    Args:
        df: DataFrame with at least the columns 'NSM', 'WeekStatus',
            'Day_of_week' and 'Load_Type'.

    Returns:
        Tuple ``(processed_df, le_load)`` where ``le_load`` is the
        LabelEncoder fitted on 'Load_Type', so callers can map the encoded
        classes back to their labels.
    """
    df = df.copy()

    # Extract hour from NSM
    df['Hour'] = (df['NSM'] / 3600).astype(int)
    hour = df['Hour']

    # Create time period. Vectorised form of the 6-hour buckets; any hour
    # outside 0-17 falls through to 'Evening'.
    df['Time_Period'] = np.select(
        [hour.between(0, 5), hour.between(6, 11), hour.between(12, 17)],
        ['Night', 'Morning', 'Afternoon'],
        default='Evening',
    )

    # Is Peak Hour (8 AM - 6 PM on weekdays). The upper bound is exclusive:
    # Hour == 18 covers 18:00-18:59, which lies after 6 PM, so `<= 18` would
    # stretch the window by an hour and disagree with the Time_Period buckets.
    df['Is_Peak_Hour'] = ((hour >= 8) & (hour < 18) &
                          (df['WeekStatus'] == 'Weekday')).astype(int)

    # Encode categorical variables. Only the Load_Type encoder is returned,
    # so the other encoders are used in place and discarded.
    le_load = LabelEncoder()
    df['WeekStatus_encoded'] = LabelEncoder().fit_transform(df['WeekStatus'])
    df['Day_of_week_encoded'] = LabelEncoder().fit_transform(df['Day_of_week'])
    df['Time_Period_encoded'] = LabelEncoder().fit_transform(df['Time_Period'])
    df['Load_Type_encoded'] = le_load.fit_transform(df['Load_Type'])

    return df, le_load

# Run feature engineering on the raw frame. `load_encoder` is the fitted
# Load_Type encoder returned alongside the processed copy.
df_processed, load_encoder = preprocess_data(df)

print("Preprocessing completed!")
print("\nNew features created: Hour, Time_Period, Is_Peak_Hour")
print("Encoded columns: WeekStatus, Day_of_week, Time_Period, Load_Type")

In [None]:
# Predictors used by both tasks, in the column order the models are given
_shared_feature_cols = [
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'Lagging_Current_Power_Factor',
    'Leading_Current_Power_Factor',
    'NSM',
    'Hour',
    'Is_Peak_Hour',
    'WeekStatus_encoded',
    'Day_of_week_encoded',
    'Time_Period_encoded',
]

# Regression (target: Usage_kWh) also uses the encoded load type
feature_cols_regression = [*_shared_feature_cols, 'Load_Type_encoded']

# Classification (target: Load_Type) uses Usage_kWh as a predictor and
# leaves Load_Type itself out
feature_cols_classification = ['Usage_kWh', *_shared_feature_cols]

print(f"Regression Features: {len(feature_cols_regression)}")
print(f"Classification Features: {len(feature_cols_classification)}")

---
## Step 5: Model Training - Energy Consumption Prediction (Regression)

In [None]:
# Regression inputs: the selected feature columns, with Usage_kWh as target
y_reg = df_processed['Usage_kWh']
X_reg = df_processed[feature_cols_regression]

# 80/20 train/test split with a fixed seed so the split is reproducible
reg_split = train_test_split(X_reg, y_reg, random_state=42, test_size=0.2)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = reg_split

print(f"Training set: {len(X_train_reg)} samples")
print(f"Test set: {len(X_test_reg)} samples")

In [None]:
# Candidate regressors, keyed by the display name used in the results table
regression_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, verbosity=0)
}

# One metrics row per model, filled in by the loop below
reg_results = []

print("Training Regression Models...")
print("="*70)

for name, model in regression_models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, predict the held-out split
    model.fit(X_train_reg, y_train_reg)
    y_pred = model.predict(X_test_reg)

    # Error metrics on the test split; RMSE is derived from MSE
    r2 = r2_score(y_test_reg, y_pred)
    mae = mean_absolute_error(y_test_reg, y_pred)
    mse = mean_squared_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)

    reg_results.append(dict(
        Model=name,
        R2_Score=round(r2, 4),
        RMSE=round(rmse, 4),
        MAE=round(mae, 4),
        MSE=round(mse, 4),
    ))

    print(f"  R2 Score: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")

# Rank the models from best to worst R2 and display the table
reg_results_df = pd.DataFrame(reg_results).sort_values('R2_Score', ascending=False)
print("\n" + "="*70)
print("REGRESSION MODEL COMPARISON")
print("="*70)
reg_results_df

In [None]:
# Visualize Regression Results: one horizontal bar per model, showing its
# test-set R2 score from reg_results_df
fig, ax = plt.subplots(figsize=(10, 6))

colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(reg_results_df)))
bars = ax.barh(reg_results_df['Model'], reg_results_df['R2_Score'], color=colors, edgecolor='black')

# The superscript two was mis-encoded as 'Â²' (UTF-8 bytes read as cp1252);
# restored so the axis label and title render as intended.
ax.set_xlabel('R² Score', fontsize=12)
ax.set_title('Regression Model Comparison - R² Score', fontsize=14)

# Print each score just to the right of the end of its bar
for bar, val in zip(bars, reg_results_df['R2_Score']):
    ax.text(val + 0.01, bar.get_y() + bar.get_height()/2, f'{val:.4f}', va='center', fontsize=11)

plt.tight_layout()
plt.savefig('results/figures/regression_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Diagnostic plots for the XGBoost regressor.
# NOTE(review): XGBoost is picked by name here, not by looking up the top
# row of reg_results_df - confirm it really is the best performer.
best_model = regression_models['XGBoost']
y_pred_best = best_model.predict(X_test_reg)
residuals = y_test_reg - y_pred_best

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, resid_ax = axes

# Left: predicted vs actual, with the y = x reference line
actual_lo, actual_hi = y_test_reg.min(), y_test_reg.max()
scatter_ax.scatter(y_test_reg, y_pred_best, alpha=0.5, s=20, c='steelblue')
scatter_ax.plot(
    [actual_lo, actual_hi],
    [actual_lo, actual_hi],
    'r--',
    linewidth=2,
    label='Perfect Prediction',
)
scatter_ax.set_title('Actual vs Predicted - XGBoost', fontsize=14)
scatter_ax.set_xlabel('Actual Energy (kWh)', fontsize=12)
scatter_ax.set_ylabel('Predicted Energy (kWh)', fontsize=12)
scatter_ax.legend()

# Right: residuals (actual minus predicted) against the prediction
resid_ax.scatter(y_pred_best, residuals, alpha=0.5, s=20, c='green')
resid_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
resid_ax.set_title('Residual Plot - XGBoost', fontsize=14)
resid_ax.set_xlabel('Predicted Energy (kWh)', fontsize=12)
resid_ax.set_ylabel('Residuals', fontsize=12)

plt.tight_layout()
plt.savefig('results/figures/actual_vs_predicted.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Feature importances reported by the fitted XGBoost regressor, paired with
# the regression feature names and ranked from highest to lowest
importance = best_model.feature_importances_
feature_importance = dict(zip(feature_cols_regression, importance))
ranked_pairs = sorted(feature_importance.items(), key=lambda pair: pair[1], reverse=True)
sorted_importance = dict(ranked_pairs)

features = [feat for feat, _ in ranked_pairs]
importances = [imp for _, imp in ranked_pairs]
colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(features)))[::-1]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(features, importances, color=colors, edgecolor='black')
ax.set_title('Feature Importance - XGBoost Regressor', fontsize=14)
ax.set_xlabel('Importance', fontsize=12)
ax.invert_yaxis()  # puts the highest-ranked feature at the top

plt.tight_layout()
plt.savefig('results/figures/feature_importance_regression.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nTop 5 Most Important Features:")
for i, (feat, imp) in enumerate(ranked_pairs[:5], start=1):
    print(f"{i}. {feat}: {imp:.4f}")

---
## Step 6: Model Training - Load Type Classification

In [None]:
# Classification inputs: the selected feature columns, with the encoded
# Load_Type as target
y_clf = df_processed['Load_Type_encoded']
X_clf = df_processed[feature_cols_classification]

# 80/20 split with a fixed seed, stratified on the target so both splits
# keep the same class proportions
clf_split = train_test_split(X_clf, y_clf, stratify=y_clf, random_state=42, test_size=0.2)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = clf_split

print(f"Training set: {len(X_train_clf)} samples")
print(f"Test set: {len(X_test_clf)} samples")
print("\nClass distribution in test set:")
test_class_counts = pd.Series(y_test_clf).value_counts().sort_index()
print(test_class_counts)

In [None]:
# Candidate classifiers, keyed by the display name used in the results table
classification_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, verbosity=0, eval_metric='mlogloss')
}

# One accuracy row per model, filled in by the loop below
clf_results = []

print("Training Classification Models...")
print("="*70)

for name, model in classification_models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, predict the held-out split
    model.fit(X_train_clf, y_train_clf)
    y_pred = model.predict(X_test_clf)

    # Share of test rows whose predicted class matches the true class
    accuracy = accuracy_score(y_test_clf, y_pred)
    clf_results.append(dict(Model=name, Accuracy=round(accuracy, 4)))

    print(f"  Accuracy: {accuracy:.4f}")

# Rank the models from best to worst accuracy and display the table
clf_results_df = pd.DataFrame(clf_results).sort_values('Accuracy', ascending=False)
print("\n" + "="*70)
print("CLASSIFICATION MODEL COMPARISON")
print("="*70)
clf_results_df

In [None]:
# Confusion matrix and per-class report for the XGBoost classifier
# (chosen by name, like the regression diagnostics)
best_clf_model = classification_models['XGBoost']
y_pred_clf = best_clf_model.predict(X_test_clf)

# Original Load_Type labels, taken from the encoder fitted in preprocessing
class_names = load_encoder.classes_

# Counts of true class (rows) against predicted class (columns)
cm = confusion_matrix(y_test_clf, y_pred_clf)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('Confusion Matrix - XGBoost Classifier', fontsize=14)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)

plt.tight_layout()
plt.savefig('results/figures/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Text report of per-class metrics
print("\nClassification Report:")
print("="*60)
clf_report = classification_report(y_test_clf, y_pred_clf, target_names=class_names)
print(clf_report)

In [None]:
# One horizontal bar per classifier, showing its test-set accuracy
model_names = clf_results_df['Model']
accuracies = clf_results_df['Accuracy']
colors = plt.cm.plasma(np.linspace(0.2, 0.8, len(clf_results_df)))

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(model_names, accuracies, color=colors, edgecolor='black')

ax.set_title('Classification Model Comparison - Accuracy', fontsize=14)
ax.set_xlabel('Accuracy', fontsize=12)
ax.set_xlim(0, 1.1)  # leaves room to the right of the bars for the labels

# Print each accuracy just to the right of the end of its bar
for bar, val in zip(bars, accuracies):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax.text(val + 0.01, bar_mid_y, f'{val:.4f}', va='center', fontsize=11)

plt.tight_layout()
plt.savefig('results/figures/classification_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Step 7: Sustainability Insights - CO2 Analysis

In [None]:
# Numeric CO2 summary: correlation with usage, plus mean and sum per group
print("CO2 Emissions Analysis")
print("="*60)

# Correlation between the CO2 and Usage_kWh columns
co2_energy_corr = df['CO2(tCO2)'].corr(df['Usage_kWh'])
print(f"\nCorrelation between CO2 and Energy: {co2_energy_corr:.4f}")

# Mean and total CO2 for each load type and for each week status
co2_by_load = df.groupby('Load_Type')['CO2(tCO2)'].agg(['mean', 'sum']).round(4)
co2_by_week = df.groupby('WeekStatus')['CO2(tCO2)'].agg(['mean', 'sum']).round(4)

print("\nCO2 Emissions by Load Type:")
print(co2_by_load)

print("\nCO2 Emissions by Week Status:")
print(co2_by_week)

In [None]:
# CO2 charts: mean emissions per load type and per hour of day.
# Relies on `load_order` and on the df['Hour'] column, both created by
# earlier cells. Values are multiplied by 1000 for the kg axis labels.
co2_mean = df.groupby('Load_Type')['CO2(tCO2)'].mean().reindex(load_order)
hourly_co2 = df.groupby('Hour')['CO2(tCO2)'].mean() * 1000
colors = ['#2ecc71', '#f39c12', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
load_ax, hour_ax = axes

# Left: one bar per load type, in Light / Medium / Maximum order
load_ax.bar(co2_mean.index, co2_mean.values * 1000, color=colors, edgecolor='black')
load_ax.set_title('Average CO2 Emissions by Load Type', fontsize=14)
load_ax.set_xlabel('Load Type', fontsize=12)
load_ax.set_ylabel('Average CO2 Emissions (kg)', fontsize=12)

# Right: filled line chart of the hourly mean
hour_ax.fill_between(hourly_co2.index, hourly_co2.values, alpha=0.5, color='green')
hour_ax.plot(hourly_co2.index, hourly_co2.values, marker='o', linewidth=2, color='darkgreen')
hour_ax.set_title('Average CO2 Emissions by Hour', fontsize=14)
hour_ax.set_xlabel('Hour of Day', fontsize=12)
hour_ax.set_ylabel('Average CO2 Emissions (kg)', fontsize=12)
hour_ax.set_xticks(range(0, 24, 2))

plt.tight_layout()
plt.savefig('results/figures/co2_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Step 8: Summary and Conclusions

In [None]:
# Final text summary. Reads the top row of each (already sorted) results
# table and the CO2/energy correlation computed in the CO2 analysis cell.
# The section icons and the R² label had been mis-encoded (UTF-8 bytes read
# as cp1252, e.g. 'ðŸ“Š', 'RÂ²'); they are restored to the intended characters.
# NOTE(review): the two sustainability bullet points and the dataset time
# period are fixed text, not derived from the data - confirm they match
# the figures before publishing.
print("="*70)
print("PROJECT SUMMARY")
print("="*70)

print("\n📊 DATASET OVERVIEW:")
print(f"   - Total Records: {len(df):,}")
print(f"   - Features: {len(df.columns)}")
print("   - Time Period: Full year 2018 (15-minute intervals)")

print("\n📈 REGRESSION RESULTS (Energy Prediction):")
best_reg = reg_results_df.iloc[0]  # table is sorted by R2_Score, descending
print(f"   - Best Model: {best_reg['Model']}")
print(f"   - R² Score: {best_reg['R2_Score']:.4f}")
print(f"   - RMSE: {best_reg['RMSE']:.4f} kWh")
print(f"   - MAE: {best_reg['MAE']:.4f} kWh")

print("\n🎯 CLASSIFICATION RESULTS (Load Type):")
best_clf = clf_results_df.iloc[0]  # table is sorted by Accuracy, descending
print(f"   - Best Model: {best_clf['Model']}")
print(f"   - Accuracy: {best_clf['Accuracy']:.4f} ({best_clf['Accuracy']*100:.2f}%)")

print("\n🌱 SUSTAINABILITY INSIGHTS:")
print(f"   - CO2-Energy Correlation: {co2_energy_corr:.4f}")
print("   - Maximum Load produces highest CO2 emissions")
print("   - Peak hours (8 AM - 6 PM) have highest energy consumption")

print("\n💡 RECOMMENDATIONS:")
print("   1. Schedule heavy operations during off-peak hours")
print("   2. Monitor and optimize Maximum Load operations")
print("   3. Implement predictive maintenance using the ML models")
print("   4. Use real-time predictions for energy management")

print("\n" + "="*70)

In [None]:
# Write both comparison tables to CSV, without the DataFrame index column
result_exports = (
    (reg_results_df, 'results/regression_results.csv'),
    (clf_results_df, 'results/classification_results.csv'),
)
for results_frame, csv_path in result_exports:
    results_frame.to_csv(csv_path, index=False)

print("Results saved successfully!")
print("- results/regression_results.csv")
print("- results/classification_results.csv")

---
## References

1. Sathishkumar V E, Shin C., Cho Y., "Efficient energy consumption prediction model for a data analytic-enabled industry building in a smart city", Building Research & Information, 2021.

2. Dataset Source: Kaggle - Steel Industry Energy Consumption  
   URL: https://www.kaggle.com/datasets/csafrit2/steel-industry-energy-consumption

3. Scikit-learn Documentation: https://scikit-learn.org/stable/

4. XGBoost Documentation: https://xgboost.readthedocs.io/