# Step 1: Importing Basic Libraries 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb 


# Step 2: Dataset Integrity Verification

The SHA-256 hash of the dataset used in this notebook is:

`f7b7b240b5a2684033844cd61da732003b7e305a62ed523418112fc1de48fdbf`

Comparing the hash computed below against this value confirms the dataset's integrity and authenticity.

In [None]:
import hashlib

import pandas as pd

# Location of the Kaggle CSV and the SHA-256 digest published in the
# "Dataset Integrity Verification" text of this notebook.
DATASET_PATH = "/kaggle/input/renewable-energy-usage-in-usa/Renewable energy usage in USA.csv"
EXPECTED_SHA256 = "f7b7b240b5a2684033844cd61da732003b7e305a62ed523418112fc1de48fdbf"

# Loading the dataset
df = pd.read_csv(DATASET_PATH)

# The digest is taken over the DataFrame re-serialised as CSV text (no index),
# not over the raw bytes of the file on disk.
data_string = df.to_csv(index=False)

# Computing the SHA-256 hash of the dataset
hash_value = hashlib.sha256(data_string.encode()).hexdigest()

# Printing the hash
print(f"SHA-256 Hash of the Dataset: {hash_value}")

# Previously the hash was only printed, so a changed dataset went unnoticed;
# compare it with the published digest and report the outcome explicitly.
# NOTE(review): presumably the published digest was produced the same way
# (via to_csv); the re-serialised text may differ between pandas versions — confirm.
if hash_value == EXPECTED_SHA256:
    print("Integrity check passed: hash matches the published value.")
else:
    print(f"WARNING: integrity check failed, expected {EXPECTED_SHA256}")


# Step 3: Displaying Dataset

In [None]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Step 4: Normalizing Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns that must keep their original values: the calendar fields and the
# sector indicator columns.
columns_to_exclude = [
    'Year',
    'Month',
    'Sector_Commerical',
    'Sector_Electric Power',
    'Sector_Industrial',
    'Sector_Residential',
    'Sector_Transportation',
]

# Candidate columns: everything stored as 64-bit float or 64-bit integer.
numerical_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)

# Keep the candidates that are not on the exclusion list, preserving their order.
columns_to_normalize = []
for col in numerical_cols:
    if col in columns_to_exclude:
        continue
    columns_to_normalize.append(col)

# Rescale the selected columns with a min-max scaler and write them back into `df`.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values

# Displaying the normalized dataset
df.head(10)

# Step 5: Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Energy sources drawn as individual lines on the chart.
energy_columns = [
    'Hydroelectric Power',
    'Geothermal Energy',
    'Solar Energy',
    'Wind Energy',
    'Wood Energy',
    'Waste Energy',
    'Fuel Ethanol, Excluding Denaturant',
    'Biomass Losses and Co-products',
    'Conventional Hydroelectric Power',
    'Biodiesel',
]

# Average every numeric column per value of 'Month'.
monthly_trends = df.groupby('Month').mean(numeric_only=True)

# One marked line per energy source.
ax = monthly_trends[energy_columns].plot(kind='line', figsize=(10, 5), marker='o')

ax.set_title('Monthly Average Energy Consumption')
ax.set_xlabel('Month')
ax.set_ylabel('Average Consumption')

# Semi-transparent watermark centred on the axes.
ax.text(
    0.5, 0.5, "Mohsin Ali Fida's Notebook",
    fontsize=30, color='gray', alpha=0.3,
    ha='center', va='center', rotation=30,
    transform=ax.transAxes,
)

plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all numeric columns of `df`.
numeric_data = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)

ax.set_title("Correlation Matrix")

# Semi-transparent watermark centred on the axes.
ax.text(
    0.5, 0.5, "Notebook by Mohsin ALi",
    fontsize=30, color='gray', alpha=0.3,
    ha='center', va='center', rotation=30,
    transform=ax.transAxes,
)

plt.show()


In [None]:
# `data` is used by every remaining cell but was never assigned anywhere in
# this notebook, so a fresh "Restart & Run All" stopped here with a NameError.
# NOTE(review): assumes the remaining analysis is meant to run on the prepared
# DataFrame `df`; a copy is taken so columns added to `data` later do not
# alter `df` — confirm this is the intended source frame.
data = df.copy()
data.head()

In [None]:
# One-hot season indicator columns and the season name each one stands for.
season_labels = {
    'Season_Spring': 'Spring',
    'Season_Summer': 'Summer',
    'Season_Winter': 'Winter',
}

# Checking if the Season columns are one-hot encoded and decode to a categorical column
if 'Season_Spring' in data.columns:
    season_flags = data[list(season_labels)].astype(bool)
    # idxmax() picks the first flagged column for each row and the map turns
    # that column name into a season name.
    decoded_season = season_flags.idxmax(axis=1).map(season_labels)
    # idxmax() also returns the first column when NO flag is set, which used to
    # label such rows 'Spring'. Those rows are now given their own label.
    # NOTE(review): rows with no flag set are presumably the fourth season that
    # was dropped during one-hot encoding, hence 'Fall' — confirm.
    data['Season'] = decoded_season.where(season_flags.any(axis=1), 'Fall')

# Custom colour per season (green, red, blue, plus orange for the unflagged season);
# every label that can appear in 'Season' needs an entry here.
custom_palette = {
    'Spring': '#2ca02c',
    'Summer': '#d62728',
    'Winter': '#1f77b4',
    'Fall': '#ff7f0e',
}

# Plotting the boxplot with the custom color palette
plt.figure(figsize=(7, 5))
ax = sns.boxplot(data=data, x='Season', y='Total Renewable Energy', palette=custom_palette)

# Adding title and labels
plt.title('Total Renewable Energy Consumption by Season')
plt.ylabel('Total Renewable Energy Consumption')
plt.xlabel('Season')

# Semi-transparent watermark centred on the axes.
plt.text(0.5, 0.5, "Kaggle Notebook by Mohsin Ali",
         fontsize=30, color='gray', alpha=0.3,
         ha='center', va='center', rotation=30, transform=ax.transAxes)

plt.show()


In [None]:
# One row per (Year, Month): mean total consumption plus the season flags of
# the first row in each group.
month_season_data = data.groupby(['Year', 'Month']).agg({
    'Total Renewable Energy': 'mean',
    'Season_Spring': 'first',
    'Season_Summer': 'first',
    'Season_Winter': 'first'
}).reset_index()

# Assigning seasons based on the binary columns (vectorised, no row-wise apply).
season_labels = {
    'Season_Spring': 'Spring',
    'Season_Summer': 'Summer',
    'Season_Winter': 'Winter',
}
season_flags = month_season_data[list(season_labels)].astype(bool)
# The previous fallback labelled every row without a Spring/Summer flag as
# 'Winter', including rows where no season flag is set at all. Those rows are
# now labelled separately.
# NOTE(review): rows with no flag set are presumably the fourth season that was
# dropped during one-hot encoding, hence 'Fall' — confirm.
month_season_data['Season'] = (
    season_flags.idxmax(axis=1)
    .map(season_labels)
    .where(season_flags.any(axis=1), 'Fall')
)

# Plotting the data
plt.figure(figsize=(8, 4))
ax = sns.lineplot(data=month_season_data, x='Month', y='Total Renewable Energy', hue='Season',
                  style='Season', markers=True, palette='viridis')

# Add titles and labels
plt.title('Monthly Renewable Energy Consumption by Season')
plt.xlabel('Month')
plt.ylabel('Total Renewable Energy Consumption')

# Semi-transparent watermark centred on the axes.
plt.text(0.5, 0.5, "Kaggle Notebook by Mohsin ALi",
         fontsize=30, color='gray', alpha=0.3,
         ha='center', va='center', rotation=30, transform=ax.transAxes)

plt.show()


In [None]:
# Year x Month grid holding the mean 'Total Renewable Energy' of each cell.
pivot_data = data.pivot_table(
    index='Year',
    columns='Month',
    values='Total Renewable Energy',
    aggfunc='mean',
)

# Draw the grid as a heatmap with a labelled colour bar.
plt.figure(figsize=(8, 5))
ax = sns.heatmap(
    pivot_data,
    cmap='YlGnBu',
    annot=False,
    cbar_kws={'label': 'Total Renewable Energy'},
)

# Title and axis labels.
ax.set_title('Seasonal Renewable Energy Consumption Heatmap')
ax.set_xlabel('Month')
ax.set_ylabel('Year')

# Semi-transparent watermark centred on the axes.
ax.text(
    0.5, 0.5, "Kaggle Notebook by Mohsin Ali",
    fontsize=30, color='gray', alpha=0.3,
    ha='center', va='center', rotation=30,
    transform=ax.transAxes,
)

plt.show()


# Step 6: Building Random Forest Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Step 1: Preparing the data
# Converting categorical features to numeric (excluding 'Sector' as it's the target)
data_encoded = pd.get_dummies(data.drop(columns=['Sector']), drop_first=True)
X = data_encoded  # Features
y = data['Sector']  # Target variable

# Step 2: Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Training the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Making predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Step 5: Evaluating the model on the test set
print("Random Forest Model Performance on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Step 6: Performing 5-fold cross-validation for different evaluation metrics.
# A single cross_validate() call scores all four metrics on the same fitted
# folds; the previous four cross_val_score() calls refitted the forest for
# every metric (20 fits instead of 5). cross_validate() works on clones, so the
# `rf_model` fitted above is left untouched.
cv_results_rf = cross_validate(
    rf_model, X, y, cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)
accuracy_scores_rf = cv_results_rf['test_accuracy']
precision_scores_rf = cv_results_rf['test_precision_macro']
recall_scores_rf = cv_results_rf['test_recall_macro']
f1_scores_rf = cv_results_rf['test_f1_macro']

# Step 7: Printing the average scores and standard deviations for cross-validation
# (precision, recall and F1 are macro-averaged over the classes)
print("\n5-Fold Cross-Validation Results for Random Forest:")
print("Accuracy: {:.2f} ± {:.2f}".format(np.mean(accuracy_scores_rf), np.std(accuracy_scores_rf)))
print("Precision: {:.2f} ± {:.2f}".format(np.mean(precision_scores_rf), np.std(precision_scores_rf)))
print("Recall: {:.2f} ± {:.2f}".format(np.mean(recall_scores_rf), np.std(recall_scores_rf)))
print("F1 Score: {:.2f} ± {:.2f}".format(np.mean(f1_scores_rf), np.std(f1_scores_rf)))


# Step 7: Building SVM Model

In [None]:
# SVM model: hold-out split, feature standardisation and grid search
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Step 1: Features and target (the `data_encoded` frame from the Random Forest step is reused)
X_SVM = data_encoded
y_SVM = data['Sector']

# Step 2: 80/20 train/test split with a fixed seed
X_train_SVM, X_test_SVM, y_train_SVM, y_test_SVM = train_test_split(
    X_SVM,
    y_SVM,
    test_size=0.2,
    random_state=42,
)

# Step 3: Standardise the features; the scaler is fitted on the training
# portion only and then applied to both portions.
scaler = StandardScaler()
scaler.fit(X_train_SVM)
X_train_SVM = scaler.transform(X_train_SVM)
X_test_SVM = scaler.transform(X_test_SVM)

# Step 4: Hyperparameter grid explored by the search
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Base estimator whose parameters are varied by the search
svm_model = SVC()

# 5-fold grid search scored on accuracy, using all available cores
grid_search_SVM = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_SVM.fit(X_train_SVM, y_train_SVM)

# Step 5: Report the winning parameters and keep the corresponding estimator
print("Best SVM parameters:", grid_search_SVM.best_params_)
best_svm_model = grid_search_SVM.best_estimator_


In [None]:

from sklearn.base import clone
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Step 6: Evaluating the best model on the (already standardised) test set
y_pred_SVM = best_svm_model.predict(X_test_SVM)
print("\nSVM Model Performance on Test Set:")
print("Accuracy:", accuracy_score(y_test_SVM, y_pred_SVM))
print("Classification Report:\n", classification_report(y_test_SVM, y_pred_SVM))

# Step 7: Cross-Validation with the best model
# The hyperparameters were tuned on standardised features, but the
# cross-validation used to run the SVC on the raw, unscaled X_SVM. Wrapping an
# unfitted clone of the tuned SVC in a pipeline fits the scaler on the training
# part of every fold, so the model is scored on standardised features, as in
# the grid search and the test-set evaluation above.
svm_cv_pipeline = make_pipeline(StandardScaler(), clone(best_svm_model))

# Performing 5-fold cross-validation for all evaluation metrics in one pass
cv_results_SVM = cross_validate(
    svm_cv_pipeline, X_SVM, y_SVM, cv=5,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)
accuracy_scores_SVM = cv_results_SVM['test_accuracy']
precision_scores_SVM = cv_results_SVM['test_precision_macro']
recall_scores_SVM = cv_results_SVM['test_recall_macro']
f1_scores_SVM = cv_results_SVM['test_f1_macro']

# Printing the average scores and standard deviations
# (precision, recall and F1 are macro-averaged over the classes)
print("\n5-Fold Cross-Validation Results for SVM:")
print("Accuracy: {:.2f} ± {:.2f}".format(np.mean(accuracy_scores_SVM), np.std(accuracy_scores_SVM)))
print("Precision: {:.2f} ± {:.2f}".format(np.mean(precision_scores_SVM), np.std(precision_scores_SVM)))
print("Recall: {:.2f} ± {:.2f}".format(np.mean(recall_scores_SVM), np.std(recall_scores_SVM)))
print("F1 Score: {:.2f} ± {:.2f}".format(np.mean(f1_scores_SVM), np.std(f1_scores_SVM)))


# Step 8: Comparative Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from math import pi

# Step 1: Metric names, in the order they are placed around the chart
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
num_metrics = len(metrics)

# Mean cross-validation score per metric for each model
rf_scores = [
    np.mean(fold_scores)
    for fold_scores in (accuracy_scores_rf, precision_scores_rf, recall_scores_rf, f1_scores_rf)
]
svm_scores = [
    np.mean(fold_scores)
    for fold_scores in (accuracy_scores_SVM, precision_scores_SVM, recall_scores_SVM, f1_scores_SVM)
]

# Step 2: One evenly spaced angle per metric
angles = [n / float(num_metrics) * 2 * pi for n in range(num_metrics)]

# Repeat the first point at the end of every series so each outline closes
angles.append(angles[0])
rf_scores.append(rf_scores[0])
svm_scores.append(svm_scores[0])

# Step 3: Polar axes with one labelled spoke per metric
radar_fig = plt.figure(figsize=(5, 5))
ax = radar_fig.add_subplot(111, polar=True)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(metrics)

# Outline and translucent fill for each model
model_series = (
    ("Random Forest", rf_scores, 'b', '-'),
    ("SVM", svm_scores, 'r', '--'),
)
for model_label, model_scores, model_color, model_linestyle in model_series:
    ax.plot(angles, model_scores, label=model_label, color=model_color,
            linewidth=2, linestyle=model_linestyle)
    ax.fill(angles, model_scores, color=model_color, alpha=0.25)

# Step 4: Title and legend
ax.set_title("Performance Comparison of Random Forest and SVM Models", size=15, color='black', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

# Step 5: Semi-transparent watermark centred on the axes
ax.text(
    0.5, 0.5, "Kaggle Notebook by Mohsin ALi",
    fontsize=30, color='gray', alpha=0.3,
    ha='center', va='center', rotation=30,
    transform=ax.transAxes,
)

# Display the radar chart
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-fold cross-validation scores of each model, in metric order
rf_fold_scores = (accuracy_scores_rf, precision_scores_rf, recall_scores_rf, f1_scores_rf)
svm_fold_scores = (accuracy_scores_SVM, precision_scores_SVM, recall_scores_SVM, f1_scores_SVM)

# Mean and standard deviation per metric for Random Forest
rf_means = [np.mean(fold_scores) for fold_scores in rf_fold_scores]
rf_stds = [np.std(fold_scores) for fold_scores in rf_fold_scores]

# Mean and standard deviation per metric for SVM
svm_means = [np.mean(fold_scores) for fold_scores in svm_fold_scores]
svm_stds = [np.std(fold_scores) for fold_scores in svm_fold_scores]

# Metric labels and their positions along the x-axis
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(metrics))

# Width of a single bar; the two models sit side by side around each position
width = 0.35

bar_fig, bar_ax = plt.subplots(figsize=(6, 4))

# Random Forest bars to the left, SVM bars to the right; error bars show the standard deviation
bar_ax.bar(x - width/2, rf_means, width, yerr=rf_stds, capsize=5,
           label='Random Forest', color='skyblue', alpha=0.8)
bar_ax.bar(x + width/2, svm_means, width, yerr=svm_stds, capsize=5,
           label='SVM', color='salmon', alpha=0.8)

# Tick labels, axis label, title, legend and horizontal grid lines
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(metrics)
bar_ax.set_ylabel('Scores')
bar_ax.set_title('Comparison of Random Forest and SVM Models (5-Fold Cross-Validation)')
bar_ax.legend()
bar_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Semi-transparent watermark centred on the axes
bar_ax.text(
    0.5, 0.5, "Kaggle Notebook by Mohsin Ali",
    fontsize=30, color='gray', alpha=0.3,
    ha='center', va='center', rotation=30,
    transform=bar_ax.transAxes,
)

bar_fig.tight_layout()
plt.show()
