In [0]:
# Read the sample IT carbon-emission CSV; the first row supplies the column names
csv_path = "/Volumes/carbonfootprint/default/carbonfootprint/datacentresworldwide.csv"
df = spark.read.option("header", "true").csv(csv_path)

In [0]:
# Show the freshly loaded DataFrame in the notebook output for a first look at the data
display(df)

In [0]:
# List the column names taken from the CSV header row
df.columns

In [0]:
# describe() only builds a (lazy) summary DataFrame; hand it to display() so the
# statistics are actually rendered instead of just the DataFrame's repr.
display(df.describe())

In [0]:
# Keep only the rows in which every column is populated.
# NOTE(review): no later cell visible here reads df_dropped_any - the cleaning and
# aggregation cells start from df - so confirm whether they were meant to use it.
df_dropped_any = df.dropna(how="any")

In [0]:
# Data cleaning and type casting
from pyspark.sql.functions import col

# The count column is spelled "centre" here to match the other cells that reference
# it; col() on a column that does not exist raises an AnalysisException.
# NOTE(review): confirm the exact spelling against the CSV header.
df_clean = (
    df.withColumn("_c5", col("_c5").cast("string"))
    .withColumn("_c7", col("_c7").cast("string"))
    .withColumn("Number of data centre", col("Number of data centre").cast("string"))
)

        

In [0]:
# Aggregate carbon footprint per (Country, Region)

from pyspark.sql.functions import col, expr

# The emission columns are strings at this point; try_cast turns values that
# cannot be parsed as double into NULL instead of failing the query.
emissions_numeric = df_clean.withColumn(
    "_c5", expr("try_cast(`_c5` as double)")
).withColumn(
    "_c7", expr("try_cast(`_c7` as double)")
)

# Sum both emission columns within each Country/Region pair.
emission_sums = emissions_numeric.groupBy("Country", "Region").agg(
    {"_c5": "sum", "_c7": "sum"}
)

# The dict form of agg() names its outputs "sum(<column>)"; give them readable names.
df_emission_bycountry = emission_sums.withColumnRenamed(
    "sum(_c5)", "gross_carbon_emission"
).withColumnRenamed(
    "sum(_c7)", "available_gross_carbon_emission"
)
display(df_emission_bycountry)

In [0]:
# Top 10 rows with the highest number of data centres
from pyspark.sql.functions import desc, expr

# The CSV is read without schema inference, so the count column holds strings and
# a plain descending sort would rank "9" above "1000". Sort on the numeric value
# instead; values that cannot be parsed become NULL and are ordered last.
max_dc = df.orderBy(
    desc(expr("try_cast(`Number of data centre` as double)"))
).limit(10)
max_dc.show()

In [0]:
# Rank the aggregated rows by gross emission and keep the ten largest as
# candidates for carbon optimisation
top_emitters = df_emission_bycountry.sort(
    "gross_carbon_emission", ascending=False
).limit(10)
display(top_emitters)

In [0]:
# Visualize total gross carbon emission per region
import matplotlib.pyplot as plt

pandas_df = df_emission_bycountry.toPandas()

# df_emission_bycountry has one row per (Country, Region); collapse it to a single
# total per region so each region gets exactly one bar. Plotting the raw rows with
# Region on the x-axis would put every point of a region at the same x position.
region_totals = (
    pandas_df.groupby("Region")["gross_carbon_emission"]
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(region_totals.index.astype(str), region_totals.to_numpy())
ax.set_xlabel('Region')
ax.set_ylabel('Gross Emission')
ax.set_title('Carbon Emission per Region')
ax.tick_params(axis="x", rotation=90)  # region names can be long; keep them legible
fig.tight_layout()
plt.show()

In [0]:
# Visualise Box plot with available carbon emission for Datacenters by Region
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import expr

# df is read without schema inference, so _c7 is a string column. A box plot needs
# a numeric y-axis: cast it to double first (unparseable values become NULL/NaN).
df_for_plot = df.select(
    "Region",
    "Number of data centre",
    expr("try_cast(`_c7` as double)").alias("_c7"),
).toPandas()

plt.figure(figsize=(10,6))
sns.boxplot(x="Region", y="_c7", data=df_for_plot)
plt.title("Available Carbon emission by Region")
plt.xticks(rotation=90)
plt.xlabel("Region")
plt.ylabel("Carbon emission")
plt.show()