In [2]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/23 21:00:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Let's read in the data.
df = spark.read.option("header", "true").csv('Datasets/global-data-on-sustainable-energy.csv')
df2 = spark.read.option("header", "true").csv('Datasets/annual-co2-emissions-per-country.csv')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

## 2.2 Dataset Information

In [4]:
# The show method allows you visualise DataFrames. We can see that there are two columns. 
df.show()

# You could also try this. 
df.columns

+-----------+----+---------------------------------------+---------------------------------+----------------------------------------------------+----------------------------------------------+----------------------------------------------------------------+-----------------------------------+------------------------------+---------------------------------+--------------------------------------+--------------------------------------------------+-----------------------------------------------------------+---------------------------------+----------------------------------------+-----------+--------------+--------------+--------------+--------+---------+
|     Entity|Year|Access to electricity (% of population)|Access to clean fuels for cooking|Renewable-electricity-generating-capacity-per-capita|Financial flows to developing countries (US $)|Renewable energy share in the total final energy consumption (%)|Electricity from fossil fuels (TWh)|Electricity from nuclear (TWh)|Electricity from

['Entity',
 'Year',
 'Access to electricity (% of population)',
 'Access to clean fuels for cooking',
 'Renewable-electricity-generating-capacity-per-capita',
 'Financial flows to developing countries (US $)',
 'Renewable energy share in the total final energy consumption (%)',
 'Electricity from fossil fuels (TWh)',
 'Electricity from nuclear (TWh)',
 'Electricity from renewables (TWh)',
 'Low-carbon electricity (% electricity)',
 'Primary energy consumption per capita (kWh/person)',
 'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
 'Value_co2_emissions_kt_by_country',
 'Renewables (% equivalent primary energy)',
 'gdp_growth',
 'gdp_per_capita',
 'Density(P/Km2)',
 'Land Area(Km2)',
 'Latitude',
 'Longitude']

In [5]:
num_rows = df.count()
print(f"Number of rows: {num_rows}")

num_columns = len(df.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 3649
Number of columns: 21


In [6]:
# For type, we can use print schema. 
# But wait! What if you want to change the format of the data? Maybe change age to an integer instead of long?
df.printSchema()

root
 |-- Entity: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Access to electricity (% of population): string (nullable = true)
 |-- Access to clean fuels for cooking: string (nullable = true)
 |-- Renewable-electricity-generating-capacity-per-capita: string (nullable = true)
 |-- Financial flows to developing countries (US $): string (nullable = true)
 |-- Renewable energy share in the total final energy consumption (%): string (nullable = true)
 |-- Electricity from fossil fuels (TWh): string (nullable = true)
 |-- Electricity from nuclear (TWh): string (nullable = true)
 |-- Electricity from renewables (TWh): string (nullable = true)
 |-- Low-carbon electricity (% electricity): string (nullable = true)
 |-- Primary energy consumption per capita (kWh/person): string (nullable = true)
 |-- Energy intensity level of primary energy (MJ/$2017 PPP GDP): string (nullable = true)
 |-- Value_co2_emissions_kt_by_country: string (nullable = true)
 |-- Renewables (% equiva

In [7]:
# We can use the describe method get some general statistics on our data too. Remember to show the DataFrame!
# But what about data type?
# df.describe().show()

stats_df = df.describe()
# Convert Spark DataFrame to Pandas DataFrame
stats_pd = stats_df.toPandas()

# Transpose the Pandas DataFrame
transposed_stats_pd = stats_pd.transpose()

from IPython.display import display

# Display the transposed DataFrame
display(transposed_stats_pd)


                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Entity,3649,,,Afghanistan,Zimbabwe
Year,3649,2010.0383666758016,6.054228365650764,2000,2020
Access to electricity (% of population),3639,78.93370216207742,30.275541433748966,1.2522693,99.99887
Access to clean fuels for cooking,3480,63.255287356321816,39.0436575747337,0,99.9
Renewable-electricity-generating-capacity-per-capita,2718,113.13749816041195,244.16725566287886,0,99.89
Financial flows to developing countries (US $),1560,9.4224E7,2.981544065856051E8,0,99900000
Renewable energy share in the total final energy consumption (%),3455,32.63816497829229,29.89490136284931,0,96.04
Electricity from fossil fuels (TWh),3628,70.3650027563395,348.0518661301112,0,99.83
Electricity from nuclear (TWh),3523,13.450190178824874,73.00662309724122,0,99.46


In [12]:
# Count unique values for a specific column
column_name = "Entity"
unique_count = df.select(column_name).distinct().count()
print(f"Number of unique countries in dataset '{column_name}': {unique_count}")

Number of unique countries in dataset 'Entity': 176


In [8]:
# The show method allows you visualise DataFrames. We can see that there are two columns. 
df2.show()

# You could also try this. 
df2.columns

+-----------+----+----+--------------------+
|     Entity|Code|Year|Annual CO₂ emissions|
+-----------+----+----+--------------------+
|Afghanistan| AFG|1949|               14656|
|Afghanistan| AFG|1950|               84272|
|Afghanistan| AFG|1951|               91600|
|Afghanistan| AFG|1952|               91600|
|Afghanistan| AFG|1953|              106256|
|Afghanistan| AFG|1954|              106256|
|Afghanistan| AFG|1955|              153888|
|Afghanistan| AFG|1956|              183200|
|Afghanistan| AFG|1957|              293120|
|Afghanistan| AFG|1958|              329760|
|Afghanistan| AFG|1959|              384571|
|Afghanistan| AFG|1960|              413885|
|Afghanistan| AFG|1961|              490798|
|Afghanistan| AFG|1962|              688594|
|Afghanistan| AFG|1963|              706736|
|Afghanistan| AFG|1964|              838551|
|Afghanistan| AFG|1965|             1006917|
|Afghanistan| AFG|1966|             1091159|
|Afghanistan| AFG|1967|             1281865|
|Afghanist

['Entity', 'Code', 'Year', 'Annual CO₂ emissions']

In [9]:
num_rows = df2.count()
print(f"Number of rows: {num_rows}")

num_columns = len(df2.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 30308
Number of columns: 4


In [10]:
# For type, we can use print schema. 
# But wait! What if you want to change the format of the data? Maybe change age to an integer instead of long?
df2.printSchema()

root
 |-- Entity: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Annual CO₂ emissions: string (nullable = true)



In [11]:
# We can use the describe method get some general statistics on our data too. Remember to show the DataFrame!
# But what about data type?
df2.describe().show()

[Stage 13:>                                                         (0 + 1) / 1]

+-------+-----------+-----+------------------+--------------------+
|summary|     Entity| Code|              Year|Annual CO₂ emissions|
+-------+-----------+-----+------------------+--------------------+
|  count|      30308|24157|             30308|               30308|
|   mean|       null| null|1940.1913356209582| 3.912721561273364E8|
| stddev|       null| null| 65.51023156216606|1.8558246650416756E9|
|    min|Afghanistan|  ABW|              1750|                   0|
|    max|   Zimbabwe|  ZWE|              2022|              999975|
+-------+-----------+-----+------------------+--------------------+



                                                                                

## 2.3 Data Exploration