*Title: Analysing Tropospheric Formaldehyde (HCHO) gas in Sri Lanka*

*Author: Benura Wickramanayake 💻*

*Date: 2024-03-25*

In [2]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

### PySpark Environment

In [3]:
# create a Spark session
spark = SparkSession.builder.appName("DataPreprocessing").getOrCreate()

## Create Dataframe

In [4]:
# Define the schema
schema = StructType([
    StructField("HCHO_reading", DoubleType(), True),
    StructField("Location", StringType(), True),
    StructField("Current_Date", StringType(), True),
    StructField("Next_Date", StringType(), True)
])

In [7]:
# load data
file_paths = [
    "Dataset\\col_mat_nuw_output.csv",
    "Dataset\\mon_kur_jaf_output.csv",
    "Dataset\\kan_output.csv"
]

In [8]:
# dataframe creation
df = spark.read.csv(file_paths, schema=schema, header=True)

Defined schema of the dataset with data types for each column, load 3 csv files (dataset) into DataFrame  

### Dataset Observations

In [9]:
# df summary
df_summary = df.describe()
df_summary.show()

+-------+--------------------+-------------------+------------+----------+
|summary|        HCHO_reading|           Location|Current_Date| Next_Date|
+-------+--------------------+-------------------+------------+----------+
|  count|                7916|              12779|       12779|     12779|
|   mean|1.192609008729181...|               NULL|        NULL|      NULL|
| stddev|9.322891110805865E-5|               NULL|        NULL|      NULL|
|    min|-3.52473024357239...| Bibile, Monaragala|  2019-01-01|2019-01-02|
|    max|8.997101837438971E-4|Nuwara Eliya Proper|  2023-12-31|2024-01-01|
+-------+--------------------+-------------------+------------+----------+



In [11]:
# see first 50 rows
df.show(50, truncate=False)   #truncate - readability

+----------------------+------------------+------------+----------+
|HCHO_reading          |Location          |Current_Date|Next_Date |
+----------------------+------------------+------------+----------+
|1.919914652467399E-5  |Bibile, Monaragala|2019-01-02  |2019-01-03|
|2.8114479359302837E-5 |Bibile, Monaragala|2019-01-03  |2019-01-04|
|3.747998184385943E-5  |Bibile, Monaragala|2019-01-04  |2019-01-05|
|-1.7982608793453114E-5|Bibile, Monaragala|2019-01-05  |2019-01-06|
|1.4578368961799026E-4 |Bibile, Monaragala|2019-01-06  |2019-01-07|
|2.8285908025465342E-5 |Bibile, Monaragala|2019-01-07  |2019-01-08|
|NULL                  |Bibile, Monaragala|2019-01-08  |2019-01-09|
|1.4208501670509577E-4 |Bibile, Monaragala|2019-01-09  |2019-01-10|
|NULL                  |Bibile, Monaragala|2019-01-10  |2019-01-11|
|2.014587947072581E-5  |Bibile, Monaragala|2019-01-11  |2019-01-12|
|1.5827876632101837E-4 |Bibile, Monaragala|2019-01-12  |2019-01-13|
|NULL                  |Bibile, Monaragala|2019-

# PreProcessing

## 