In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum

# Initialize the Spark session
spark = SparkSession.builder.appName('Project').getOrCreate()

# Creating a function for reading/extracting data
def extracting():
    df = spark.read.csv('/FileStore/tables/GlobalLandTemperaturesByCity.csv', inferSchema=True, header=True)
    return df

# Creating a function for transforming data
def transform(df):
    # Display the null counts for each column
    for c in df.columns:
        null_count = df.select(_sum(col(c).isNull().cast("int")).alias(c))
        null_count.show()

    # Group by 'Country' and calculate the average temperature
    df_avg_time = df.groupBy('Country').agg(avg('AverageTemperature').alias('AvgTemperature'))

    # Add a new column for temperature categories
    df_avg_time = df_avg_time.withColumn(
        'TemperatureCategory',
        when(col('AvgTemperature') > 20, 'Warm')
        .when(col('AvgTemperature') > 10, 'Mild')
        .otherwise('Cold')
    )
    df_avg_time.show()

    # Drop rows with null values in the original DataFrame
    df_cleaned = df.dropna()
    df_cleaned.show()

    return df_avg_time, df_cleaned

# Creating a function to extract/save the transformed data
def extracting_transformed_data(df_avg_time, df_cleaned):
    # Save the average temperature data with categories to a CSV file
    df_avg_time.write.csv('/FileStore/tables/TransformedData', header=True, mode='overwrite')

    # Save the cleaned DataFrame to another CSV file
    df_cleaned.write.csv('/FileStore/tables/CleanedData', header=True, mode='overwrite')

    print("Data saved successfully!")

# Call the functions
df = extracting()
df_avg_time, df_cleaned = transform(df)
extracting_transformed_data(df_avg_time, df_cleaned)


+---+
| dt|
+---+
|  0|
+---+

+------------------+
|AverageTemperature|
+------------------+
|            364130|
+------------------+

+-----------------------------+
|AverageTemperatureUncertainty|
+-----------------------------+
|                       364130|
+-----------------------------+

+----+
|City|
+----+
|   0|
+----+

+-------+
|Country|
+-------+
|      0|
+-------+

+--------+
|Latitude|
+--------+
|       0|
+--------+

+---------+
|Longitude|
+---------+
|        0|
+---------+

+-----------------+------------------+-------------------+
|          Country|    AvgTemperature|TemperatureCategory|
+-----------------+------------------+-------------------+
|           Russia|3.3472679828735536|               Cold|
|         Paraguay|22.784014312977117|               Warm|
|            Yemen| 25.76840766445382|               Warm|
|            Burma|26.016839989290098|               Warm|
|      Philippines| 26.51646246746498|               Warm|
|          Eritrea|24.0015