In [None]:
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum
from pyspark.sql import SparkSession
from functools import reduce

### CLEANING FUNCTIONS

In [None]:
def csv_to_sparkdf(csv_path):
    """Load a CSV file into a Spark DataFrame.

    A Spark session named "SparkProfiling" is obtained with ``getOrCreate()``
    (so an already-running session is reused), configured to run locally on
    all cores with 4g of executor and driver memory.

    Args:
        csv_path: Path of the CSV file; it is handed unchanged to
            ``DataFrameReader.csv``.

    Returns:
        The DataFrame read from ``csv_path``. The first line of the file is
        used as the header and column types are inferred from the data.
    """
    # Memory settings applied to the session builder.
    memory_settings = (
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
    )

    builder = SparkSession.builder.appName("SparkProfiling").master("local[*]")
    for key, value in memory_settings:
        builder = builder.config(key, value)
    session = builder.getOrCreate()

    # header=True: first row holds the column names.
    # inferSchema=True: let Spark work out the column types from the data.
    return session.read.csv(csv_path, header=True, inferSchema=True)

In [None]:
def statistics(spark_df):
    """Print null-value and duplicate-row statistics for a Spark DataFrame.

    Shows one row holding the number of null values in each column, then
    prints how many distinct row values occur more than once (note: this is
    the number of repeated row *values*, not the number of surplus rows) and,
    if there are any, shows those rows together with their occurrence count.

    Args:
        spark_df: The Spark DataFrame to profile.

    Returns:
        None. All results are printed / shown.
    """
    column_names = spark_df.columns

    # One aggregate per column: `when` without `otherwise` yields null for
    # non-null cells, and `count` skips nulls, so only null cells are counted.
    null_aggregates = []
    for name in column_names:
        null_aggregates.append(count(when(col(name).isNull(), name)).alias(name))
    nulls_per_column = spark_df.select(null_aggregates)

    # Group on every column: a group with more than one member is a
    # fully duplicated row.
    repeated_rows = spark_df.groupBy(column_names).count().filter("count > 1")
    repeated_total = repeated_rows.count()

    nulls_per_column.show()
    print(f"Number of duplicates rows: {repeated_total}")

    # Nothing more to display when no row is repeated.
    if repeated_total <= 0:
        return

    print("Duplicate rows:")
    repeated_rows.show(truncate=False)

In [None]:
def cleaning(spark_df, null_ratio_threshold=0.5):
    """Return a 'clean' copy of a Spark DataFrame.

    Two steps are applied, and the row counts before/after each step are
    printed:

    1. exact duplicate rows are removed;
    2. rows with too many null values are removed.

    The first 10 rows of the result are shown before it is returned.

    Args:
        spark_df: The Spark DataFrame to clean.
        null_ratio_threshold: A row is kept only if its number of null
            columns is strictly below
            ``len(columns) * null_ratio_threshold``. The default of 0.5
            drops rows in which half or more of the columns are null;
            lower the value to delete more rows.

    Returns:
        The filtered Spark DataFrame, with the same columns as the input.
    """
    # Initial number of rows
    initial_row_count = spark_df.count()
    print(f"Initial number of rows: {initial_row_count}")

    # Remove duplicate rows
    new_spark_df = spark_df.dropDuplicates()
    row_count_withoutDuplicates = new_spark_df.count()
    print(f"Remove duplicate rows: {initial_row_count - row_count_withoutDuplicates}")

    # Maximum (exclusive) number of nulls a row may contain
    threshold = len(new_spark_df.columns) * null_ratio_threshold

    # Per-row null count: sum of a 0/1 indicator over every column
    null_counts_expr = reduce(
        lambda acc, c: acc + when(col(c).isNull(), 1).otherwise(0),
        new_spark_df.columns,
        lit(0),
    )

    # Filter on the expression itself instead of going through a temporary
    # "null_count" column: adding such a column with withColumn() would
    # replace an input column that already has that name, and dropping it
    # afterwards would remove that column from the result.
    filtered_df = new_spark_df.filter(null_counts_expr < threshold)

    # Every count() launches a Spark job, so count the result once and reuse it.
    final_row_count = filtered_df.count()
    print(f"Number of rows with null values removed: {row_count_withoutDuplicates - final_row_count}")

    # Final number of rows
    print(f"Final number of rows: {final_row_count}")

    # Number of removed rows
    deleted_row_count = initial_row_count - final_row_count
    print(f"Number of removed rows: {deleted_row_count}")

    filtered_df.show(10)
    return filtered_df

In [None]:
''' This function takes a spark dataframe as input and stores it in a csv on hdfs '''
def sparkdf_to_csv(spark_df):
    # Hdfs path where you want to save the CSV file
    output_path = "/output/cleaned_datasets/"
    spark_df = spark_df.coalesce(1)
    spark_df.write.csv(output_path, header=True)

### CLEANING AIRLINE DATASET

In [None]:
spark_df = csv_to_sparkdf("/input/Airline.csv")

In [None]:
statistics(spark_df)

### CLEANING BOOKS RATING DATASET

In [None]:
spark_df = csv_to_sparkdf("/input/Books_rating.csv")

In [None]:
statistics(spark_df)

In [None]:
filtered_df = cleaning(spark_df)

In [None]:
sparkdf_to_csv(filtered_df)

### CLEANING LONDON DATASET

In [None]:
spark_df = csv_to_sparkdf("/input/london.csv")

In [None]:
statistics(spark_df)

In [None]:
filtered_df = cleaning(spark_df)

In [None]:
sparkdf_to_csv(filtered_df)

### CLEANING TITLES DATASET

In [None]:
spark_df = csv_to_sparkdf("/input/dirty_titles.csv")

In [None]:
statistics(spark_df)

In [None]:
filtered_df = cleaning(spark_df)

In [None]:
sparkdf_to_csv(filtered_df)

### CLEANING PRODUCTS MARKET NOV 2019 DATASET

In [None]:
spark_df = csv_to_sparkdf("/input/2019-Nov.csv")

In [None]:
statistics(spark_df)

In [None]:
filtered_df = cleaning(spark_df)

In [None]:
sparkdf_to_csv(filtered_df)