# Data Cleaning with PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, regexp_replace, monotonically_increasing_id


# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name "Data_cleaning".
spark = (
    SparkSession.builder
    .appName("Data_cleaning")
    .getOrCreate()
)

## 1. Data Reading and Loading:

In [None]:
# Load the semicolon-separated CSV: the first line is used as the header and
# the column types are inferred from the data.
df = spark.read.csv(
    "population.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [None]:
# Fetch the first three rows to the driver as a list of Row objects.
df.take(3)

In [None]:
# Print the first three rows as a formatted table.
df.show(3)

In [None]:
# Return every row as a list of Row objects.
# NOTE(review): collect() loads the whole dataset into driver memory, so it is
# only appropriate for small inputs.
df.collect()

## 2. Data Description

In [None]:
# Print the column names and their inferred types as a tree.
df.printSchema()

In [None]:
# List of column names.
df.columns

In [None]:
# The same column names, read from the schema object.
df.schema.names

In [None]:
# Summary statistics as a DataFrame (call .show() on it to print the values).
df.describe()

In [None]:
# Print the summary statistics table.
df.describe().show()

In [None]:
# Number of rows in the DataFrame.
df.count()

In [None]:
# Number of columns in the DataFrame.
len(df.columns)

## 3. Handling Missing Data:

In [None]:
# Count the rows where "with_migration_background" has a value.
df.filter(col("with_migration_background").isNotNull()).count()

In [None]:
# Count the rows where "with_migration_background" is missing.
df.filter(col("with_migration_background").isNull()).count()

In [None]:
# List the column names again before checking each one for nulls.
df.columns

In [None]:
# Report the number of nulls per column. All counts are computed in a single
# aggregation (one Spark job) instead of one filter/count job per column;
# the printed lines are the same.
null_counts = df.select(
    [count(when(col(name).isNull(), name)).alias(name) for name in df.columns]
).collect()[0]
for name in df.columns:
    print(f"{name} : {null_counts[name]}")

In [None]:
# Replace missing values with the placeholder "deleting" so they can be found
# by value in the following cells, then preview the result.
# NOTE(review): with a string fill value, fillna presumably only affects
# string-typed columns — confirm the inferred schema.
df = df.fillna("deleting")
df.show(3)

## 4. Data Filtering and Cleaning:

In [None]:
# Show the rows whose "total" holds the "deleting" placeholder.
df.filter(col("total") == "deleting").show()

In [None]:
# Remove the rows whose "total" holds the "deleting" placeholder.
# DataFrame.drop() removes columns, not rows, so the rows are excluded with a
# filter, and the result is assigned back because DataFrames are immutable.
df = df.filter(col("total") != "deleting")

In [None]:
# Count the rows whose "total" still holds the "deleting" placeholder.
df.filter(col("total") == "deleting").count()

In [None]:
# Count the rows whose "with_migration_background" holds the "deleting" placeholder.
df.filter(col("with_migration_background") == "deleting").count()

In [None]:
# Current number of rows.
df.count()

In [None]:
# Add a synthetic "index" column so individual rows can be addressed later.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids,
# but they are only consecutive within a single partition — confirm the CSV
# is loaded as one partition before relying on thresholds such as index <= 4.
df = df.withColumn("index", monotonically_increasing_id())
df.show(3)

In [None]:
# Move "index" to the front. df.columns already contains "index" at this
# point, so it is excluded from the remainder to avoid selecting it twice
# (a duplicated column name makes later references to "index" ambiguous).
df = df.select("index", *[c for c in df.columns if c != "index"])
df.show(3)

In [None]:
# Look up the rows whose index is 1 or 2.
wanted_rows = (col("index") == 1) | (col("index") == 2)
df.filter(wanted_rows).show()

In [None]:
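# Kept commented out: the cells in section 5 filter on the "index" column,
# so it must not be dropped here.
# df = df.drop(col("index"))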

In [None]:
# Preview the first three rows after the index-related steps.
df.show(3)

## 5. Data Manipulation:

In [None]:
# For the rows with index <= 4, strip every character that is not a letter
# or a dot from "postcode", then display the result.
df_postcode = (
    df.filter(col("index") <= 4)
    .withColumn(
        "postcode",
        regexp_replace(col("postcode"), '[^a-zA-Z.]+', ""),
    )
)
df_postcode.show()

In [None]:
# For the rows with index > 4, strip every character that is not a digit or
# a dot from "postcode", then display the result.
df_postcode_last = (
    df.filter(col("index") > 4)
    .withColumn(
        "postcode",
        regexp_replace(col("postcode"), '[^0-9.]+', ""),
    )
)
df_postcode_last.show()

In [None]:
# Discard the rows with index <= 4. DataFrame.drop() only removes columns,
# so the row-level removal is expressed as a filter that keeps index > 4.
df = df.filter(col("index") > 4)
df.show()

In [None]:
# Display the current DataFrame (show() prints at most 20 rows by default).
df.show()

In [None]:
# Preview the rows whose index is greater than 4.
df.filter(col("index") > 4).show()

In [None]:
# Keep only the rows with index > 4 and reduce "postcode" to digits and dots.
df = (
    df.filter(col("index") > 4)
    .withColumn(
        "postcode",
        regexp_replace(col("postcode"), '[^0-9.]+', ""),
    )
)
df.show()

In [None]:


# Keep only the rows with index > 4 and strip everything except digits and
# dots from each data column. The statement previously referenced an
# undefined name `column`; it is now the loop variable over df.columns.
# "index" is skipped so it stays numeric for the index-based filters.
df = df.filter(col("index") > 4)
for column in df.columns:
    if column == "index":
        continue
    df = df.withColumn(column, regexp_replace(col(column), '[^0-9.]+', ""))

    

In [None]:
# Display the DataFrame after the cleaning cell above.
df.show()

In [None]:
# Check for rows with index <= 4; none are expected, because df was
# restricted to index > 4 in an earlier cell.
df.filter(col("index") <= 4).show()

In [None]:
# Strip every character that is not a letter, a digit or a dot from all
# columns. The loop variable must not be named `col`: that would rebind the
# imported pyspark.sql.functions.col and break every later call to col(...).
for column_name in df.columns:
    df = df.withColumn(
        column_name,
        regexp_replace(column_name, r'[^0-9a-zA-Z.]+', ''),
    )

In [None]:
# Inspect the result of stripping special characters from every column.
df.show()

In [None]:
# One-row table with the number of nulls in each column: when() yields a
# non-null marker only for null cells, and count() counts those markers.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_count_exprs).show()