<a href="https://colab.research.google.com/github/DinarSalpaAulia44/Data-Cleaning/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
pip install pyspark



In [62]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("data_transaksi.csv").getOrCreate()

In [63]:
df = spark.read.csv('data_transaksi.csv', header=True, inferSchema=True)
df.show(10)

+-------+--------------------+-----+----------+-----------------+----------+---------------+
|   Nama|               Email| Umur|Pendapatan|Tanggal Pembelian|    Produk|Nilai Transaksi|
+-------+--------------------+-----+----------+-----------------+----------+---------------+
|Anthony|   dawnday@gmail.com| 24.0|   3553265|       01/13/1974|    Laptop|        2736049|
| Robert|    pbrown@gmail.com| NULL|   5525753|       03-05-2004|    Tablet|        4996241|
|Melissa| ethan99@clayton.com| NULL|   4911936|       28-04-1970|  Handpone|        3169456|
|Timothy|johnsonmegan@gmai...|200.0|3140485039|       03/09/2003|    Laptop|         506622|
|  Kelly|keithwebster(at)e...|200.0|3833611071|       17-01-2022|   Tablett|        9969204|
|  Jared| qwiggins@hodges.com| 69.0|   8024480|       02-01-2005|   Tablett|        3093351|
| Nicole|ogonzalez@hotmail...|200.0|4286346096|       08-01-2024| Handphone|        3007158|
|  Mason|lisasmith@hotmail...| NULL|   9858877|       10/12/1994|Smart

In [64]:
df.printSchema()
df.describe().show()
df.columns

root
 |-- Nama: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Umur: double (nullable = true)
 |-- Pendapatan: long (nullable = true)
 |-- Tanggal Pembelian: string (nullable = true)
 |-- Produk: string (nullable = true)
 |-- Nilai Transaksi: integer (nullable = true)

+-------+------+-------------------+------------------+--------------------+-----------------+---------+------------------+
|summary|  Nama|              Email|              Umur|          Pendapatan|Tanggal Pembelian|   Produk|   Nilai Transaksi|
+-------+------+-------------------+------------------+--------------------+-----------------+---------+------------------+
|  count|   100|                100|                68|                 100|              100|      100|               100|
|   mean|  NULL|               NULL|111.86764705882354|     1.65132351882E9|             NULL|     NULL|        5556026.09|
| stddev|  NULL|               NULL| 77.42110175763524|1.8103144429023266E9|             N

['Nama',
 'Email',
 'Umur',
 'Pendapatan',
 'Tanggal Pembelian',
 'Produk',
 'Nilai Transaksi']

In [65]:
df_clean = df.dropDuplicates()

In [66]:
df_clean = df_clean.na.drop()

In [67]:
df_filled = df.fillna({'Nama': 'Unknown', 'Email': 'Null', 'Umur': 0, 'Pendapatan': 0, 'Tanggal Pembelian': 'Null', 'Produk': 'Null', 'Nilai Transaksi': 0})

In [68]:
from pyspark.sql.functions import upper
df_clean = df_clean.withColumn('Nama', upper(df_clean['Nama']))

In [69]:
from pyspark.sql.functions import col
df_clean = df_clean.withColumn('Umur',col('Umur').cast('int'))

In [70]:
from pyspark.sql.functions import date_format

df_clean = df_clean.withColumn("Tanggal Pembelian",
                               date_format("Tanggal Pembelian", "dd-MM-yyyy"))


In [71]:
df.show(10)

+-------+--------------------+-----+----------+-----------------+----------+---------------+
|   Nama|               Email| Umur|Pendapatan|Tanggal Pembelian|    Produk|Nilai Transaksi|
+-------+--------------------+-----+----------+-----------------+----------+---------------+
|Anthony|   dawnday@gmail.com| 24.0|   3553265|       01/13/1974|    Laptop|        2736049|
| Robert|    pbrown@gmail.com| NULL|   5525753|       03-05-2004|    Tablet|        4996241|
|Melissa| ethan99@clayton.com| NULL|   4911936|       28-04-1970|  Handpone|        3169456|
|Timothy|johnsonmegan@gmai...|200.0|3140485039|       03/09/2003|    Laptop|         506622|
|  Kelly|keithwebster(at)e...|200.0|3833611071|       17-01-2022|   Tablett|        9969204|
|  Jared| qwiggins@hodges.com| 69.0|   8024480|       02-01-2005|   Tablett|        3093351|
| Nicole|ogonzalez@hotmail...|200.0|4286346096|       08-01-2024| Handphone|        3007158|
|  Mason|lisasmith@hotmail...| NULL|   9858877|       10/12/1994|Smart

In [72]:
from pyspark.sql.functions import regexp_replace
df = df.withColumn("Email", regexp_replace("Email", r"\(at\)", "@"))
df = df.withColumn("Tanggal Pembelian", regexp_replace("Tanggal Pembelian", "/", "-"))

In [73]:
df = df.drop("Pendapatan", "Nilai Transaksi")

In [74]:
df.show(100)

+----------+--------------------+-----+-----------------+----------+
|      Nama|               Email| Umur|Tanggal Pembelian|    Produk|
+----------+--------------------+-----+-----------------+----------+
|   Anthony|   dawnday@gmail.com| 24.0|       01-13-1974|    Laptop|
|    Robert|    pbrown@gmail.com| NULL|       03-05-2004|    Tablet|
|   Melissa| ethan99@clayton.com| NULL|       28-04-1970|  Handpone|
|   Timothy|johnsonmegan@gmai...|200.0|       03-09-2003|    Laptop|
|     Kelly|keithwebster@emai...|200.0|       17-01-2022|   Tablett|
|     Jared| qwiggins@hodges.com| 69.0|       02-01-2005|   Tablett|
|    Nicole|ogonzalez@hotmail...|200.0|       08-01-2024| Handphone|
|     Mason|lisasmith@hotmail...| NULL|       10-12-1994|Smartwatch|
|    Pamela|megancosta@yahoo.com| 39.0|       16-08-2011|    Laptop|
|     Chloe|kelligonzalez@hot...| NULL|       10-02-2008| Smartwach|
|    Jeanne|    elewis@gmail.com|200.0|       17-06-2004| Handphone|
|      Lisa|baileyvickie@hotm...| 