In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, broadcast, when
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType, TimestampType

In [2]:
# Create the Spark session used by every cell below (getOrCreate() reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("data_processing")
    # NOTE(review): this looks like a documentation placeholder option — confirm whether it is needed.
    .config("spark.some.config.option", "config-value")
    .getOrCreate()
)

In [3]:
# Load the preprocessed dataset (Parquet files) and preview its first rows.
df = spark.read.format("parquet").load("/home/jovyan/work/data/preprocessed/")
df.show()

+----+-----+-------+---+------+---------+--------------+----+--------+
|hour|calls|seconds|sms|region|id_source|id_destination|  id|    date|
+----+-----+-------+---+------+---------+--------------+----+--------+
|  22|    1|     75|  0|     9|      731|           7F9|NULL|20211001|
|  23|    3|    453|  0|     9|      731|           7F9|NULL|20211001|
|  20|    1|    172|  0|     9|      731|           7F9|NULL|20211001|
|  15|    2|    151|  0|     9|      731|           7F9|NULL|20211001|
|  16|    1|    338|  0|     9|      731|           7F9|NULL|20211001|
|  18|    1|     78|  0|     9|      FCE|           356|NULL|20211001|
|  13|    1|     41|  0|     9|      0D4|           356|NULL|20211001|
|  11|    1|     75|  0|     9|      449|           356|NULL|20211001|
|  14|    2|     62|  0|     8|      7A5|           17A|NULL|20211001|
|   9|    1|    128|  0|     8|      557|           17A|NULL|20211001|
|  11|    2|     73|  0|     8|      4DF|           67C|NULL|20211001|
|  15|

In [4]:
# Derive the per-region SMS quota column "cuota_sms_region":
#   - 0    when "id" is present (non-null)
#   - 1.5  for regions 1 through 5
#   - 2    for regions 6 through 9
#   - null otherwise (a when-chain without otherwise() yields null for unmatched rows)
# NOTE(review): "region" is a string column at this point, so the numeric range
# checks rely on Spark's implicit string-to-number cast — confirm this is intended.
df = df.withColumn(
    "cuota_sms_region",
    when(F.col("id").isNotNull(), 0)
    .when(F.col("region").between(1, 5), 1.5)
    .when(F.col("region").between(6, 9), 2),
)

# "id" is only needed to compute the quota; drop it for the downstream steps.
df_drop_idcol = df.drop("id")

# Display the DataFrame summary as the cell output. The previous df.unpersist()
# call was removed: df is never cached or persisted in this notebook, so that
# call did nothing except echo this same DataFrame repr.
df

DataFrame[hour: string, calls: string, seconds: string, sms: string, region: string, id_source: string, id_destination: string, id: string, date: int, cuota_sms_region: double]

In [5]:
# Preview the DataFrame after the "id" column has been dropped.
df_drop_idcol.show()

+----+-----+-------+---+------+---------+--------------+--------+----------------+
|hour|calls|seconds|sms|region|id_source|id_destination|    date|cuota_sms_region|
+----+-----+-------+---+------+---------+--------------+--------+----------------+
|  22|    1|     75|  0|     9|      731|           7F9|20211001|             2.0|
|  23|    3|    453|  0|     9|      731|           7F9|20211001|             2.0|
|  20|    1|    172|  0|     9|      731|           7F9|20211001|             2.0|
|  15|    2|    151|  0|     9|      731|           7F9|20211001|             2.0|
|  16|    1|    338|  0|     9|      731|           7F9|20211001|             2.0|
|  18|    1|     78|  0|     9|      FCE|           356|20211001|             2.0|
|  13|    1|     41|  0|     9|      0D4|           356|20211001|             2.0|
|  11|    1|     75|  0|     9|      449|           356|20211001|             2.0|
|  14|    2|     62|  0|     8|      7A5|           17A|20211001|             2.0|
|   

In [6]:
# Changing data types

In [7]:
# Cast the string columns read from Parquet to their proper types.
df_cast = df_drop_idcol

# Counters and codes that should be integers.
for int_column in ("hour", "calls", "seconds", "sms", "region"):
    df_cast = df_cast.withColumn(int_column, F.col(int_column).cast(IntegerType()))

# "date" is stored as an integer such as 20211001: go through a string so it
# can be parsed with the "yyyyMMdd" pattern into a real date.
date_as_text = F.col("date").cast(StringType())
df_cast = df_cast.withColumn("date", F.to_date(date_as_text, "yyyyMMdd"))

# Store the quota as a float.
df_cast = df_cast.withColumn(
    "cuota_sms_region", F.col("cuota_sms_region").cast(FloatType())
)

In [8]:
# Print the schema to check the column types after casting.
df_cast.printSchema()

root
 |-- hour: integer (nullable = true)
 |-- calls: integer (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- sms: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- id_source: string (nullable = true)
 |-- id_destination: string (nullable = true)
 |-- date: date (nullable = true)
 |-- cuota_sms_region: float (nullable = true)



In [9]:
# Show the first 3 rows of the typed DataFrame.
df_cast.show(3)

+----+-----+-------+---+------+---------+--------------+----------+----------------+
|hour|calls|seconds|sms|region|id_source|id_destination|      date|cuota_sms_region|
+----+-----+-------+---+------+---------+--------------+----------+----------------+
|  22|    1|     75|  0|     9|      731|           7F9|2021-10-01|             2.0|
|  23|    3|    453|  0|     9|      731|           7F9|2021-10-01|             2.0|
|  20|    1|    172|  0|     9|      731|           7F9|2021-10-01|             2.0|
+----+-----+-------+---+------+---------+--------------+----------+----------------+
only showing top 3 rows



In [10]:
# Write the typed DataFrame as Parquet, partitioned by "date", using
# write mode "overwrite".
(
    df_cast.write
    .partitionBy("date")
    .mode("overwrite")
    .parquet("/home/jovyan/work/data/processed/")
)

In [11]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()