In [1]:
# EXPLORACIÓN - SMS FACTURACIÓN
# Este notebook solo contiene un breve análisis exploratorio de datos.

In [2]:
# Environment setup: create the Spark session used by every cell below,
# or reuse one that is already running.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("SMS Facturacion - Exploracion")
    .getOrCreate()
)

In [4]:
# Load the datasets.
#
# Explicit DDL schemas are used instead of inferSchema=True. Inference needs an
# extra full pass over each file, and it lets the column types depend on the
# data: identifiers such as "025" would be read as integers (losing the leading
# zero) if a file happened to contain only numeric ids. The types below are the
# ones Spark inferred for these files (see the printSchema() output further
# down), so the resulting DataFrames keep the same schema.
#
# NOTE: with an explicit schema the columns are matched by position, so the
# field order must follow the column order of the CSV header.

path_eventos = "/home/jovyan/data/events.csv.gz"
path_free_sms = "/home/jovyan/data/free_sms_destinations.csv.gz"

schema_eventos = (
    "hour INT, calls INT, seconds INT, sms INT, date INT, region INT, "
    "id_source STRING, id_destination STRING"
)
schema_free_sms = "id STRING"

eventos_df = spark.read.csv(path_eventos, header=True, schema=schema_eventos)
free_sms_df = spark.read.csv(path_free_sms, header=True, schema=schema_free_sms)

In [6]:
# Quick look at the first rows of each dataset to see how they are laid out.
eventos_df.show(n=5)
free_sms_df.show(n=5)

+----+-----+-------+---+--------+------+---------+--------------+
|hour|calls|seconds|sms|    date|region|id_source|id_destination|
+----+-----+-------+---+--------+------+---------+--------------+
|  11|    1|     24|  0|20211001|     5|      BF3|           374|
|   1|    1|     51|  0|20211001|     4|      9F5|           374|
|  11|    1|      3|  0|20211001|     6|      025|           374|
|  10|    1|     36|  0|20211001|     5|      FB6|           D52|
|  23|    4|    137|  0|20211001|     8|      4BB|           861|
+----+-----+-------+---+--------+------+---------+--------------+
only showing top 5 rows

+---+
| id|
+---+
|374|
|D52|
|861|
|5B0|
|4CA|
+---+
only showing top 5 rows



In [7]:
# Print the column names and types of both DataFrames, each under its own title.
for titulo, frame in (
    ("Esquema de eventos_df:", eventos_df),
    ("Esquema de free_sms_df:", free_sms_df),
):
    print(titulo)
    frame.printSchema()

Esquema de eventos_df:
root
 |-- hour: integer (nullable = true)
 |-- calls: integer (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- sms: integer (nullable = true)
 |-- date: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- id_source: string (nullable = true)
 |-- id_destination: string (nullable = true)

Esquema de free_sms_df:
root
 |-- id: string (nullable = true)



In [8]:
# Count missing values (NULL, plus NaN where applicable) for every column.

from pyspark.sql.functions import col, count, when, isnan


def _missing_count(name, dtype):
    """Build an aggregate that counts the missing values of one column.

    Args:
        name: column name.
        dtype: Spark type name of the column, as reported by ``DataFrame.dtypes``.

    Returns:
        A Column expression, aliased to ``name``, counting rows where the
        column is NULL or, for floating-point columns only, NaN.
    """
    missing = col(name).isNull()
    # NaN only exists for float/double columns. Calling isnan() on a string
    # column forces an implicit string-to-double cast of values such as "BF3",
    # which can fail when ANSI SQL mode is enabled, so the check is skipped
    # for every other type.
    if dtype in ("float", "double"):
        missing = missing | isnan(col(name))
    # when() yields a non-null literal only for missing rows; count() ignores
    # the NULLs produced for all other rows.
    return count(when(missing, name)).alias(name)


eventos_df.select([
    _missing_count(name, dtype)
    for name, dtype in eventos_df.dtypes
]).show()

+----+-----+-------+---+----+------+---------+--------------+
|hour|calls|seconds|sms|date|region|id_source|id_destination|
+----+-----+-------+---+----+------+---------+--------------+
|   0|    0|      0|  0|   0|     0|       18|            15|
+----+-----+-------+---+----+------+---------+--------------+



In [9]:
# Show, without truncating cell values, every event row whose source or
# destination identifier is missing.
eventos_df.where(
    "id_source IS NULL OR id_destination IS NULL"
).show(truncate=False)

+----+-----+-------+---+--------+------+---------+--------------+
|hour|calls|seconds|sms|date    |region|id_source|id_destination|
+----+-----+-------+---+--------+------+---------+--------------+
|18  |1    |1299   |0  |20211001|9     |250      |NULL          |
|19  |1    |52     |0  |20211001|7     |NULL     |392           |
|16  |2    |75     |0  |20211001|7     |CC5      |NULL          |
|12  |1    |320    |0  |20211001|7     |NULL     |6E7           |
|15  |1    |612    |0  |20211001|7     |3E4      |NULL          |
|19  |1    |63     |0  |20211001|7     |NULL     |E21           |
|20  |1    |116    |0  |20211001|7     |A84      |NULL          |
|12  |1    |59     |0  |20211001|7     |NULL     |6DE           |
|19  |1    |50     |0  |20211001|2     |8DE      |NULL          |
|18  |2    |146    |0  |20211001|9     |NULL     |NULL          |
|13  |1    |23     |0  |20211001|4     |F06      |NULL          |
|10  |1    |12     |0  |20211001|4     |NULL     |FC5           |
|14  |1   