In [1]:
# -------------------------------------------
# Importar librerías necesarias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, desc
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType
import pyspark.sql.functions as F

# -------------------------------------------
# Crear sesión de Spark 
try:
    spark = SparkSession.builder.getOrCreate()
except:
    spark = SparkSession.builder \
        .appName("Exploracion_Completa_Dataset") \
        .config("spark.sql.shuffle.partitions", "100") \
        .config("spark.executor.memory", "4g") \
        .config("spark.driver.memory", "4g") \
        .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# -------------------------------------------
# Cargar el dataset
nombre_archivo = "Books_data.csv"  
df = spark.read.csv(nombre_archivo, header=True, inferSchema=True)

# Contar registros totales
num_total = df.count()

# -------------------------------------------
# Estadísticas Generales del Dataset

# Número de columnas
num_columnas = len(df.columns)
print(f"Número de columnas: {num_columnas}")

# Número de registros
print(f"Número de registros: {num_total}")

# Tipos de datos de cada columna
print("\nTipos de datos de cada columna:")
df.printSchema()

# Registros con al menos un valor faltante
condiciones_nulos = [col(c).isNull() | (col(c) == "") for c in df.columns]
df_nulos = df.withColumn("nulos", sum([when(cond, 1).otherwise(0) for cond in condiciones_nulos]))
total_registros_con_nulos = df_nulos.filter(col("nulos") > 0).count()
print(f"\nNúmero de registros con al menos un valor faltante: {total_registros_con_nulos}")

# Columnas con valores faltantes
# Columnas numéricas
numeric_columns = [field.name for field in df.schema.fields if isinstance(field.dataType, (DoubleType, FloatType))]

# Conteo de nulos
expressions = []
for c in df.columns:
    if c in numeric_columns:
        expressions.append(count(when(col(c).isNull() | isnan(c), c)).alias(c))
    else:
        expressions.append(count(when(col(c).isNull() | (col(c) == ""), c)).alias(c))

missing_counts = df.select(expressions).collect()[0].asDict()

# Filtrar columnas con nulos
columnas_con_faltantes = [columna for columna, nulos in missing_counts.items() if nulos > 0]

print("\nColumnas con valores faltantes:")
for columna in columnas_con_faltantes:
    print(f"- {columna}")

# -------------------------------------------
# Análisis detallado por columna
def analizar_columna(df, columna, num_total):
    print(f"\n--- Columna: {columna} ---")
    col_data = df.select(columna)
    
    # Valores nulos
    if col_data.schema.fields[0].dataType in [DoubleType(), FloatType()]:
        null_count = col_data.filter(col(columna).isNull() | isnan(col(columna))).count()
    else:
        null_count = col_data.filter(col(columna).isNull() | (col(columna) == "")).count()
    
    valid_count = num_total - null_count
    percent_valid = (valid_count / num_total) * 100
    percent_missing = (null_count / num_total) * 100
    
    # Valores únicos
    unique_count = col_data.distinct().count()
    
    # Valor más común
    most_common_row = col_data.groupBy(columna).count().orderBy(desc("count")).first()
    if most_common_row and most_common_row[0] is not None:
        most_common_value = most_common_row[0]
        most_common_percent = (most_common_row[1] / num_total) * 100
    else:
        most_common_value = None
        most_common_percent = 0
    
    # Mostrar resultados básicos
    print(f"Valid: {valid_count:,} ({percent_valid:.2f}%)")
    print(f"Missing: {null_count:,} ({percent_missing:.2f}%)")
    print(f"Unique values: {unique_count:,}")
    print(f"Most Common: {most_common_value} ({most_common_percent:.2f}%)")
    print(f"Mismatched: 0 (0%)")  # No hacemos validaciones externas

    # Estadísticas para valores numericos 
    tipo = col_data.schema.fields[0].dataType
    if isinstance(tipo, (DoubleType, FloatType, IntegerType, LongType)):
        stats = col_data.describe().toPandas().set_index('summary')
        print("\nEstadísticas numéricas:")
        for metric in ["mean", "stddev", "min", "max"]:
            value = stats.loc[metric, columna]
            print(f"{metric.capitalize()}: {value}")

        # Quartiles 
        approx = col_data.approxQuantile(columna, [0.25, 0.5, 0.75], 0.01)
        print(f"25%: {approx[0]}")
        print(f"50% (Median): {approx[1]}")
        print(f"75%: {approx[2]}")

# -------------------------------------------
# Aplicar análisis a todas las columnas
for col_name in df.columns:
    analizar_columna(df, col_name, num_total)



25/04/27 19:57:37 WARN Utils: Your hostname, MacBook-Pro-de-Juan.local resolves to a loopback address: 127.0.0.1; using 192.168.100.16 instead (on interface en0)
25/04/27 19:57:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/27 19:57:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Número de columnas: 10
Número de registros: 212404

Tipos de datos de cada columna:
root
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- image: string (nullable = true)
 |-- previewLink: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- infoLink: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- ratingsCount: string (nullable = true)



                                                                                


Número de registros con al menos un valor faltante: 158580

Columnas con valores faltantes:
- Title
- description
- authors
- image
- previewLink
- publisher
- publishedDate
- infoLink
- categories
- ratingsCount

--- Columna: Title ---


                                                                                

Valid: 212,403 (100.00%)
Missing: 1 (0.00%)
Unique values: 212,400
Most Common: """Mom (0.00%)
Mismatched: 0 (0%)

--- Columna: description ---


                                                                                

Valid: 144,047 (67.82%)
Missing: 68,357 (32.18%)
Unique values: 133,257
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: authors ---
Valid: 181,153 (85.29%)
Missing: 31,251 (14.71%)
Unique values: 133,019
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: image ---
Valid: 161,213 (75.90%)
Missing: 51,191 (24.10%)
Unique values: 149,421
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: previewLink ---


                                                                                

Valid: 188,349 (88.67%)
Missing: 24,055 (11.33%)
Unique values: 186,014
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: publisher ---
Valid: 139,274 (65.57%)
Missing: 73,130 (34.43%)
Unique values: 34,265
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: publishedDate ---
Valid: 186,560 (87.83%)
Missing: 25,844 (12.17%)
Unique values: 28,948
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: infoLink ---


                                                                                

Valid: 188,103 (88.56%)
Missing: 24,301 (11.44%)
Unique values: 180,644
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: categories ---


                                                                                

Valid: 171,880 (80.92%)
Missing: 40,524 (19.08%)
Unique values: 28,362
Most Common: None (0.00%)
Mismatched: 0 (0%)

--- Columna: ratingsCount ---
Valid: 63,852 (30.06%)
Missing: 148,552 (69.94%)
Unique values: 16,102
Most Common: None (0.00%)
Mismatched: 0 (0%)
