In [8]:
# Instalar Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Descargar e instalar Spark (enlace actualizado)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

# Instalar pyspark
!pip install -q pyspark

import os

# Environment variables telling PySpark where the JDK and the Spark
# distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

from pyspark.sql import SparkSession

# Start (or reuse) a Spark session with the local[*] master.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Sanity check that the session is up. The value is printed explicitly because
# a bare expression in the middle of a cell is evaluated but never displayed.
print(f"Spark version: {spark.version}")

# Mount Google Drive so the dataset stored there is reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Read the podcast chart CSV: the first row holds the column names and the
# column types are inferred from the data.
file_path = '/content/drive/My Drive/DM_DCCF/top_podcasts.csv'
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(file_path)
# Each filter below is applied independently to the full DataFrame `df`;
# they are not chained.

# Episodes flagged as explicit.
df_explicit = df.filter(df['explicit'] == True)
df_explicit.show()

# Episodes charting in the 'mx' region.
df_mx = df.filter(df['region'] == 'mx')
df_mx.show()

# Episodes released after 2024-01-01. The cutoff is a fixed date, not a
# rolling "last year" window, so update it if the notebook is re-run later.
df_recent = df.filter(df['release_date'] > '2024-01-01')
df_recent.show()

# Keep the original name bound to the last filter result, as before, so any
# later cell that reads `df_filtered` keeps working.
df_filtered = df_recent

# Count the episodes whose rank is below 100. The result is printed because a
# bare expression in the middle of a cell is evaluated but never displayed.
n_rank_below_100 = df.filter(df['rank'] < 100).count()
print(f"Episodes with rank < 100: {n_rank_below_100}")

# Add a boolean column `long_episode`: True when the episode lasts more than
# 30 minutes (1,800,000 ms), False otherwise. Rows where the comparison is not
# true (including a null duration) fall into the `otherwise` branch.
from pyspark.sql import functions as F

is_long_episode = F.when(F.col('duration_ms') > 1800000, True).otherwise(False)
df = df.withColumn('long_episode', is_long_episode)
df.show()


# Mean and maximum rank per region.
rank_stats_by_region = df.groupBy('region').agg(
    F.avg('rank').alias('mean_rank'),
    F.max('rank').alias('max_rank'),
)
rank_stats_by_region.show()

# Frequency distribution: number of rows per region, most frequent first.
# NOTE(review): the original comment said "by category", but the grouping key
# is `region` -- confirm whether a category column was intended.
rows_per_region = df.groupBy('region').count()
rows_per_region.orderBy(F.desc('count')).show()




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+----------+----+------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+--------------------+-----------+--------+---------+------------+----------------------+---------------+-------------+--------------------+---------------------+-------------------------+--------------+---------------+-------------------+---------+--------------------+
|      date|rank|region|chartRankMove|          episodeUri|             showUri|         episodeName|         description|           show.name|    show.description|      show.publisher|duration_ms|explicit|is_externally_hosted|is_playable|language|languages|release_date|release_date_precision|show.copyrights|show.explicit|           show.href|show.html_description|show.is_externally_hosted|sh