In [12]:
# Environment setup: Spark session, Google Drive mount and dataset load.
from google.colab import drive
from pyspark.sql import SparkSession

# getOrCreate() reuses the active session when there is one, so re-running
# this cell does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName("Top Podcasts Analysis")
    .getOrCreate()
)

# Colab only: expose Google Drive under /content/drive so the CSV is readable.
drive.mount('/content/drive')

# The first CSV line is the header; column types are inferred from the data,
# so the type of each column depends on the file's contents.
file_path = '/content/drive/My Drive/DM_DCCF/top_podcasts.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Row-oriented view of the DataFrame, used by the RDD operations that follow.
rdd = df.rdd

# RDD operations:
# descriptive statistics for podcast episode duration (milliseconds).


def _parse_duration_ms(value):
    """Return ``value`` as a non-negative int of milliseconds, or None.

    The DataFrame is read with schema inference, so ``duration_ms`` may hold
    either integers or text depending on the file's contents; both are
    accepted. Missing, negative and non-numeric values yield None.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value if value >= 0 else None
    # isdecimal() (not isdigit()) matches exactly what int() can parse;
    # isdigit() also accepts characters such as superscript digits.
    if isinstance(value, str) and value.isdecimal():
        return int(value)
    return None


def _add_duration(summary, duration):
    """Fold one duration into a (count, total, minimum, maximum) summary."""
    seen, total, lowest, highest = summary
    if seen == 0:
        return (1, duration, duration, duration)
    return (seen + 1, total + duration, min(lowest, duration), max(highest, duration))


def _merge_summaries(left, right):
    """Combine two per-partition (count, total, minimum, maximum) summaries."""
    if left[0] == 0:
        return right
    if right[0] == 0:
        return left
    return (
        left[0] + right[0],
        left[1] + right[1],
        min(left[2], right[2]),
        max(left[3], right[3]),
    )


duration_rdd = (
    rdd.map(lambda row: _parse_duration_ms(row["duration_ms"]))
       .filter(lambda duration: duration is not None)
)

# A single pass computes every statistic; separate count/reduce/min/max
# actions would each rescan and reparse the uncached source data.
count, total_duration, min_duration, max_duration = duration_rdd.aggregate(
    (0, 0, None, None), _add_duration, _merge_summaries
)
# With no valid durations the mean is undefined; report None instead of
# dividing by zero.
mean_duration = total_duration / count if count else None

print(f"Count: {count}")
print(f"Mean Duration: {mean_duration}")
print(f"Min Duration: {min_duration}")
print(f"Max Duration: {max_duration}")

# Count explicit and non-explicit episodes.
# The accepted spellings cover boolean, integer and text encodings of the flag;
# any other value (including a missing one) is counted in neither group.
_EXPLICIT_VALUES = (True, "True", "true", 1, "1")
_NON_EXPLICIT_VALUES = (False, "False", "false", 0, "0")


def _classify_explicit(value):
    """Map a raw ``explicit`` cell to True, False, or None if unrecognised."""
    if value in _EXPLICIT_VALUES:
        return True
    if value in _NON_EXPLICIT_VALUES:
        return False
    return None


# One pass over the data: classify every row, then tally the three outcomes
# on the driver instead of running one filter + count scan per class.
explicit_tally = rdd.map(lambda row: _classify_explicit(row["explicit"])).countByValue()
explicit_count = explicit_tally.get(True, 0)
non_explicit_count = explicit_tally.get(False, 0)

print(f"Explicit Episodes: {explicit_count}")
print(f"Non-Explicit Episodes: {non_explicit_count}")

# Episodes per region: emit (region, 1) for every row and sum the ones per key.
region_counts = (
    rdd.map(lambda row: (row["region"], 1))
       .reduceByKey(lambda left, right: left + right)
)

# Print the results. The loop variable is deliberately not named `count`:
# that name already holds the number of valid durations computed earlier in
# this cell, and reusing it here would silently overwrite that value.
for region, region_total in region_counts.collect():
    print(f"Region: {region}, Episode Count: {region_total}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Count: 127616
Mean Duration: 3380827.220473922
Min Duration: 6896
Max Duration: 42349156
Explicit Episodes: 19284
Non-Explicit Episodes: 108332
Region: jp, Episode Count: 6800
Region: it, Episode Count: 6800
Region: id, Episode Count: 6800
Region: gb, Episode Count: 6800
Region: es, Episode Count: 6800
Region: de, Episode Count: 6800
Region: co, Episode Count: 6800
Region: au, Episode Count: 6800
Region: ph, Episode Count: 6800
Region: nz, Episode Count: 6800
Region: mx, Episode Count: 6800
Region: ie, Episode Count: 6800
Region: cl, Episode Count: 6800
Region: br, Episode Count: 6800
Region: ar, Episode Count: 6800
Region: us, Episode Count: 6800
Region: pl, Episode Count: 6800
Region: nl, Episode Count: 6800
Region: in, Episode Count: 6800
Region: fr, Episode Count: 6800
Region: ca, Episode Count: 6800
Region: at, Episode Count: 6800


In [23]:
# Keep only the rows whose region is "mx" and count them.
episodes_mx = rdd.filter(lambda row: row["region"] == "mx")
mx_count = episodes_mx.count()
print(f"Número de episodios en MX: {mx_count}")

# Duration in minutes.
_MS_PER_MINUTE = 60000


def _duration_minutes(raw_ms):
    """Convert a raw ``duration_ms`` cell to minutes.

    Accepts integers as well as digit-only text, because the column type
    depends on schema inference. Missing, zero-as-integer, negative and
    non-numeric values yield the placeholder 0, so a 0 in the result means
    "no usable duration", not necessarily a zero-length episode.
    """
    if isinstance(raw_ms, bool):
        return 0
    if isinstance(raw_ms, int):
        return raw_ms / _MS_PER_MINUTE if raw_ms > 0 else 0
    # isdecimal() matches exactly the strings int() can parse.
    if isinstance(raw_ms, str) and raw_ms.isdecimal():
        return int(raw_ms) / _MS_PER_MINUTE
    return 0


duration_minutes_rdd = rdd.map(lambda row: _duration_minutes(row["duration_ms"]))
print(f"Duración en minutos (primeros 10 episodios): {duration_minutes_rdd.take(10)}")

# Episodes whose language is Spanish ("es").
spanish_episodes = rdd.filter(lambda row: row["language"] == "es")
spanish_count = spanish_episodes.count()
print(f"Número de episodios en español: {spanish_count}")

# Group episodes by region. groupBy is a lazy transformation: nothing is
# computed until an action is called on episodes_by_region.
episodes_by_region = rdd.groupBy(lambda row: row["region"])



Número de episodios en MX: 6800
Duración en minutos (primeros 10 episodios): [164.53508333333335, 187.50575, 180.667, 17.889766666666667, 148.2832, 18.2766, 0, 69.09343333333334, 9.2718, 56.808883333333334]
Número de episodios en español: 18485
