In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, min, max, when, udf, sum
from pyspark.sql.types import IntegerType

# Create (or reuse) a Spark session running on the "local" master under the
# application name "PersonasDataFrame".
spark = (
    SparkSession.builder
    .master("local")
    .appName("PersonasDataFrame")
    .getOrCreate()
)

In [2]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["Nombre", "Edad", "Ciudad"]

# One tuple per person: (name, age, city).
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "Los Angeles"),
    ("Charlie", 22, "Chicago"),
]

# Build the base DataFrame used by the rest of the notebook and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+----+-----------+
| Nombre|Edad|     Ciudad|
+-------+----+-----------+
|  Alice|  25|   New York|
|    Bob|  30|Los Angeles|
|Charlie|  22|    Chicago|
+-------+----+-----------+



In [3]:
# Project the DataFrame down to the "Nombre" column and print it.
(
    df
    .select("Nombre")
    .show()
)

+-------+
| Nombre|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [4]:
# Keep only the rows whose "Edad" is at least 25 and print them.
(
    df
    .filter(col("Edad") >= 25)
    .show()
)

+------+----+-----------+
|Nombre|Edad|     Ciudad|
+------+----+-----------+
| Alice|  25|   New York|
|   Bob|  30|Los Angeles|
+------+----+-----------+



In [5]:
from pyspark.sql.functions import lit

# Add a "Pais" column holding the same literal value, "USA", on every row.
df_with_country = df.withColumn(
    "Pais",
    lit("USA"),
)
df_with_country.show()

+-------+----+-----------+----+
| Nombre|Edad|     Ciudad|Pais|
+-------+----+-----------+----+
|  Alice|  25|   New York| USA|
|    Bob|  30|Los Angeles| USA|
|Charlie|  22|    Chicago| USA|
+-------+----+-----------+----+



In [6]:
# Aggregate the whole DataFrame into a single row: the average of "Edad".
(
    df
    .agg(avg("Edad"))
    .show()
)

+------------------+
|         avg(Edad)|
+------------------+
|25.666666666666668|
+------------------+



In [7]:
# Print the rows sorted by "Edad", highest value first.
df.orderBy("Edad", ascending=False).show()

+-------+----+-----------+
| Nombre|Edad|     Ciudad|
+-------+----+-----------+
|    Bob|  30|Los Angeles|
|  Alice|  25|   New York|
|Charlie|  22|    Chicago|
+-------+----+-----------+



In [8]:
# Count the "Nombre" values per "Ciudad" and show the count as
# "CantidadPersonas".
(
    df
    .groupBy("Ciudad")
    .agg(count("Nombre").alias("CantidadPersonas"))
    .show()
)

+-----------+----------------+
|     Ciudad|CantidadPersonas|
+-----------+----------------+
|Los Angeles|               1|
|    Chicago|               1|
|   New York|               1|
+-----------+----------------+



In [9]:
# Same rows as `df`, with the "Nombre" column renamed to "NombreCompleto".
df_renamed = df.withColumnRenamed(
    existing="Nombre",
    new="NombreCompleto",
)
df_renamed.show()

+--------------+----+-----------+
|NombreCompleto|Edad|     Ciudad|
+--------------+----+-----------+
|         Alice|  25|   New York|
|           Bob|  30|Los Angeles|
|       Charlie|  22|    Chicago|
+--------------+----+-----------+



In [10]:
# Derive a DataFrame without the "Edad" column; `df` itself is not reassigned.
df_without_age = (
    df
    .drop("Edad")
)
df_without_age.show()

+-------+-----------+
| Nombre|     Ciudad|
+-------+-----------+
|  Alice|   New York|
|    Bob|Los Angeles|
|Charlie|    Chicago|
+-------+-----------+



In [11]:
# Register `df` under the view name "personas" so it can be queried with SQL.
df.createOrReplaceTempView("personas")

# Select every row of the view whose "Edad" is greater than 20.
result = spark.sql(
    "SELECT * FROM personas WHERE Edad > 20"
)
result.show()

+-------+----+-----------+
| Nombre|Edad|     Ciudad|
+-------+----+-----------+
|  Alice|  25|   New York|
|    Bob|  30|Los Angeles|
|Charlie|  22|    Chicago|
+-------+----+-----------+



In [12]:
# Total of the "Edad" column.
# NOTE: `sum` here is the function imported from pyspark.sql.functions at the
# top of the notebook, not Python's builtin `sum`.
(
    df
    .select("Edad")
    .agg(sum("Edad"))
    .show()
)

+---------+
|sum(Edad)|
+---------+
|       77|
+---------+



In [13]:
# Smallest and largest "Edad" in a single aggregation.
# NOTE: `min` and `max` are the pyspark.sql.functions versions imported at the
# top of the notebook, not Python's builtins.
(
    df
    .agg(
        min("Edad"),
        max("Edad"),
    )
    .show()
)

+---------+---------+
|min(Edad)|max(Edad)|
+---------+---------+
|       22|       30|
+---------+---------+



In [14]:
# Build each condition separately, then require both to hold for a row.
is_chicago = col("Ciudad") == "Chicago"
is_under_30 = col("Edad") < 30

df.filter(is_chicago & is_under_30).show()

+-------+----+-------+
| Nombre|Edad| Ciudad|
+-------+----+-------+
|Charlie|  22|Chicago|
+-------+----+-------+



In [15]:
# Add "EdadDuplicada": the "Edad" value multiplied by 2.
df_with_age_double = df.withColumn(
    "EdadDuplicada",
    col("Edad") * 2,
)
df_with_age_double.show()


+-------+----+-----------+-------------+
| Nombre|Edad|     Ciudad|EdadDuplicada|
+-------+----+-----------+-------------+
|  Alice|  25|   New York|           50|
|    Bob|  30|Los Angeles|           60|
|Charlie|  22|    Chicago|           44|
+-------+----+-----------+-------------+



In [16]:
# Add "EdadEnMeses": the "Edad" value multiplied by 12.
df_with_age_in_months = df.withColumn(
    "EdadEnMeses",
    col("Edad") * 12,
)
df_with_age_in_months.show()

+-------+----+-----------+-----------+
| Nombre|Edad|     Ciudad|EdadEnMeses|
+-------+----+-----------+-----------+
|  Alice|  25|   New York|        300|
|    Bob|  30|Los Angeles|        360|
|Charlie|  22|    Chicago|        264|
+-------+----+-----------+-----------+



In [17]:
# Number of rows in `df`; left as the cell's last expression so the notebook
# displays the value.
df.count()

3

In [18]:
# Keep only the rows whose "Edad" is even (remainder 0 when divided by 2).
(
    df
    .filter(col("Edad") % 2 == 0)
    .show()
)

+-------+----+-----------+
| Nombre|Edad|     Ciudad|
+-------+----+-----------+
|    Bob|  30|Los Angeles|
|Charlie|  22|    Chicago|
+-------+----+-----------+



In [19]:
# StringType is a data type, so import it from pyspark.sql.types (the same
# module the notebook already uses for IntegerType) rather than relying on
# pyspark.sql.functions happening to expose it.
from pyspark.sql.types import StringType

def edad_categoria(edad):
    """Return the label of the age bracket that ``edad`` falls in.

    The brackets are contiguous, so every number maps to exactly one label:

    * ``"0-20"``  -- ``edad`` up to and including 20
    * ``"21-40"`` -- above 20, up to and including 40
    * ``"41-60"`` -- above 40, up to and including 60
    * ``"61+"``   -- anything above 60

    Args:
        edad: Age as a number, or ``None`` when the age is missing.

    Returns:
        The bracket label as a string, or ``None`` when ``edad`` is ``None``.
    """
    # A missing age has no bracket. Returning None here avoids the TypeError
    # that comparing None with an int would raise.
    if edad is None:
        return None

    # Each check only needs an upper bound because the previous checks have
    # already excluded lower values. This keeps fractional ages such as 20.5
    # or 40.5 in the right bracket instead of falling through to "61+".
    if edad <= 20:
        return "0-20"
    if edad <= 40:
        return "21-40"
    if edad <= 60:
        return "41-60"
    return "61+"

# Wrap the Python function as a Spark UDF whose declared return type is string.
edad_categoria_udf = udf(f=edad_categoria, returnType=StringType())

# Label every row with its age bracket in a new "RangoEdad" column.
df_with_category = df.withColumn(
    "RangoEdad",
    edad_categoria_udf(col("Edad")),
)

# Count the "Nombre" values per bracket and show the count as "Cantidad".
(
    df_with_category
    .groupBy("RangoEdad")
    .agg(count("Nombre").alias("Cantidad"))
    .show()
)

+---------+--------+
|RangoEdad|Cantidad|
+---------+--------+
|    21-40|       3|
+---------+--------+



In [20]:
# Count how many rows share each "Nombre" and show the count as "Cantidad".
(
    df
    .groupBy("Nombre")
    .agg(count("Nombre").alias("Cantidad"))
    .show()
)

+-------+--------+
| Nombre|Cantidad|
+-------+--------+
|Charlie|       1|
|    Bob|       1|
|  Alice|       1|
+-------+--------+



In [21]:
from pyspark.sql.functions import concat_ws

# Add "InformacionPersonal": "Nombre" and "Ciudad" joined with " - ".
df_with_info = df.withColumn(
    "InformacionPersonal",
    concat_ws(" - ", col("Nombre"), col("Ciudad")),
)
df_with_info.show()

+-------+----+-----------+-------------------+
| Nombre|Edad|     Ciudad|InformacionPersonal|
+-------+----+-----------+-------------------+
|  Alice|  25|   New York|   Alice - New York|
|    Bob|  30|Los Angeles|  Bob - Los Angeles|
|Charlie|  22|    Chicago|  Charlie - Chicago|
+-------+----+-----------+-------------------+

