[Reference](https://viv1kv.medium.com/pyspark-or-polars-what-should-you-use-breakdown-of-similarities-and-differences-b261a825b9d6)

In [5]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import polars as pl

# Build two small eager DataFrames that share the "id" key.
data1 = {
    "id": [1, 2, 3, 4],
    "age": [25, 30, 35, 40],
    "salary": [50000, 55000, 60000, 65000],
}
data2 = {
    "id": [1, 2, 3, 4],
    "city": ["New York", "San Francisco", "Los Angeles", "Chicago"],
}

df1_polars = pl.DataFrame(data1)
df2_polars = pl.DataFrame(data2)

# Project -> filter -> rename -> join. Each step returns a new DataFrame,
# so every intermediate stays inspectable.
selected_df = df1_polars.select(["id", "salary"])
# Strict comparison: the row with salary == 50000 is dropped.
filtered_df = selected_df.filter(pl.col("salary") > 50000)
renamed_df = filtered_df.rename({"salary": "income"})
joined_df = renamed_df.join(df2_polars, on="id", how="inner")

# Flag incomes above 60000 with 1, everything else with 0.
# The predicate uses the expression pl.col("income") instead of the eager
# Series joined_df["income"], so it is evaluated against whichever frame
# with_columns is called on and does not capture a materialised column.
conditional_df = joined_df.with_columns(
    pl.when(pl.col("income") > 60000).then(1).otherwise(0).alias("high_income")
)

# Apply UDF
def salary_increase(salary: int, increase: int = 5000) -> int:
    """Return ``salary`` raised by a flat amount.

    Args:
        salary: Current salary.
        increase: Flat amount to add. Defaults to 5000, the value that was
            previously hard-coded, so one-argument callers are unaffected.

    Returns:
        ``salary + increase``.
    """
    return salary + increase

# Apply the Python UDF element by element. Expr.map_elements is the current
# name of the deprecated Expr.apply; return_dtype is given explicitly so Polars
# does not have to infer the output type from the first values.
# The UDF is kept to illustrate the feature: the native expression
# pl.col("income") + 5000 yields the same values without per-row Python calls.
udf_applied_df = conditional_df.with_columns(
    pl.col("income")
    .map_elements(salary_increase, return_dtype=pl.Int64)
    .alias("increased_income")
)

# Group-by aggregation (not a window function): one output row per city
# holding the mean of each income column.
# DataFrame.group_by is the current name of the deprecated DataFrame.groupby.
grouped_df = udf_applied_df.group_by("city")
ranked_df = grouped_df.agg(
    pl.col("income").mean().alias("average_income"),
    pl.col("increased_income").mean().alias("average_increased_income"),
)
# group_by does not promise any row order by default, so sort explicitly.
sorted_df = ranked_df.sort(by=["average_income"], descending=True)

# Show the resulting DataFrame
print(sorted_df)

shape: (3, 3)
┌───────────────┬────────────────┬──────────────────────────┐
│ city          ┆ average_income ┆ average_increased_income │
│ ---           ┆ ---            ┆ ---                      │
│ str           ┆ f64            ┆ f64                      │
╞═══════════════╪════════════════╪══════════════════════════╡
│ Chicago       ┆ 65000.0        ┆ 70000.0                  │
│ Los Angeles   ┆ 60000.0        ┆ 65000.0                  │
│ San Francisco ┆ 55000.0        ┆ 60000.0                  │
└───────────────┴────────────────┴──────────────────────────┘


In [7]:
# # PySpark counterpart of the Polars pipeline in the cell above: same input rows and the
# # same select -> filter -> rename -> join -> flag -> UDF -> aggregate -> sort steps.
# # The whole cell is commented out; running it needs the `pyspark` package.
# from pyspark.sql import SparkSession, Row
# from pyspark.sql.functions import avg, col, when, row_number
# from pyspark.sql.window import Window

# # Create a Spark session
# spark = SparkSession.builder \
#     .appName("PySpark Example") \
#     .getOrCreate()

# # Create PySpark DataFrames
# data1 = [Row(id=1, age=25, salary=50000),
#          Row(id=2, age=30, salary=55000),
#          Row(id=3, age=35, salary=60000),
#          Row(id=4, age=40, salary=65000)]
# data2 = [Row(id=1, city="New York"),
#          Row(id=2, city="San Francisco"),
#          Row(id=3, city="Los Angeles"),
#          Row(id=4, city="Chicago")]

# df1_pyspark = spark.createDataFrame(data1)
# df2_pyspark = spark.createDataFrame(data2)

# # Perform operations
# selected_df = df1_pyspark.select("id", "salary")
# filtered_df = selected_df.filter(col("salary") > 50000)
# renamed_df = filtered_df.withColumnRenamed("salary", "income")
# joined_df = renamed_df.join(df2_pyspark, on="id", how="inner")
# conditional_df = joined_df.withColumn("high_income", when(col("income") > 60000, 1).otherwise(0))

# # Plain Python function that is wrapped as a Spark UDF below.
# def salary_increase(salary: int) -> int:
#   return salary + 5000

# from pyspark.sql.functions import udf
# from pyspark.sql.types import IntegerType
# salary_increase_udf = udf(salary_increase, IntegerType())
# udf_applied_df = conditional_df.withColumn("increased_income", salary_increase_udf(col("income")))

# # NOTE(review): this window has no partitionBy, and the "rank" column it adds is not
# # read by the aggregation below — confirm whether this step is needed.
# window_spec = Window.orderBy("id")
# ranked_df = udf_applied_df.withColumn("rank", row_number().over(window_spec))

# #GroupBy and aggregation
# result_df = (ranked_df.groupBy("city")
# .agg(avg("income").alias("average_income"),
# avg("increased_income").alias("average_increased_income"))
# .sort("average_income", ascending=False))

# #Show the resulting DataFrame
# result_df.show()

# #Stop the Spark session
# spark.stop()

In [8]:
# # Benchmark: time one run of a join + flag + rank + group-by pipeline in PySpark and
# # in Polars over the same randomly generated rows.
# # The whole cell is commented out; running it needs both `pyspark` and `polars`.
# import timeit
# import random
# import string

# # Random Data - I am using One Million Rows for this experiment.
# # NOTE(review): no random seed is set in this cell, so the generated rows differ between runs.
# num_rows = 1000000
# ages = [random.randint(18, 65) for _ in range(num_rows)]
# salaries = [random.randint(30000, 200000) for _ in range(num_rows)]
# cities = [random.choice(["New York", "San Francisco", "Los Angeles", "Chicago"]) for _ in range(num_rows)]

# # Row dicts keyed by a 1-based "id"; data1 and data2 share the same ids.
# data1 = [{"id": i, "age": age, "salary": salary} for i, (age, salary) in enumerate(zip(ages, salaries), start=1)]
# data2 = [{"id": i, "city": city} for i, city in enumerate(cities, start=1)]
    
# # NOTE(review): the timed body includes creating the SparkSession, building both
# # DataFrames from Python rows, and printing the result — not only the query itself.
# def pyspark_benchmark():
#     from pyspark.sql import Row, SparkSession
#     from pyspark.sql.functions import avg, col, when, row_number
#     from pyspark.sql.window import Window

#     spark = SparkSession.builder.appName("PySpark Benchmark").getOrCreate()
#     df1_pyspark = spark.createDataFrame([Row(**row) for row in data1])
#     df2_pyspark = spark.createDataFrame([Row(**row) for row in data2])

#     joined_df = df1_pyspark.join(df2_pyspark, on="id", how="inner")
#     conditional_df = joined_df.withColumn("high_income", when(col("salary") > 100000, 1).otherwise(0))
#     # NOTE(review): the window has no partitionBy, and "rank" is not read by the aggregation below.
#     window_spec = Window.orderBy("id")
#     ranked_df = conditional_df.withColumn("rank", row_number().over(window_spec))
#     result_df = (ranked_df.groupBy("city")
#     .agg(avg("salary").alias("average_salary"))
#     .orderBy("average_salary", ascending=False)
#     .limit(10))
    
#     result_df.show()

# # NOTE(review): the timed body includes building both DataFrames and printing the result.
# def polars_benchmark():
#     import polars as pl

#     df1_polars = pl.DataFrame(data1)
#     df2_polars = pl.DataFrame(data2)

#     joined_df = df1_polars.join(df2_polars, on="id", how="inner")
#     conditional_df = joined_df.with_columns(pl.when(joined_df["salary"] > 100000).then(1).otherwise(0).alias("high_income"))
#     # NOTE(review): .over("id") looks like it ranks within each id group, whereas the PySpark
#     # version numbers rows ordered by id across the whole frame — confirm the two steps are meant to match.
#     ranked_df = conditional_df.with_columns(pl.col("id").rank().over("id").alias("rank"))
#     result_df = (ranked_df.groupby("city")
#                  .agg(pl.col("salary").mean().alias("average_salary"))
#                  .sort("average_salary", descending=True)
#                  .head(10))

#     print(result_df)


# # Each benchmark is executed exactly once (number=1); PySpark is timed first.
# pyspark_time = timeit.timeit("pyspark_benchmark()", globals=globals(), number=1)
# polars_time = timeit.timeit("polars_benchmark()", globals=globals(), number=1)

# print(f"PySpark execution time: {pyspark_time:.2f} seconds")
# print(f"Polars execution time: {polars_time:.2f} seconds")