# **Filter a PySpark DataFrame to keep only the rows whose position falls within a specified range (here, the 50%–75% slice)**

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=65baaa56034334864ae38d152e9431d54501e9a455601ec301351401ae3871ed
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number,col
from pyspark.sql.window import Window

# Obtain the Spark session for this notebook (reuses one if it already exists).
spark = SparkSession.builder.appName("qa").getOrCreate()

# Column names for the sample rows below: one (name, age) pair per row.
columns = ["name", "age"]

data = [
    ("A", 28), ("B", 35),
    ("C", 42), ("D", 25),
    ("E", 31), ("F", 38),
    ("G", 45), ("H", 29),
]

# Build the demo DataFrame from the in-memory rows.
df = spark.createDataFrame(data, schema=columns)

# Display the full (8-row) sample so later slices can be checked by eye.
df.show()

+----+---+
|name|age|
+----+---+
|   A| 28|
|   B| 35|
|   C| 42|
|   D| 25|
|   E| 31|
|   F| 38|
|   G| 45|
|   H| 29|
+----+---+



In [6]:
# Global window (no partitionBy) ordered by name, so row_number() yields a
# single 1..N sequence over the whole DataFrame.
# NOTE(review): an unpartitioned window presumably forces all rows through one
# partition; fine for this small demo, confirm before using on large data.
window_spec = Window.orderBy(col("name"))

# Keep every original column and append the 1-based position as "row_index".
df_with_index = df.select(
    "*",
    row_number().over(window_spec).alias("row_index"),
)

df_with_index.show()

+----+---+---------+
|name|age|row_index|
+----+---+---------+
|   A| 28|        1|
|   B| 35|        2|
|   C| 42|        3|
|   D| 25|        4|
|   E| 31|        5|
|   F| 38|        6|
|   G| 45|        7|
|   H| 29|        8|
+----+---+---------+



In [7]:
# Number of rows in the source DataFrame; the slice bounds are fractions of it.
total_rows = df.count()

# Truncate the fractional positions to whole row indices.
start_index = int(total_rows * 0.5)   # 50% mark, excluded from the slice
end_index = int(total_rows * 0.75)    # 75% mark, included in the slice

# row_index is an integer starting at 1, so "start_index < row_index <= end_index"
# is the same as an inclusive between on [start_index + 1, end_index].
# The helper column is dropped once the rows have been selected.
filtered_df = (
    df_with_index
    .where(col("row_index").between(start_index + 1, end_index))
    .drop("row_index")
)

# Display the rows that fall inside the slice.
filtered_df.show()

+----+---+
|name|age|
+----+---+
|   E| 31|
|   F| 38|
+----+---+

