In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Obtain the notebook's Spark session (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("Sachin_DataFrame")
    .getOrCreate()
)

# Column layout of the innings table; every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("match_no", IntegerType()),
            ("runs_scored", IntegerType()),
            ("status", StringType()),
        )
    ]
)

# One tuple per innings: (match_no, runs_scored, status),
# where status is either "out" or "not-out".
data = [
    (1, 53, "out"),
    (2, 59, "not-out"),
    (3, 113, "out"),
    (4, 29, "out"),
    (5, 0, "out"),
    (6, 39, "out"),
    (7, 73, "out"),
    (8, 149, "out"),
    (9, 93, "out"),
    (10, 25, "out"),
]

# Materialise the rows as a DataFrame with the explicit schema and preview it.
sachin = spark.createDataFrame(data, schema=schema)
sachin.show()


+--------+-----------+-------+
|match_no|runs_scored| status|
+--------+-----------+-------+
|       1|         53|    out|
|       2|         59|not-out|
|       3|        113|    out|
|       4|         29|    out|
|       5|          0|    out|
|       6|         39|    out|
|       7|         73|    out|
|       8|        149|    out|
|       9|         93|    out|
|      10|         25|    out|
+--------+-----------+-------+



In [0]:
# Find the first match in which the cumulative runs reach the milestone and
# report the batting average over all matches (total runs / total dismissals).
# NOTE: `sum` and `round` below are the pyspark.sql.functions versions pulled in
# by the wildcard import at the top of the notebook, not the Python builtins.
RUNS_MILESTONE = 500

# Running frame: from the first match up to and including the current one.
window_spec = Window.orderBy(col("match_no")).rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Full frame: every match, so each row carries the overall totals.
window_spec_avg = Window.orderBy(col("match_no")).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

ans_df = (
    sachin
    .withColumn("running_sum", sum("runs_scored").over(window_spec))
    .withColumn("total_run", sum("runs_scored").over(window_spec_avg))
    # 1 for a dismissal, 0 for a not-out innings.
    .withColumn("out", when(col("status") == "out", lit(1)).otherwise(lit(0)))
    # Dismissals across ALL matches. Counting rows that merely share the
    # current row's status would return the number of not-out innings
    # whenever the milestone innings itself is "not-out".
    .withColumn("total_out", sum("out").over(window_spec_avg))
    # Keep only the earliest match at or beyond the milestone.
    .filter(col("running_sum") >= RUNS_MILESTONE)
    .orderBy(col("match_no"))
    .limit(1)
    .select(
        "match_no",
        # The average is undefined without a dismissal: yield null rather
        # than dividing by zero.
        round(
            when(col("total_out") > 0, col("total_run") / col("total_out")),
            2,
        ).alias("batting_average"),
    )
)
ans_df.display()


match_no,batting_average
8,70.33


match_no,batting_average
8,70.33
