In [0]:
from pyspark.sql import functions as F

# Each row is (item name, number of times the item should appear in the output).
data = [("A", 2), ("B", 3), ("C", 1)]
df = spark.createDataFrame(data, ["Item_Name", "Count"])

# Build the array [1..Count] for every row and explode it so each item is
# emitted once per array element. The exploded value lands in a throwaway
# column: writing it over 'Item_Name' would replace the names with 1, 2, 3...
result = (
    df
    .withColumn('seq', F.sequence(F.lit(1), F.col('Count')))
    .withColumn('dummy', F.explode(F.col('seq')))
    .select('Item_Name')
)

result.show()

In [0]:
from pyspark.sql import functions as F

# Schema inference needs row-like records: with a list of column names, bare
# ints cannot be mapped to columns, so each value is wrapped in a 1-tuple
# (one row, one column).
data = [(1,), (2,), (3,)]
df = spark.createDataFrame(data, ["Item_No"])
display(df)

In [0]:
# Make the current `df` queryable from SQL cells under the name `numbers`.
df.createOrReplaceTempView("numbers")

In [0]:
%sql
-- Pair every Item_No with a constant rank of 1.
SELECT
  Item_No,
  1 AS rank
FROM numbers

In [0]:
from pyspark.sql.functions import to_date

# One (date string, status) pair per day.
data = [
    ('2026-01-01', 'Success'),
    ('2026-01-02', 'Success'),
    ('2026-01-03', 'Success'),
    ('2026-01-04', 'Fail'),
    ('2026-01-05', 'Fail'),
    ('2026-01-06', 'Fail'),
    ('2026-01-07', 'Success'),
    ('2026-01-08', 'Success'),
]

# Load the dates as strings first, then parse them into a date column.
raw_df = spark.createDataFrame(data, ['event_date', 'event_status'])
df = raw_df.withColumn('event_date', to_date('event_date', 'yyyy-MM-dd'))
display(df)

In [0]:
# Make the current `df` queryable from SQL cells under the name `status`.
df.createOrReplaceTempView("status")

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Gaps-and-islands: collapse consecutive days that share a status into
# (start_date, end_date) ranges.
# NOTE: sum/min/max below are the pyspark.sql.functions versions brought in by
# the star import, not the Python builtins.
w = Window.orderBy(col('event_date'))

# 1 whenever the status differs from the previous row's status. The first row
# has no previous value, so the comparison is null and falls through to 0.
status_changed = when(col('event_status') != lag(col('event_status')).over(w), 1).otherwise(0)

# The running total of the change flags gives every unbroken streak its own id.
r_df = (
    df
    .withColumn('status_flag', status_changed)
    .withColumn('grp', sum(col('status_flag')).over(w))
)

final_df = (
    r_df
    .groupBy(col('event_status'), 'grp')
    .agg(min('event_date').alias('start_date'), max('event_date').alias('end_date'))
    .select('start_date', 'end_date')
    # groupBy gives no ordering guarantee; sort so the streaks come out
    # chronologically, as the SQL form of this query does.
    .orderBy('start_date')
)

final_df.display()

In [0]:
%sql

-- Collapse consecutive days with the same status into date ranges.
WITH status_flag AS (
  -- flag = 1 on rows whose status differs from the previous row's status
  SELECT
    event_date,
    event_status,
    CASE
      WHEN event_status != LAG(event_status) OVER (ORDER BY event_date) THEN 1
      ELSE 0
    END AS flag
  FROM status
),
grouped AS (
  -- running total of the flags: one group id per unbroken streak
  SELECT
    event_date,
    event_status,
    SUM(flag) OVER (ORDER BY event_date) AS grp
  FROM status_flag
)
SELECT
  MIN(event_date) AS start_date,
  MAX(event_date) AS end_date
FROM grouped
GROUP BY event_status, grp
ORDER BY start_date

In [0]:
%sql
-- Emit each Item_No as many times as its own value.
WITH RECURSIVE cte AS (
  -- anchor: every number once, starting at rank 1
  SELECT Item_No, 1 AS rank
  FROM numbers
  UNION ALL
  -- recursive step: add another copy while rank is still below the number
  SELECT Item_No, rank+1
  FROM cte
  WHERE rank<item_no
)
SELECT Item_no
FROM cte
ORDER BY 1

In [0]:
from pyspark.sql import functions as F

data = [("A", 2), ("B", 3), ("C", 1)]
df = spark.createDataFrame(data, ["Item_Name", "Count"])

# One array element per required repetition, e.g. Count=3 -> [1, 2, 3].
with_seq = df.withColumn("seq", F.sequence(F.lit(1), F.col("Count")))

# Exploding yields one row per element; only the row count matters, so the
# exploded value goes into a column that is not selected.
repeated = with_seq.withColumn("dummy", F.explode("seq"))
result = repeated.select(F.col('Item_Name'))

result.show()


In [0]:
| id | event_date |
| -- | ---------- |
| 1  | 2024-01-15 |
| 2  | 2024-03-10 |
| 3  | 2024-03-25 |


In [0]:
from pyspark.sql.functions import *

data = [(1, '2024-01-15'), (2, '2024-03-10'), (3, '2024-03-25')]
df = spark.createDataFrame(data, ["id", "event_date"])

# Parse the date strings, then derive the month number of each event.
df1 = df.withColumn('event_date', to_date('event_date', 'yyyy-MM-dd'))
with_month = df1.withColumn('month', month('event_date'))

# Number of events that fall in each month.
result_df = (
    with_month
    .groupBy('month')
    .agg(count('*').alias('cnt'))
    .select('month', 'cnt')
)

result_df.show()



In [0]:
from pyspark.sql.functions import *

data = [(1, 'raju@gmail.com'), (2, 'angababu'), (3, 'jahd@kdd.com')]
df = spark.createDataFrame(data, ["id", "email"])

# rlike succeeds on a partial match, so the pattern must be anchored at both
# ends. Without the trailing '$' only a prefix was validated and anything
# after the first letter following the dot (e.g. 'a@b.c!!') was accepted.
# Shape: local-part @ one or more dot-separated labels . alphabetic TLD (2+ letters)
EMAIL_PATTERN = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}$'

result_df = df.filter(col('email').rlike(EMAIL_PATTERN))

result_df.show()

In [0]:
# Highest-paid employee per department: number the rows of each dept by
# salary (highest first) and keep row 1.
#
# The cell was tagged %r (R) but holds Python, so the magic is removed.
# The unfinished SQL sketch that was here corresponds to:
#   with cte as (
#     select emp_id, dept, salary,
#            row_number() over (partition by dept order by salary desc) as rn
#     from <employee table>
#   )
#   select emp_id, dept, salary from cte where rn = 1
#
# NOTE(review): emp_df is not defined in any visible cell; it is assumed to
# carry at least the columns emp_id, dept and salary - confirm before running.
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Window.orderBy takes columns only (no `ascending` keyword); descending order
# is expressed on the column itself.
window = Window.partitionBy('dept').orderBy(col('salary').desc())

# The filter must compare the column: the Python expression 'rn' == 1 is just False.
result_df = (
    emp_df
    .withColumn('rn', row_number().over(window))
    .filter(col('rn') == 1)
    .drop('rn')
)

In [0]:
| order_id | customer_id | order_date | amount |
| -------- | ----------- | ---------- | ------ |
| 1        | C1          | 2024-01-01 | 100    |
| 2        | C1          | 2024-01-10 | 200    |
| 3        | C1          | 2024-02-05 | 300    |
| 4        | C2          | 2024-01-03 | 150    |
| 5        | C2          | 2024-01-20 | 250    |

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.window import Window

# order_date is loaded as a string and parsed into a date further down.
schema = StructType([
    StructField('order_id', IntegerType(), True),
    StructField('customer_id', StringType(), True),
    StructField('order_date', StringType(), True),
    StructField('amount', IntegerType(), True)
])

data = [
    (1, 'C1', '2024-01-01', 100),
    (2, 'C1', '2024-01-10', 200),
    (3, 'C1', '2024-02-05', 300),
    (4, 'C2', '2024-01-03', 150),
    (5, 'C2', '2024-01-20', 250)
]
df = spark.createDataFrame(data, schema)

customer_df = df.withColumn('order_date', to_date('order_date', 'yyyy-MM-dd'))

# Key on year AND month. A bare 'MM' key would put January 2024 and January
# 2025 in the same partition and drop the later year's first order.
month_df = customer_df.withColumn('month', date_format(col('order_date'), 'yyyy-MM'))

# Earliest order first within each (customer, month).
window = Window.partitionBy(col('customer_id'), col('month')).orderBy(col('order_date'))

# Keep only the first order of every customer-month; the final select already
# leaves out the helper column, so no separate drop is needed.
result_df = (
    month_df
    .withColumn('rn', row_number().over(window))
    .filter(col('rn') == 1)
    .select('customer_id', 'order_date', 'amount')
)

result_df.show()


In [0]:
# Make month_df queryable from SQL cells under the same name.
month_df.createOrReplaceTempView("month_df")


In [0]:
%sql

-- First order of every customer within each month value.
WITH cte AS (
  SELECT
    customer_id,
    order_date,
    amount,
    ROW_NUMBER() OVER (PARTITION BY customer_id, month ORDER BY order_date) AS rn
  FROM month_df
)
SELECT
  customer_id,
  order_date,
  amount
FROM cte
WHERE rn = 1

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.window import Window
schema = StructType([
    StructField('user_id', StringType(), True),
    StructField('txn_date', StringType(), True),
    StructField('amount', IntegerType(), True)
])

data = [
    ('U1', '2024-01-01', 100),
    ('U1', '2024-01-05', 200),
    ('U1', '2024-01-10', 300),
    ('U2', '2024-01-03', 150),
    ('U2', '2024-01-20', 50)
]

users = spark.createDataFrame(data, schema)

# Running total of each user's spend in transaction-date order.
# (sum here is pyspark.sql.functions.sum from the star import.)
spend_window = Window.partitionBy(col('user_id')).orderBy(col('txn_date'))

# limit(1) on an unordered frame returns an arbitrary qualifying row, so the
# rows are sorted by txn_date first, as the SQL form of this query does.
# NOTE(review): this uses >= 250 while the SQL cell uses > 250; the sample
# data never hits exactly 250, so confirm which boundary is intended.
result_df = (
    users
    .withColumn('cum_sum', sum(col('amount')).over(spend_window))
    .filter(col('cum_sum') >= 250)
    .orderBy('txn_date')
    .select('user_id', 'txn_date', 'amount')
    .limit(1)
)

result_df.show()

In [0]:
# Make the users frame queryable from SQL cells under the name `users`.
users.createOrReplaceTempView("users")

In [0]:
%sql

-- Running spend per user; return the earliest transaction whose running
-- total is above 250.
WITH cte AS (
  SELECT
    *,
    SUM(amount) OVER (PARTITION BY user_id ORDER BY txn_date) AS cum_sum
  FROM users
)
SELECT
  user_id,
  txn_date,
  amount
FROM cte
WHERE cum_sum > 250
ORDER BY txn_date
LIMIT 1

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

schema = StructType([StructField("order_id", IntegerType()), StructField("items", StringType())])
data = [
    (1, "A,B,C"),
    (2, "B,C"),
    (3, "A,B"),
    (4, "C")
]
# Apply the schema declared above; it was previously built and then ignored
# in favour of a bare list of column names.
df_orders = spark.createDataFrame(data, schema)

# Split the comma-separated items string into an array, then explode it so
# every item of every order becomes its own row.
final_df = df_orders.withColumn('item', explode(split(col('items'), ',')))

# Number of orders each item appears in, listed alphabetically by item.
result_df = (
    final_df
    .groupBy('item')
    .agg(count('*').alias('cnt'))
    .select('item', 'cnt')
    .orderBy('item')
)

result_df.show()


In [0]:
# Make df_orders queryable from SQL cells under the name `orders`.
df_orders.createOrReplaceTempView("orders")

In [0]:
%sql
-- Add items_array: the comma-separated items string split into an array.
-- NOTE(review): this stops at the split; the PySpark cell goes on to explode
-- the array and count per item - confirm whether this query is unfinished.
WITH cte AS (
  SELECT
    *,
    SPLIT(items, ',') AS items_array
  FROM orders
)
SELECT *
FROM cte

In [0]:
from pyspark.sql.functions import *

# (product, month, revenue) rows in long format.
data = [
    ('A', 'Jan', 100),
    ('A', 'Feb', 200),
    ('B', 'Jan', 150),
    ('B', 'Feb', 300),
    ('C', 'Jan', 250),
]
columns = ['product', 'month', 'revenue']
df_products = spark.createDataFrame(data, columns)

# Conditional sums: a row's revenue counts toward a month column only when the
# row belongs to that month, otherwise it contributes 0.
jan_revenue = sum(when(col('month')=='Jan', col('revenue')).otherwise(0)).alias('Jan')
feb_revenue = sum(when(col('month')=='Feb', col('revenue')).otherwise(0)).alias('Feb')

result_df = df_products.groupBy('product').agg(jan_revenue, feb_revenue)
result_df.show()

In [0]:
# One conditional-sum column per month name: revenue is added when the row's
# month matches, 0 otherwise.
monthly_totals = [
    sum(when(col('month')==month_name, col('revenue')).otherwise(0)).alias(month_name)
    for month_name in ('Jan', 'Feb')
]
result_df = df_products.groupBy('product').agg(*monthly_totals)
result_df.show()

In [0]:
| product | Jan | Feb |
| ------- | --- | --- |
| A       | 100 | 200 |
| B       | 150 | 300 |
| C       | 250 | 0   |


In [0]:
# Register the products frame for SQL access, then read it back through SQL
# to check that the view resolves.
df_products.createOrReplaceTempView("products")
products_view = spark.sql('select * from products')
display(products_view)

In [0]:
%sql
-- One row per product with a revenue column for each month.
SELECT
  product,
  SUM(CASE WHEN month = 'Jan' THEN revenue ELSE 0 END) AS Jan,
  SUM(CASE WHEN month = 'Feb' THEN revenue ELSE 0 END) AS Feb
FROM products
GROUP BY product

In [0]:
from pyspark.sql.functions import *

# Full names whose two parts are separated by either a hyphen or a space.
data = [
    (1, 'Durga-Prasad'),
    (2, 'Ranga-Babu'),
    (3, 'Prakash Babu'),
    (4, 'Murali Babu'),
]
df = spark.createDataFrame(data, ["id", "Name"])
df.display()

In [0]:
from pyspark.sql.functions import col, regexp_replace, split

# Split once on either separator (space or hyphen) and reuse the array column
# expression for both name parts.
name_parts = split(col('Name'), '[ -]')

result_df = (
    df
    .withColumn("First_Name", name_parts[0])
    .withColumn("Last_Name", name_parts[1])
)
result_df.display()

In [0]:
from pyspark.sql.functions import *

# (student name, subject, marks) rows in long format.
data = [
    ('abc', 'MATH', 98),
    ('abc', 'phy', 96),
    ('abc', 'che', 87),
    ('def', 'che', 98),
    ('def', 'MATH', 76),
]
Sub_df = spark.createDataFrame(data, ["Name", "Sub", "marks"])
Sub_df.display()

# Also expose the frame to SQL cells as `Subjects`.
Sub_df.createOrReplaceTempView("Subjects")

In [0]:
from pyspark.sql.functions import *

# Long -> wide: one row per Name, one column per distinct Sub value, each cell
# holding the summed marks.
marks_by_name = Sub_df.groupBy('Name')
final_df = marks_by_name.pivot('Sub').sum('marks')
final_df.display()

In [0]:
%sql

-- One column per listed subject, holding the summed marks.
SELECT *
FROM Subjects
PIVOT (
  SUM(marks)
  FOR Sub IN ('MATH', 'phy', 'che')
)

In [0]:
import string


def is_pangram(text):
    """Return True when `text` contains every letter a-z at least once.

    The check ignores case and ignores every character that is not an
    ASCII letter (digits, spaces, punctuation, accented letters).
    """
    return set(string.ascii_lowercase) <= set(text.lower())


# Named `text` rather than `str` so the builtin str() stays usable in the
# cells that run after this one.
text = 'peorun hajdytr qazxsw lapdoyrtwnvm lkmjniu 123 azxcvbgdfsteyruik'

# Distinct letters in order of first appearance. Lower-casing first means 'A'
# and 'a' count as the same letter, and restricting to a-z keeps non-English
# letters from inflating the count.
seen = []
for ch in text.lower():
    if ch in string.ascii_lowercase and ch not in seen:
        seen.append(ch)

print(seen)
if is_pangram(text):
    print("String is a pangram")
        

