In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql.functions import  *
from datetime import date
from pyspark.sql.types import * 
from pyspark.sql import Window
from datetime import datetime

In [2]:
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exploring Joins")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
data = spark.read.csv('pets.csv',header=True)

### Looking at your Data

In [4]:
# First method is collect method
data.collect()

[Row(id='1', breed_id='1', nickname='King', birthday='2014-11-22 12:30:31', age='5', color='brown', weight='10.0'),
 Row(id='2', breed_id='3', nickname='Argus', birthday='2016-11-22 10:05:10', age='10', color=None, weight='5.5'),
 Row(id='3', breed_id='1', nickname='Chewie', birthday='2016-11-22 10:05:10', age='15', color=None, weight='12'),
 Row(id='3', breed_id='2', nickname='Maple', birthday='2018-11-22 10:05:10', age='17', color='white', weight='3.4'),
 Row(id='4', breed_id='2', nickname=None, birthday='2019-01-01 10:05:10', age='13', color=None, weight='10')]

In [5]:
# Second method is toPandas method
data.toPandas()

Unnamed: 0,id,breed_id,nickname,birthday,age,color,weight
0,1,1,King,2014-11-22 12:30:31,5,brown,10.0
1,2,3,Argus,2016-11-22 10:05:10,10,,5.5
2,3,1,Chewie,2016-11-22 10:05:10,15,,12.0
3,3,2,Maple,2018-11-22 10:05:10,17,white,3.4
4,4,2,,2019-01-01 10:05:10,13,,10.0


In [6]:
# Third method is head method
data.head(n=3)

[Row(id='1', breed_id='1', nickname='King', birthday='2014-11-22 12:30:31', age='5', color='brown', weight='10.0'),
 Row(id='2', breed_id='3', nickname='Argus', birthday='2016-11-22 10:05:10', age='10', color=None, weight='5.5'),
 Row(id='3', breed_id='1', nickname='Chewie', birthday='2016-11-22 10:05:10', age='15', color=None, weight='12')]

In [7]:
data.show(n=3)

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|
+---+--------+--------+-------------------+---+-----+------+
only showing top 3 rows



### Selecting a Subset of Columns

In [8]:
data.select("id","breed_id","nickname","age","weight").toPandas()

Unnamed: 0,id,breed_id,nickname,age,weight
0,1,1,King,5,10.0
1,2,3,Argus,10,5.5
2,3,1,Chewie,15,12.0
3,3,2,Maple,17,3.4
4,4,2,,13,10.0


In [9]:
data.drop("birthday","color").toPandas()

Unnamed: 0,id,breed_id,nickname,age,weight
0,1,1,King,5,10.0
1,2,3,Argus,10,5.5
2,3,1,Chewie,15,12.0
3,3,2,Maple,17,3.4
4,4,2,,13,10.0


In [10]:
data.withColumn("n-nickname_copy",col("nickname")).\
    withColumn('nickname_capatilize',upper(col('nickname'))).toPandas()

Unnamed: 0,id,breed_id,nickname,birthday,age,color,weight,n-nickname_copy,nickname_capatilize
0,1,1,King,2014-11-22 12:30:31,5,brown,10.0,King,KING
1,2,3,Argus,2016-11-22 10:05:10,10,,5.5,Argus,ARGUS
2,3,1,Chewie,2016-11-22 10:05:10,15,,12.0,Chewie,CHEWIE
3,3,2,Maple,2018-11-22 10:05:10,17,white,3.4,Maple,MAPLE
4,4,2,,2019-01-01 10:05:10,13,,10.0,,


In [11]:
data.withColumnRenamed("id","pet_id").toPandas()

Unnamed: 0,pet_id,breed_id,nickname,birthday,age,color,weight
0,1,1,King,2014-11-22 12:30:31,5,brown,10.0
1,2,3,Argus,2016-11-22 10:05:10,10,,5.5
2,3,1,Chewie,2016-11-22 10:05:10,15,,12.0
3,3,2,Maple,2018-11-22 10:05:10,17,white,3.4
4,4,2,,2019-01-01 10:05:10,13,,10.0


### Constant Value and Column Expression

In [12]:
# what if we want to insert constant value to the new column

data.withColumn("today-date",date.today()).toPandas()

PySparkTypeError: [NOT_COLUMN] Argument `col` should be a Column, got date.

In [13]:
'''
Spark functions that have a col as an argument will usually require you to pass in a Column expression. As seen in the previous section, withColumn() worked fine when we gave it a column from the current df. But this isn't the case when we want set a column to a constant value.
'''
data.withColumn("today_date",lit(date.today())).show()

+---+--------+--------+-------------------+---+-----+------+----------+
| id|breed_id|nickname|           birthday|age|color|weight|today_date|
+---+--------+--------+-------------------+---+-----+------+----------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|2024-06-19|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|2024-06-19|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|2024-06-19|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|2024-06-19|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|2024-06-19|
+---+--------+--------+-------------------+---+-----+------+----------+



In [14]:
data.withColumn('height',lit(10)).show()

+---+--------+--------+-------------------+---+-----+------+------+
| id|breed_id|nickname|           birthday|age|color|weight|height|
+---+--------+--------+-------------------+---+-----+------+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|    10|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|    10|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|    10|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|    10|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|    10|
+---+--------+--------+-------------------+---+-----+------+------+



In [15]:
data.withColumn('double_age',col('age')*2).show()

+---+--------+--------+-------------------+---+-----+------+----------+
| id|breed_id|nickname|           birthday|age|color|weight|double_age|
+---+--------+--------+-------------------+---+-----+------+----------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|      10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|      20.0|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|      30.0|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|      34.0|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|      26.0|
+---+--------+--------+-------------------+---+-----+------+----------+



#### Casting_Column_to_Different_DataTypes

In [16]:
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- breed_id: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- age: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: string (nullable = true)



In [17]:
data.select('birthday').withColumn("birthday_date",col('birthday').cast('date')).withColumn("birthday_date2",col('birthday').cast(DateType())).show()

+-------------------+-------------+--------------+
|           birthday|birthday_date|birthday_date2|
+-------------------+-------------+--------------+
|2014-11-22 12:30:31|   2014-11-22|    2014-11-22|
|2016-11-22 10:05:10|   2016-11-22|    2016-11-22|
|2016-11-22 10:05:10|   2016-11-22|    2016-11-22|
|2018-11-22 10:05:10|   2018-11-22|    2018-11-22|
|2019-01-01 10:05:10|   2019-01-01|    2019-01-01|
+-------------------+-------------+--------------+



In [18]:
data.select('birthday').withColumn("birthday",col('birthday').cast('date')).printSchema()

root
 |-- birthday: date (nullable = true)



### Filtering Data Where,Filter and is_in.

In [19]:
data.select("age","nickname").where(col("age")>3).show()

+---+--------+
|age|nickname|
+---+--------+
|  5|    King|
| 10|   Argus|
| 15|  Chewie|
| 17|   Maple|
| 13|    NULL|
+---+--------+



In [20]:
data.where(col("breed_id")==2).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [21]:
data.filter(col('breed_id')==2).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [22]:
data.filter(col('breed_id').isin(2,3)).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [23]:
data.where(col('nickname').isin("King","Argus")).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|
+---+--------+--------+-------------------+---+-----+------+



### Equality Statememt in PySpark

In [24]:
data.where((col('breed_id')>=lit(1)) & (col('breed_id')<lit(3))).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [25]:
data.filter((col('breed_id')>=1) & (col('breed_id')<=3)).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [26]:
data.filter(col('breed_id').isin(2,3)).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|
+---+--------+--------+-------------------+---+-----+------+



In [27]:
data.filter((col("breed_id").isin(1,2)) & (col('nickname').isNotNull())|col('color').isin("white")).show()

+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
+---+--------+--------+-------------------+---+-----+------+



In [28]:
data.withColumn("result",col("color")!="white").\
    withColumn("result2",(col('color')!="white")& (col('color').isNotNull())).show()

+---+--------+--------+-------------------+---+-----+------+------+-------+
| id|breed_id|nickname|           birthday|age|color|weight|result|result2|
+---+--------+--------+-------------------+---+-----+------+------+-------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|  true|   true|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|  NULL|  false|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|  NULL|  false|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4| false|  false|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|  NULL|  false|
+---+--------+--------+-------------------+---+-----+------+------+-------+



### Case Statements

In [29]:
data.withColumn('oldness_value',when(col('age')<=5,"young").when((col('age')>=5) & (col('age')<=13),"middle_age").otherwise("old")).show()

+---+--------+--------+-------------------+---+-----+------+-------------+
| id|breed_id|nickname|           birthday|age|color|weight|oldness_value|
+---+--------+--------+-------------------+---+-----+------+-------------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|        young|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|   middle_age|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|          old|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|          old|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|   middle_age|
+---+--------+--------+-------------------+---+-----+------+-------------+



In [30]:
data.withColumn('race',when(col('color')=='brown',"black_dog").when(col('color')=='white','white_dog').otherwise('black_white_dog')).show()

+---+--------+--------+-------------------+---+-----+------+---------------+
| id|breed_id|nickname|           birthday|age|color|weight|           race|
+---+--------+--------+-------------------+---+-----+------+---------------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|      black_dog|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|black_white_dog|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|black_white_dog|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|      white_dog|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|black_white_dog|
+---+--------+--------+-------------------+---+-----+------+---------------+



In [31]:
data.select("nickname","weight").withColumn("weight_value",when(col("weight")<5,"under_weight").when((col('weight')>5) & (col('weight')<=10),"normal_weight").\
                                   otherwise("over_weight")).show()

+--------+------+-------------+
|nickname|weight| weight_value|
+--------+------+-------------+
|    King|  10.0|normal_weight|
|   Argus|   5.5|  over_weight|
|  Chewie|    12|  over_weight|
|   Maple|   3.4| under_weight|
|    NULL|    10|normal_weight|
+--------+------+-------------+



### Fill nan and null values

In [32]:
data.fillna("Ngawang").toPandas()

Unnamed: 0,id,breed_id,nickname,birthday,age,color,weight
0,1,1,King,2014-11-22 12:30:31,5,brown,10.0
1,2,3,Argus,2016-11-22 10:05:10,10,Ngawang,5.5
2,3,1,Chewie,2016-11-22 10:05:10,15,Ngawang,12.0
3,3,2,Maple,2018-11-22 10:05:10,17,white,3.4
4,4,2,Ngawang,2019-01-01 10:05:10,13,Ngawang,10.0


In [33]:
'''
You have the option of filling in each column with a diffferent value. This provides more flexibility as most times the columns will be different types and a single deafult value won't be sufficient enough.
'''
data.fillna({"nickname":"Ngawang","color":"Anurag"}).show()

+---+--------+--------+-------------------+---+------+------+
| id|breed_id|nickname|           birthday|age| color|weight|
+---+--------+--------+-------------------+---+------+------+
|  1|       1|    King|2014-11-22 12:30:31|  5| brown|  10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10|Anurag|   5.5|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15|Anurag|    12|
|  3|       2|   Maple|2018-11-22 10:05:10| 17| white|   3.4|
|  4|       2| Ngawang|2019-01-01 10:05:10| 13|Anurag|    10|
+---+--------+--------+-------------------+---+------+------+



In [34]:
'''
Another way to fill in a column with values is using coalesce(). This function will try to fill in the specified columns by looking at the given arguments in order from left to right, until one of the arguments is not null and use that. If all else fails, you can provide a "default" value as your last arugment (remembering that it should be a columnar expression).
'''
data.withColumn('xyz',coalesce(col('nickname'),col('color'),lit('default'))).show()

+---+--------+--------+-------------------+---+-----+------+-------+
| id|breed_id|nickname|           birthday|age|color|weight|    xyz|
+---+--------+--------+-------------------+---+-----+------+-------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|   King|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|  Argus|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12| Chewie|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|  Maple|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|default|
+---+--------+--------+-------------------+---+-----+------+-------+



### User_Defined_Function

In [35]:
from pyspark.sql.functions import udf

@udf('string')
def uppercase(word):
    return word.upper()[:2] if word else None

data.withColumn('uppercase',uppercase(col('nickname'))).show()



+---+--------+--------+-------------------+---+-----+------+---------+
| id|breed_id|nickname|           birthday|age|color|weight|uppercase|
+---+--------+--------+-------------------+---+-----+------+---------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|       KI|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| NULL|   5.5|       AR|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| NULL|    12|       CH|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|       MA|
|  4|       2|    NULL|2019-01-01 10:05:10| 13| NULL|    10|     NULL|
+---+--------+--------+-------------------+---+-----+------+---------+



In [36]:
data.toPandas()

Unnamed: 0,id,breed_id,nickname,birthday,age,color,weight
0,1,1,King,2014-11-22 12:30:31,5,brown,10.0
1,2,3,Argus,2016-11-22 10:05:10,10,,5.5
2,3,1,Chewie,2016-11-22 10:05:10,15,,12.0
3,3,2,Maple,2018-11-22 10:05:10,17,white,3.4
4,4,2,,2019-01-01 10:05:10,13,,10.0


### Aggregation Function

In [37]:
data.groupBy("breed_id").agg({"*":"count","age":"avg","weight":"avg"}).show()

+--------+-----------+--------+--------+
|breed_id|avg(weight)|count(1)|avg(age)|
+--------+-----------+--------+--------+
|       3|        5.5|       1|    10.0|
|       1|       11.0|       2|    10.0|
|       2|        6.7|       2|    15.0|
+--------+-----------+--------+--------+



In [38]:
data.groupBy("breed_id").agg(count("age").alias("count_breed"),sum("age").alias("age_sum"),avg("age").alias("age_avg")).show()

+--------+-----------+-------+-------+
|breed_id|count_breed|age_sum|age_avg|
+--------+-----------+-------+-------+
|       3|          1|   10.0|   10.0|
|       1|          2|   20.0|   10.0|
|       2|          2|   30.0|   15.0|
+--------+-----------+-------+-------+



In [39]:
data.groupby("breed_id").agg(count("*").alias("count_breed"),sum("weight").alias("weight_sum"),avg("weight").alias("average_weight")).show()

+--------+-----------+----------+--------------+
|breed_id|count_breed|weight_sum|average_weight|
+--------+-----------+----------+--------------+
|       3|          1|       5.5|           5.5|
|       1|          2|      22.0|          11.0|
|       2|          2|      13.4|           6.7|
+--------+-----------+----------+--------------+



### Non Deterministic Ordering for GroupBys

In [40]:
data.printSchema()

root
 |-- id: string (nullable = true)
 |-- breed_id: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- age: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: string (nullable = true)



In [41]:
data = data.withColumn("birthday",col("birthday").cast("date"))

In [42]:
data.show()

+---+--------+--------+----------+---+-----+------+
| id|breed_id|nickname|  birthday|age|color|weight|
+---+--------+--------+----------+---+-----+------+
|  1|       1|    King|2014-11-22|  5|brown|  10.0|
|  2|       3|   Argus|2016-11-22| 10| NULL|   5.5|
|  3|       1|  Chewie|2016-11-22| 15| NULL|    12|
|  3|       2|   Maple|2018-11-22| 17|white|   3.4|
|  4|       2|    NULL|2019-01-01| 13| NULL|    10|
+---+--------+--------+----------+---+-----+------+



In [43]:
data.orderBy("birthday").groupBy('breed_id').agg(first('nickname').alias('first_breed')).toPandas()

Unnamed: 0,breed_id,first_breed
0,1,King
1,2,Maple
2,3,Argus


In [44]:
data.orderBy("birthday").groupBy('breed_id').agg(first('nickname').alias('first_breed')).toPandas()

Unnamed: 0,breed_id,first_breed
0,1,King
1,2,Maple
2,3,Argus


In [45]:
window = Window.partitionBy("breed_id").orderBy("birthday")

In [46]:
data.withColumn('row_number',row_number().over(window)).show()

+---+--------+--------+----------+---+-----+------+----------+
| id|breed_id|nickname|  birthday|age|color|weight|row_number|
+---+--------+--------+----------+---+-----+------+----------+
|  1|       1|    King|2014-11-22|  5|brown|  10.0|         1|
|  3|       1|  Chewie|2016-11-22| 15| NULL|    12|         2|
|  3|       2|   Maple|2018-11-22| 17|white|   3.4|         1|
|  4|       2|    NULL|2019-01-01| 13| NULL|    10|         2|
|  2|       3|   Argus|2016-11-22| 10| NULL|   5.5|         1|
+---+--------+--------+----------+---+-----+------+----------+



In [47]:

pets = spark.createDataFrame(
    [
        (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 'Bear', 5),
        (2, 1, datetime(2010, 1, 1, 1 ,1, 1), 'Chewie', 15),
        (3, 1, datetime(2015, 1, 1, 1 ,1, 1), 'Roger', 10),
        (4, 2, datetime(2015, 1, 2, 2 ,3, 4), 'Roger', 10),
        (5, 2, datetime(2015, 5, 6, 7 ,8, 9), 'Roger', 10),
    ], ['id', 'breed_id', 'birthday', 'nickname', 'age']
)

pets.toPandas()

Unnamed: 0,id,breed_id,birthday,nickname,age
0,1,1,2018-01-01 01:01:01,Bear,5
1,2,1,2010-01-01 01:01:01,Chewie,15
2,3,1,2015-01-01 01:01:01,Roger,10
3,4,2,2015-01-02 02:03:04,Roger,10
4,5,2,2015-05-06 07:08:09,Roger,10


In [48]:
windows = Window.partitionBy("breed_id").orderBy("age")

In [49]:
pets.withColumn('row_number',row_number().over(windows)).show()

+---+--------+-------------------+--------+---+----------+
| id|breed_id|           birthday|nickname|age|row_number|
+---+--------+-------------------+--------+---+----------+
|  1|       1|2018-01-01 01:01:01|    Bear|  5|         1|
|  3|       1|2015-01-01 01:01:01|   Roger| 10|         2|
|  2|       1|2010-01-01 01:01:01|  Chewie| 15|         3|
|  4|       2|2015-01-02 02:03:04|   Roger| 10|         1|
|  5|       2|2015-05-06 07:08:09|   Roger| 10|         2|
+---+--------+-------------------+--------+---+----------+



In [50]:
windows2 = Window.partitionBy("breed_id").orderBy("id")

In [53]:
pets.withColumn('sum_age',sum("age").over(windows2)).show()

+---+--------+-------------------+--------+---+-------+
| id|breed_id|           birthday|nickname|age|sum_age|
+---+--------+-------------------+--------+---+-------+
|  1|       1|2018-01-01 01:01:01|    Bear|  5|      5|
|  2|       1|2010-01-01 01:01:01|  Chewie| 15|     20|
|  3|       1|2015-01-01 01:01:01|   Roger| 10|     30|
|  4|       2|2015-01-02 02:03:04|   Roger| 10|     10|
|  5|       2|2015-05-06 07:08:09|   Roger| 10|     20|
+---+--------+-------------------+--------+---+-------+



In [54]:
windows3 = Window.partitionBy("breed_id")
pets.withColumn('sum_all',sum("age").over(windows3)).show()

+---+--------+-------------------+--------+---+-------+
| id|breed_id|           birthday|nickname|age|sum_all|
+---+--------+-------------------+--------+---+-------+
|  1|       1|2018-01-01 01:01:01|    Bear|  5|     30|
|  2|       1|2010-01-01 01:01:01|  Chewie| 15|     30|
|  3|       1|2015-01-01 01:01:01|   Roger| 10|     30|
|  4|       2|2015-01-02 02:03:04|   Roger| 10|     20|
|  5|       2|2015-05-06 07:08:09|   Roger| 10|     20|
+---+--------+-------------------+--------+---+-------+

