In [1]:
import pandas as pd
import numpy as np

from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession, functions as F, Row, Column
from pyspark.sql.functions import datediff, unix_timestamp, from_unixtime, rank, first, window, col, avg, count

from pyspark.sql.types import DateType, StringType, StructField, IntegerType, FloatType, StructType
from datetime import date
from pyspark.sql.window import Window

from pyspark.sql.functions import udf

# Reuse the SparkContext already running in this kernel, if there is one.
# A bare SparkContext() raises when a context is already active, which made
# this cell fail whenever it was executed a second time.
sc = SparkContext.getOrCreate()

# SQLContext bound to that context; later cells use it for createDataFrame()
# and sql().
sqlContext = SQLContext(sc)

In [2]:
def createtestdate():
    """Build the small demo DataFrame used by the window-function examples.

    The rows are hard-coded (date string, var1, var2) tuples. The ``date``
    column is cast from string to DateType, and an integer column
    ``dayssinceJan11900`` (days between 1900-01-01 and ``date``) is added so
    that range-based window frames have a numeric column to order by.

    Relies on the module-level ``sqlContext``.

    Side effect: prints the first 10 rows of the result with ``show(10)``.

    Returns:
        The DataFrame with columns ``date``, ``var1``, ``var2`` and
        ``dayssinceJan11900``.
    """
    fields = [
        StructField("date", StringType(), True),
        StructField("var1", IntegerType(), True),
        StructField("var2", StringType(), True),
    ]

    rows = [
        ('2017-01-30', 123, 'A'),
        ('2017-01-17', 123, 'B'),
        ('2017-01-15', 123, 'A'),
        ('2017-01-15', 123, 'A'),
        ('2017-01-14', 123, 'A'),
        ('2017-01-11', 123, 'B'),
        ('2017-01-29', 456, 'A'),
        ('2017-01-22', 789, 'B'),
        ('2017-01-21', 789, 'B'),
        ('2017-01-20', 789, 'A'),
        ('2017-01-19', 789, 'A'),
    ]

    raw = sqlContext.createDataFrame(rows, schema=StructType(fields))

    # Dates arrive as strings; cast them so date functions apply.
    dated = raw.withColumn('date', raw.date.cast(DateType()))

    # Numeric day offset from 1900-01-01, used as the ordering column for
    # range-based window frames in later cells.
    reference_day = F.lit(date(1900, 1, 1))
    result = dated.withColumn('dayssinceJan11900', datediff(dated.date, reference_day))

    result.show(10)
    return result


df = createtestdate()

+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows



In [3]:
# Expose the demo DataFrame to Spark SQL under the name "tbl_1".
# createOrReplaceTempView is the replacement for registerTempTable, which has
# been deprecated since Spark 2.0; it overwrites any existing view of the same
# name, so the cell can be re-run.
df.createOrReplaceTempView("tbl_1")
query= 'select * from tbl_1'

# Run the query through the SQL interface and print the result.
sqlContext.sql(query).show()

+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
|2017-01-19| 789|   A|            42752|
+----------+----+----+-----------------+



In [69]:
# For every row, count the same user's (var1) events on the three days before
# that row's day. The frame is a RANGE over the integer day number, so the
# offsets -3..-1 cover "1 to 3 days earlier" and exclude the current day.
wSpec1 = (
    Window
    .partitionBy('var1')
    .orderBy('dayssinceJan11900')
    .rangeBetween(-3, -1)
)

events_in_window = F.count(df.var2).over(wSpec1)
df = df.withColumn("events_past_3days", events_in_window)
df.show()




+----------+----+----+-----------------+-----------------+
|      date|var1|var2|dayssinceJan11900|events_past_3days|
+----------+----+----+-----------------+-----------------+
|2017-01-29| 456|   A|            42762|                0|
|2017-01-19| 789|   A|            42752|                0|
|2017-01-20| 789|   A|            42753|                1|
|2017-01-21| 789|   B|            42754|                2|
|2017-01-22| 789|   B|            42755|                3|
|2017-01-11| 123|   B|            42744|                0|
|2017-01-14| 123|   A|            42747|                1|
|2017-01-15| 123|   A|            42748|                1|
|2017-01-15| 123|   A|            42748|                1|
|2017-01-17| 123|   B|            42750|                3|
|2017-01-30| 123|   A|            42763|                0|
+----------+----+----+-----------------+-----------------+



In [70]:
df = createtestdate()

# Count, per user, only the type-'A' events in the previous three days:
# turn the var2 == 'A' test into 1/0 and sum it over the window.
# wSpec1 is not defined in this cell; it is whatever window spec is currently
# in scope.
is_type_a = (df.var2 == 'A').cast(IntegerType())
df = df.withColumn("A_events_past_3days", F.sum(is_type_a).over(wSpec1))
df.show()

# The sum is null when no earlier rows fall inside the range, so replace those
# nulls with 0.
df = df.fillna(0, 'A_events_past_3days')
df.show()

+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+-------------------+
|      date|var1|var2|dayssinceJan11900|A_events_past_3days|
+----------+----+----+-----------------+-------------------+
|2017-01-29| 456|   A|            42762|               null|
|2017-01-19| 789|   A|            42752|               null|
|2017-01-20| 789|   A|            42753|                  1|
|2017-01-21| 789|   B|            

In [71]:
df=createtestdate()

# Days since the previous event for each user / date.
#
# The frame is a RANGE over the integer day number that ends at -1, i.e. all of
# the same user's rows on a strictly earlier day. A ROWS frame is not safe here:
# rows that share a date (the two 2017-01-15 rows of user 123) are ordered
# arbitrarily, so one of them would pick up its same-day twin as the "last
# event" while the other would not, and the result could change between runs.
wSpec1=Window.partitionBy('var1').orderBy('dayssinceJan11900').rangeBetween(Window.unboundedPreceding,-1)

# Most recent earlier event date; null when the user has no earlier day.
df=(df.withColumn("dateofLastEvent", F.max(df.date).over(wSpec1)))
df.show()

# Turn the two dates into a gap in days (months_between etc. could be used the
# same way).
df=(df.withColumn("daysSinceLastEvent", F.datediff(df.date,df.dateofLastEvent)))
df.show()


+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+---------------+
|      date|var1|var2|dayssinceJan11900|dateofLastEvent|
+----------+----+----+-----------------+---------------+
|2017-01-29| 456|   A|            42762|           null|
|2017-01-19| 789|   A|            42752|           null|
|2017-01-20| 789|   A|            42753|     2017-01-19|
|2017-01-21| 789|   B|            42754|     2017-01-20|
|

In [81]:
df = createtestdate()

# Distinct var2 values each user had in the previous three days.
# countDistinct did not work over a window, so the values are collected
# instead: collect_set keeps the distinct values in the range, collect_list
# keeps every value. F.size can be applied to the result of .over() to turn
# either one into a count.
wSpec1 = (
    Window
    .partitionBy('var1')
    .orderBy('dayssinceJan11900')
    .rangeBetween(-3, -1)
)

# Keep the set so the distinct values can be inspected.
df = df.withColumn("events_past_3days_set", F.collect_set(df.var2).over(wSpec1))

# Keep the list so every value, repeats included, can be inspected.
df = df.withColumn("events_past_3days_list", F.collect_list(df.var2).over(wSpec1))
df.show()

print(df.dtypes)




+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+---------------------+----------------------+
|      date|var1|var2|dayssinceJan11900|events_past_3days_set|events_past_3days_list|
+----------+----+----+-----------------+---------------------+----------------------+
|2017-01-29| 456|   A|            42762|                   []|                    []|
|2017-01-19| 789|   A|            42752|                

In [79]:
df = createtestdate()

# Number of distinct var2 values in the window: size of the collected set.
# wSpec1 is not defined in this cell; it is whatever window spec is currently
# in scope.
distinct_count = F.size(F.collect_set(df.var2).over(wSpec1))
df = df.withColumn("distinctevents_past_3days", distinct_count)

# Number of var2 values with repeats counted: size of the collected list.
total_count = F.size(F.collect_list(df.var2).over(wSpec1))
df = df.withColumn("nondistinctevents_past_3days", total_count)
df.show()

+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+-------------------------+----------------------------+
|      date|var1|var2|dayssinceJan11900|distinctevents_past_3days|nondistinctevents_past_3days|
+----------+----+----+-----------------+-------------------------+----------------------------+
|2017-01-29| 456|   A|            42762|                        0|                           0|
|2017-01-19| 789

In [92]:
# Can individual elements of the collected lists or sets be accessed?
df = createtestdate()

wSpec1 = (
    Window
    .partitionBy('var1')
    .orderBy('dayssinceJan11900')
    .rangeBetween(-3, -1)
)

# Keep the list so its contents can be inspected.
past_values = F.collect_list(df.var2).over(wSpec1)
df = df.withColumn("events_past_3days_list", past_values)

df.show()

# Pull out the first item of each list by index.
first_item = df.events_past_3days_list[0]
df.select(first_item).show()

# Presumably the array column could also be handed to a UDF for arbitrary
# Python processing (not tried here).

+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+----------------------+
|      date|var1|var2|dayssinceJan11900|events_past_3days_list|
+----------+----+----+-----------------+----------------------+
|2017-01-29| 456|   A|            42762|                    []|
|2017-01-19| 789|   A|            42752|                    []|
|2017-01-20| 789|   A|            42753|                   [A]|
|2017-01-21| 789

In [3]:
# Simple rank of each row within its user (var1), ordered by date.
# Rows that share a date receive the same rank. dense_rank also keeps ties
# together (it only removes the gap after them); row_number is the function
# that gives tied rows distinct numbers.
df = createtestdate()

wSpec1 = (
    Window
    .partitionBy('var1')
    .orderBy('date')
)

date_rank = rank().over(wSpec1)
df = df.withColumn("rank_var1_by_date", date_rank)

df.show()


+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+-----------------+
|      date|var1|var2|dayssinceJan11900|rank_var1_by_date|
+----------+----+----+-----------------+-----------------+
|2017-01-29| 456|   A|            42762|                1|
|2017-01-19| 789|   A|            42752|                1|
|2017-01-20| 789|   A|            42753|                2|
|2017-01-21| 789|   B|            42754|      

In [7]:
# First value in a group: the var2 value that comes first within each user
# (var1) when that user's rows are ordered by date.
df = createtestdate()

wSpec1 = (
    Window
    .partitionBy('var1')
    .orderBy('date')
)

first_value = first('var2').over(wSpec1)
df = df.withColumn("first_var2_by_date", first_value)

df.show()


+----------+----+----+-----------------+
|      date|var1|var2|dayssinceJan11900|
+----------+----+----+-----------------+
|2017-01-30| 123|   A|            42763|
|2017-01-17| 123|   B|            42750|
|2017-01-15| 123|   A|            42748|
|2017-01-15| 123|   A|            42748|
|2017-01-14| 123|   A|            42747|
|2017-01-11| 123|   B|            42744|
|2017-01-29| 456|   A|            42762|
|2017-01-22| 789|   B|            42755|
|2017-01-21| 789|   B|            42754|
|2017-01-20| 789|   A|            42753|
+----------+----+----+-----------------+
only showing top 10 rows

+----------+----+----+-----------------+------------------+
|      date|var1|var2|dayssinceJan11900|first_var2_by_date|
+----------+----+----+-----------------+------------------+
|2017-01-29| 456|   A|            42762|                 A|
|2017-01-19| 789|   A|            42752|                 A|
|2017-01-20| 789|   A|            42753|                 A|
|2017-01-21| 789|   B|            42754|

In [6]:
# Use window() inside a groupBy to count rows per 7-day bucket.
# Buckets are half-open: the window start is inclusive and the end is
# exclusive, i.e. [start, end). The dates are treated as midnight timestamps,
# so a date falls in the bucket whose end is later that same day.
# NOTE(review): the 19:00 boundaries in the recorded output presumably come
# from the session time zone's offset from UTC; confirm.
# df here is whatever the previously executed cell left in scope.
seven_day_bucket = window(col("date"), "7 day")
rows_per_bucket = df.groupBy(seven_day_bucket).agg(count("dayssinceJan11900"))
rows_per_bucket.toPandas()

Unnamed: 0,window,count(dayssinceJan11900)
0,"(2017-01-11 19:00:00, 2017-01-18 19:00:00)",4
1,"(2017-01-04 19:00:00, 2017-01-11 19:00:00)",1
2,"(2017-01-18 19:00:00, 2017-01-25 19:00:00)",4
3,"(2017-01-25 19:00:00, 2017-02-01 19:00:00)",2
