Create the Cassandra _clicklog_ keyspace and _userclicksperday_ table:

```
-- Keyspace for the click-log data: SimpleStrategy with a replication factor
-- of 1, i.e. a single replica of every row.
CREATE KEYSPACE clicklog
WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};

USE clicklog;
-- One partition per day: `date` is the partition key, while `user_id` and
-- `time` are clustering columns. The notebook fills `date` with a
-- 'yyyy-MM-dd' string derived from `time`.
CREATE TABLE userclicksperday(
   date text,
   user_id int,
   time int,
   hotel int,
   PRIMARY KEY ((date), user_id, time)
   );
```

In [1]:
import click_receiver
from click_receiver import ClickReceiver
from pyspark.sql import SQLContext
from pyspark.sql.functions import from_unixtime
import os
import avro
import avro.schema
from datetime import date, timedelta

# Arguments PySpark forwards to spark-submit: the UI port plus the Kafka
# streaming and Cassandra connector packages. The adjacent string literals are
# concatenated by the parser into one single-line value.
# NOTE(review): this only has an effect if it runs before the Spark JVM is
# launched — confirm that importing click_receiver does not already start it.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf spark.ui.port=4040'
    ' --packages'
    ' org.apache.spark:spark-streaming-kafka_2.11:1.6.1'
    ',com.datastax.spark:spark-cassandra-connector_2.11:1.6.0-M2'
    ' pyspark-shell'
)

In [2]:
# Numeric codes compared against the `action` attribute of incoming events
# by the stream handlers in this notebook.
ACTION_SEARCH, ACTION_FILTER, ACTION_CLICK = 1, 2, 3

# Passed to ClickReceiver and reused as the sliding interval of the top-10 window.
duration = 30

# Build the receiver from the string "clicklog" and the `duration` value.
# NOTE(review): presumably "clicklog" names the topic/stream to consume and
# `duration` is the streaming batch interval in seconds — confirm in
# click_receiver.ClickReceiver.
clickreceiver = ClickReceiver("clicklog", duration)

In [3]:
def print_top10(rdd):
    """Print a list with the first 10 elements of an RDD.

    Args:
        rdd: object exposing ``take(n)`` (an RDD when used as a
            ``foreachRDD`` callback).

    Returns:
        None. The list returned by ``rdd.take(10)`` is written to stdout.
    """
    # Call form of print: with a single argument it prints exactly what the
    # Python 2 print statement did, and it is also valid on Python 3.
    print(rdd.take(10))

In [4]:
def get_top10(rdd):
    """Print the ten most-searched destinations over a sliding window.

    Despite its name, ``rdd`` is used as a stream: windowed reductions,
    ``transform`` and ``foreachRDD`` are called on it.

    The window spans ``window_length`` (30) and slides every ``duration``.
    NOTE(review): PySpark streaming durations are expressed in seconds, so
    this is a 30-second window, not the "last 10 minutes" the previous
    docstring claimed — confirm the intended length.
    NOTE(review): a windowed reduce with an inverse function needs
    checkpointing enabled on the streaming context — confirm that
    click_receiver sets it up.

    Args:
        rdd: stream of events exposing ``action`` and ``destination``.

    Returns:
        None. Each sorted batch is handed to ``print_top10``.
    """
    window_length = 30
    sliding_interval = duration

    (rdd.filter(lambda x: x.action == ACTION_SEARCH)
        .map(lambda x: (x.destination, 1))
        # Incremental window: add counts entering the window, subtract the
        # ones leaving it. filterFunc drops destinations whose count fell to
        # zero so they neither pile up in the state nor show up in the output.
        .reduceByKeyAndWindow(lambda x, y: x + y,
                              lambda x, y: x - y,
                              window_length, sliding_interval,
                              filterFunc=lambda kv: kv[1] > 0)
        # Swap to (count, destination) so sortByKey orders by count.
        # Indexing replaces tuple-parameter lambdas, which Python 3 rejects.
        .map(lambda kv: (kv[1], kv[0]))
        .transform(lambda batch: batch.sortByKey(ascending=False))
        .map(lambda kv: {"destination": kv[1], "search_count": kv[0]})
        .foreachRDD(print_top10))

In [5]:
def save_to_cassandra_with_date(rdd):
    """Append one batch of rows to the clicklog.userclicksperday table.

    Empty batches are skipped. Otherwise the RDD is converted to a DataFrame
    and written with four columns: ``date`` (the ``time`` column formatted as
    'yyyy-MM-dd' by ``from_unixtime``), ``user_id``, ``time`` and ``hotel``.

    Args:
        rdd: batch of records exposing ``time``, ``user_id`` and ``hotel``.
    """
    if rdd.isEmpty():
        return

    clicks_df = click_receiver.sqlContext.createDataFrame(rdd)
    date_column = from_unixtime(clicks_df.time, 'yyyy-MM-dd').alias("date")
    rows_to_store = clicks_df.select(date_column,
                                     clicks_df.user_id,
                                     clicks_df.time,
                                     clicks_df.hotel)

    writer = rows_to_store.write.format("org.apache.spark.sql.cassandra").mode('append')
    writer.options(table="userclicksperday", keyspace="clicklog").save()
                
#         print selected_df.show()

In [6]:
def save_stream_to_cassandra(rdd):
    """Store the click events of the stream in Cassandra.

    Only records whose ``action`` equals ACTION_CLICK are kept; every
    resulting batch is passed to ``save_to_cassandra_with_date``.
    """
    def is_click(event):
        return event.action == ACTION_CLICK

    click_events = rdd.filter(is_click)
    click_events.foreachRDD(save_to_cassandra_with_date)
       

In [7]:
# Register both stream handlers with the receiver.
# NOTE(review): presumably setup() calls each function with the incoming
# event stream — confirm in click_receiver.
clickreceiver.setup([get_top10, save_stream_to_cassandra])

In [8]:
# Start the receiver.
# NOTE(review): presumably this starts the streaming context held in
# clickreceiver.ssc — confirm in click_receiver.
clickreceiver.start()

In [None]:
# Stop streaming without stopping the SparkContext (stopSparkContext=False)
# and request a graceful stop (stopGraceFully=True).
# NOTE(review): assumes `ssc` is a pyspark StreamingContext, where a graceful
# stop waits for already-received data to be processed — confirm.
clickreceiver.ssc.stop(stopSparkContext=False,stopGraceFully=True)

Read data from Cassandra
---

In [3]:
# Load the clicklog.userclicksperday table as a DataFrame through the
# Cassandra connector data source.
df_cassandra = click_receiver.sqlContext.read \
                             .format("org.apache.spark.sql.cassandra") \
                             .options(table="userclicksperday", keyspace="clicklog") \
                             .load()

# Due to the Cassandra/Spark bug https://github.com/holdenk/spark-testing-base/issues/101
# it is not possible to filter on the partition key (date) with an equality
# comparison, so the days of interest are selected with an open range instead.
# The range used below is (day before yesterday, tomorrow), i.e. it matches
# yesterday AND today.
# NOTE(review): the previous comment described an upper bound of "today"
# (yesterday only) — confirm which range is intended.
today = date.today()  # read the clock once so all bounds agree around midnight
before_yesterday_str = str(today - timedelta(2))
today_str = str(today)  # not referenced by the filter below
tomorrow_str = str(today + timedelta(1))


def _second_hotel(hotels):
    """Return the second hotel of a user's group, or None if it has fewer than two."""
    hotels = list(hotels)  # materialize the grouped iterable only once
    return hotels[1] if len(hotels) > 1 else None


# Distinct (user_id, hotel) pairs in the date range, grouped per user.
# NOTE(review): no ordering is applied before groupByKey, so which hotel ends
# up "second" is presumably arbitrary — confirm this is acceptable.
out = df_cassandra.where((df_cassandra.date > before_yesterday_str) &
                         (df_cassandra.date < tomorrow_str)) \
                  .select(df_cassandra.user_id, df_cassandra.hotel) \
                  .dropDuplicates(['user_id', 'hotel']) \
                  .rdd.map(lambda x: (x.user_id, x.hotel)) \
                  .groupByKey() \
                  .mapValues(_second_hotel)
out.collect()

[(0, 949),
 (1, 910),
 (2, 476),
 (3, 840),
 (4, 999),
 (5, 964),
 (6, 526),
 (7, 689),
 (8, 253),
 (9, 417),
 (10, 579),
 (11, 542),
 (12, 903),
 (13, 468),
 (14, 30),
 (15, 794),
 (16, 158),
 (17, 721),
 (18, 481),
 (19, 646),
 (20, 608),
 (21, 971),
 (22, 740),
 (23, 297),
 (24, 60),
 (25, 622),
 (26, 785),
 (27, 548),
 (28, 111),
 (29, 876),
 (30, 37)]