### Setup (only required for the first run on the Spark cluster)

In [1]:
# !pip install pandas

In [1]:
ROOT_PATH = '/data/tungtv/Code/dataset/dataset_cafebiz_full_45/'
#TODO: Upload this file (generated by the ACR module training) to GCS before calling spark script
# !gsutil cp {ROOT_PATH}/adressa_articles.csv.

### Loading dependencies

In [2]:
import os
import json
import pandas as pd
import pickle
import datetime
import hashlib
import math
import matplotlib
%matplotlib inline

In [3]:
import findspark
findspark.init()
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import SparkContext,SparkConf,SQLContext
from pyspark.sql import SparkSession
# from pyspark.sql.functions import pandas_udf
# from pyspark.sql.functions import PandasUDFType
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder.master('local[*]').appName('myAppName')\
.config("spark.local.dir","/data/tungtv/tmp/").getOrCreate()

In [5]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [6]:
print(spark.version)

2.2.0


## Loading pre-processed articles

In [7]:
# Load the pre-processed articles table from the dataset directory.
# The path is derived from ROOT_PATH so the dataset location is configured in one place.
articles_original_df = pd.read_csv(os.path.join(ROOT_PATH, 'cafebiz_articles.csv'))

In [8]:
articles_original_df.columns

Index(['id', 'content', 'created_at_ts', 'teaser', 'domain', 'keywords',
       'title', 'url', 'category0', 'persons', 'locations', 'text_highlights',
       'id_encoded', 'category0_encoded', 'keywords_encoded',
       'locations_encoded', 'persons_encoded'],
      dtype='object')

In [9]:
articles_original_df['url'][0]

'http://cafebiz.vn/gia-dinh-ty-phu-gop-113-trieu-usd-khoi-phuc-nha-tho-duc-ba-paris-giau-co-va-quyen-luc-co-nao-20190416190831.chn'

In [10]:
# Map each article URL to its encoded article id.
# The two columns are zipped directly instead of building tuples with a row-wise
# `apply`; `.tolist()` converts the column values to plain Python scalars.
# If a URL appears more than once, the last row wins (same as before).
valid_articles_urls_to_ids_dict = dict(zip(articles_original_df['url'].tolist(),
                                           articles_original_df['id_encoded'].tolist()))
valid_articles_urls_to_ids_dict

{'http://cafebiz.vn/gia-dinh-ty-phu-gop-113-trieu-usd-khoi-phuc-nha-tho-duc-ba-paris-giau-co-va-quyen-luc-co-nao-20190416190831.chn': 1,
 'http://cafebiz.vn/4-cong-trinh-lich-su-tai-chau-au-tung-bi-quy-lua-tan-cong-nhu-nha-tho-duc-ba-paris-20190416191427.chn': 2,
 'http://cafebiz.vn/tot-nghiep-dh-roi-chay-grab-2-nam-chang-trai-gay-tranh-cai-vi-quan-diem-ha-noi-khong-danh-cho-nhung-ke-sinh-ra-tu-lang-nhu-chung-ta-20190420083631.chn': 3,
 'http://cafebiz.vn/thanh-nien-30-tuoi-con-an-bam-bo-me-song-nho-tro-cap-that-nghiep-tro-thanh-trieu-phu-cha-de-cua-tua-game-pubg-ty-do-20190429095142.chn': 4,
 'http://cafebiz.vn/steve-jobs-co-1-thoi-quen-ma-bat-cu-nguoi-thanh-cong-nao-cung-thuong-lam-nghe-qua-ai-cung-tuong-de-nhung-chi-khi-bat-dau-moi-thay-kho-vo-cung-20190429154619.chn': 5,
 'http://cafebiz.vn/gia-bitcoin-tang-vot-20190503135004.chn': 6,
 'http://cafebiz.vn/chiem-nguong-ca-tram-ngoi-nha-duoc-xay-noi-tren-mat-nuoc-quan-the-kien-truc-dang-tu-hao-cua-amsterdam-20190507134028.chn': 7,
 'h

In [11]:
for key, valu in valid_articles_urls_to_ids_dict.items():    # for name, age in dictionary.iteritems():  (for Python 2.x)
    if valu == 1516:
        print(key)

http://cafebiz.vn/kinh-te-hoc-hon-nhan-vi-sao-phu-nu-it-duoc-yeu-hon-sau-khi-cuoi-20190413113206854.chn


In [12]:
len(valid_articles_urls_to_ids_dict)

5194

In [13]:
len(articles_original_df['id'].unique())

5194

In [14]:
articles_original_df['id'][0]

20190416190831

### Start Loading user interactions

In [15]:
df =  spark.read.parquet("/data/tungtv/jupytercode/data-log-news-parquet-thang45/") 

In [16]:
# df.count()

In [17]:
# df.printSchema()

In [18]:
# df.select("url").distinct().count()

In [19]:
df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- cookietime: string (nullable = true)
 |-- browser_code: integer (nullable = true)
 |-- browser_ver: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- os_version: string (nullable = true)
 |-- ip: long (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- pageloadId: string (nullable = true)
 |-- screen: string (nullable = true)
 |-- d_guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- utm_source: string (nullable = true)
 |-- utm_campaign: string (nullable = true)
 |-- utm_medium: string (nullable = true)
 |-- milis: long (nullable = true)
 |-- tos: long (nullable = true)
 |-- tor: long (nullable = true)
 |-- top: long (nullable = true)
 |-- scrollEnd: integer (nullable = true)
 |-- pageLoadTime: long (nu

In [20]:
# df.select("loc_id").where("loc_id = -1").count()

### End Loading user interactions

In [21]:
interactions_df = df.select("full_url","dt","os_code","loc_id","path","referer"\
                            ,"guid","category","devCode","id","content","created_at_ts"\
                            ,"teaser","title","keywords","time","url", "top")

In [22]:
# interactions_df.select("url").show(1,False)

In [23]:
interactions_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)



In [24]:
# interactions_df.count()

In [25]:
# interactions_df.select("id","guid","url").show(10, False)

In [26]:
#Retrieves the encoded article id from its canonical URL (because article ids in interactions sometimes do not match the articles table, but canonical URLs do)
def get_article_id_encoded_from_url(canonical_url):
    """Look up the encoded article id for a canonical URL.

    Returns the value stored under ``canonical_url`` in
    ``valid_articles_urls_to_ids_dict``, or ``None`` when the URL is not a key
    of that dict.
    """
    return valid_articles_urls_to_ids_dict.get(canonical_url)

# Spark UDF wrapper around the lookup, declared with an integer return type.
get_article_id_encoded_from_url_udf = F.udf(get_article_id_encoded_from_url, T.IntegerType())

In [27]:
# Keep only interactions whose URL is available in the articles table.
# The UDF yields null for URLs missing from the articles dict, so filtering on
# a non-null article_id drops those rows. `isNotNull()` replaces the
# `isNull() == False` comparison.
interactions_article_id_encoded_df = interactions_df.withColumn(
    'article_id', get_article_id_encoded_from_url_udf(interactions_df['url']))
interactions_filtered_df = interactions_article_id_encoded_df.filter(
    interactions_article_id_encoded_df['article_id'].isNotNull()).cache()

In [28]:
# interactions_filtered_df.select("article_id").distinct().count()

In [29]:
interactions_filtered_df.cache()

DataFrame[full_url: string, dt: string, os_code: int, loc_id: int, path: string, referer: string, guid: string, category: string, devCode: string, id: bigint, content: string, created_at_ts: bigint, teaser: string, title: string, keywords: string, time: bigint, url: string, top: bigint, article_id: int]

In [30]:
interactions_filtered_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)
 |-- article_id: integer (nullable = true)



In [31]:
#Valid interactions
# interactions_filtered_df.count()

In [32]:
#Distinct items count
# interactions_filtered_df.select('id').distinct().count()

In [33]:
# modified
# interactions_filtered_df.type

In [34]:
def check_numm(df, name_col):
    """Count the rows of ``df`` whose ``name_col`` value is null."""
    null_rows = df.filter(F.col(name_col).isNull())
    return null_rows.count()

In [35]:
# check_numm(interactions_filtered_df, "time")

In [36]:
# check_numm(interactions_filtered_df, "id")

In [37]:
first_timestamp_ts = interactions_filtered_df.select('time').agg(F.min('time')).collect()[0][0] * 1000
first_timestamp_ts

1554051934000

### Analyzing elapsed time since publishing

In [38]:
# addition 
from pyspark.sql import functions as f
from pyspark.sql import types as t
interactions_filtered_df = interactions_filtered_df.withColumn("publish_ts"\
                            ,F.to_timestamp(interactions_filtered_df.created_at_ts.cast(dataType=t.TimestampType())))


In [39]:
interactions_filtered_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- publish_ts: timestamp (nullable = true)



In [39]:
# interactions_filtered_df.select("publish_ts").show(1)

+-------------------+
|         publish_ts|
+-------------------+
|2019-05-06 17:30:00|
+-------------------+
only showing top 1 row



In [40]:
# Convert the publish_ts timestamp column back to a numeric unix timestamp
# (the schema printed below shows it as a long).
# `F.col` is used instead of the bare `col`, which is only in scope through the
# wildcard `from pyspark.sql.functions import *`.
interactions_filtered_df = interactions_filtered_df.withColumn(
    "publish_ts", F.unix_timestamp(F.col("publish_ts"), 'yyyy-MM-dd HH:mm:ss'))

In [41]:
interactions_filtered_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- publish_ts: long (nullable = true)



In [42]:
#interactions_filtered_df.filter(interactions_filtered_df['time'].isNull()).count()
#0

In [43]:
# interactions_filtered_with_publish_ts_df = interactions_filtered_df.withColumn('publish_ts', get_timestamp_from_date_str_udf(interactions_filtered_df['publishtime']))
# interactions_filtered_with_publish_ts_df = interactions_filtered_with_publish_ts_df.withColumn('elapsed_min_since_published', ((F.col('time') - F.col('publish_ts')) / 60).cast(pyspark.sql.types.IntegerType()))

In [44]:
 # Elapsed time between publication and the click: (time - publish_ts) / 60, cast to an integer column.
 # NOTE(review): the quantiles and describe() output printed below contain negative values,
 # i.e. clicks recorded before publish_ts -- confirm both columns use the same time reference.
 interactions_filtered_with_publish_ts_df = interactions_filtered_df.withColumn(
     'elapsed_min_since_published',
     ((F.col('time') - F.col('publish_ts')) / 60).cast(T.IntegerType()))

In [45]:
interactions_filtered_with_publish_ts_df.select('elapsed_min_since_published').show(4)

+---------------------------+
|elapsed_min_since_published|
+---------------------------+
|                      24339|
|                       9713|
|                       9630|
|                      10384|
+---------------------------+
only showing top 4 rows



In [46]:
#interactions_filtered_with_publish_ts_df.select('publishtime','publish_ts', 'time', 'elapsed_min_since_published').show(100)

In [47]:
%%time
interactions_filtered_with_publish_ts_df.approxQuantile("elapsed_min_since_published", [0.10, 0.25, 0.50, 0.75, 0.90], 0.01)
#[49.0, 108.0, 334.0, 1020.0, 4611.0]

CPU times: user 11 ms, sys: 6.96 ms, total: 18 ms
Wall time: 2.31 s


[-366.0, -219.0, 504.0, 2186.0, 3710.0]

In [48]:
elapsed_min_since_published_df = interactions_filtered_with_publish_ts_df.select('elapsed_min_since_published').toPandas()
print(len(elapsed_min_since_published_df[pd.isnull(elapsed_min_since_published_df['elapsed_min_since_published'])]))
elapsed_min_since_published_df.describe()

0


Unnamed: 0,elapsed_min_since_published
count,6409658.0
mean,2041.369
std,6369.303
min,-54204.0
25%,-232.0
50%,508.0
75%,2174.0
max,86814.0


In [49]:

'''
elapsed_min_since_published
count	2.600818e+06
mean	6.438622e+04
std	5.051825e+05
min	-3.151590e+05
25%	9.400000e+01
50%	2.580000e+02
75%	8.370000e+02
max	8.608278e+06
'''

'\nelapsed_min_since_published\ncount\t2.600818e+06\nmean\t6.438622e+04\nstd\t5.051825e+05\nmin\t-3.151590e+05\n25%\t9.400000e+01\n50%\t2.580000e+02\n75%\t8.370000e+02\nmax\t8.608278e+06\n'

### Analyzing clicks by article distribution

In [50]:
#clicks_by_article_count_df = interactions_filtered_df.groupBy('article_id').count()
#clicks_by_article_count_df.approxQuantile("count", [0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99], 0.01)
#[1.0, 1.0, 1.0, 1.0, 2.0, 6.0, 33581.0]

### Processing categorical features

In [51]:
def get_categ_features_counts_dataframe(interactions_spark_df,column_name):
    """Count interactions per distinct value of ``column_name``.

    Groups the Spark DataFrame by ``column_name``, counts each group, converts
    the result to pandas and returns it sorted by ``count`` in descending order.
    """
    grouped_counts = interactions_spark_df.groupBy(column_name).count()
    counts_pdf = grouped_counts.toPandas()
    return counts_pdf.sort_values(by='count', ascending=False)

In [52]:
# Reserved tokens: they always receive ids 0 and 1 in every encoder dict.
PAD_TOKEN = '<PAD>'
UNFREQ_TOKEN = '<UNF>'

def get_encoder_for_values(values):
    """Build a value -> integer id mapping with the reserved tokens first.

    ``PAD_TOKEN`` gets id 0, ``UNFREQ_TOKEN`` gets id 1, and the given values
    follow in their original order starting at id 2.

    Args:
        values: iterable of category values to encode.

    Returns:
        dict mapping each token/value to its integer id.
    """
    # list(...) lets any iterable be passed, not only a list.
    encoder_values = [PAD_TOKEN, UNFREQ_TOKEN] + list(values)
    return {value: idx for idx, value in enumerate(encoder_values)}

def get_categ_features_encoder_dict(counts_df, min_freq=100):
    """Build an encoder dict for the values that are frequent enough.

    Args:
        counts_df: pandas DataFrame whose first column holds the category
            values and which has a ``count`` column.
        min_freq: minimum ``count`` a value needs to receive its own id.
            Previously this argument was ignored and 100 was always used.

    Returns:
        dict produced by ``get_encoder_for_values`` for the frequent values.
    """
    value_column = counts_df.columns[0]
    frequent_mask = counts_df['count'] >= min_freq
    freq_values = counts_df.loc[frequent_mask, value_column].tolist()
    return get_encoder_for_values(freq_values)

def encode_cat_feature(value, encoder_dict):
    """Return the id of ``value``, or the ``UNFREQ_TOKEN`` id when it is unknown."""
    if value in encoder_dict:
        return encoder_dict[value]
    return encoder_dict[UNFREQ_TOKEN]

In [53]:
# countries_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'country')
# len(countries_df)

In [54]:
# countries_encoder_dict = get_categ_features_encoder_dict(countries_df)
# len(countries_encoder_dict)

In [55]:
cities_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'loc_id')
len(cities_df)

223

In [56]:
cities_encoder_dict = get_categ_features_encoder_dict(cities_df)
len(cities_encoder_dict)

135

In [58]:
# regions_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'region')
# len(regions_df)

In [59]:
# regions_encoder_dict = get_categ_features_encoder_dict(regions_df)
# len(regions_encoder_dict)

In [60]:
# devices_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'devCode')
# print(len(devices_df))
# devices_df

In [61]:
# devices_encoder_dict = get_categ_features_encoder_dict(devices_df)
# len(devices_encoder_dict)

In [62]:
os_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'os_code')
print(len(os_df))
os_df

7


Unnamed: 0,os_code,count
5,10,5431067
6,2,436473
4,8,375549
3,9,106133
0,1,58091
2,5,2061
1,3,284


In [63]:
os_encoder_dict = get_categ_features_encoder_dict(os_df)
len(os_encoder_dict)

9

In [64]:
# referrer_class_df = get_categ_features_counts_dataframe(interactions_filtered_df, 'referrerHostClass')
# print(len(referrer_class_df))
# referrer_class_df

In [65]:
# referrer_class_encoder_dict = get_categ_features_encoder_dict(referrer_class_df)
# len(referrer_class_encoder_dict)

In [66]:
encoders_dict = {
    'city': cities_encoder_dict,
#     'region': regions_encoder_dict,
#     'country': countries_encoder_dict,
    'os': os_encoder_dict,
#     'device': devices_encoder_dict,
#     'referrer_class': referrer_class_encoder_dict
}

### Processing numeric features

In [67]:
%%time
active_time_quantiles = interactions_filtered_df.approxQuantile("top", [0.10, 0.25, 0.50, 0.75, 0.90], 0.01)
print(active_time_quantiles)

[0.0, 13.0, 62.0, 168.0, 393.0]
CPU times: user 8.11 ms, sys: 12.1 ms, total: 20.2 ms
Wall time: 805 ms


In [68]:
active_time_stats_df = interactions_filtered_df.describe('top').toPandas()
active_time_stats_df

Unnamed: 0,summary,top
0,count,6409658.0
1,mean,227.3538165374814
2,stddev,1178.4637884787269
3,min,0.0
4,max,85473.0


In [69]:
active_time_mean = float(active_time_stats_df[active_time_stats_df['summary'] == 'mean']['top'].values[0])
active_time_stddev = float(active_time_stats_df[active_time_stats_df['summary'] == 'stddev']['top'].values[0])

In [70]:
interactions_filtered_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- guid: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- publish_ts: long (nullable = true)



In [71]:
interactions_filtered_df.select("guid").show(1)

+-------------------+
|               guid|
+-------------------+
|8830407671934495734|
+-------------------+
only showing top 1 row



In [72]:
interactions_filtered_df = interactions_filtered_df.withColumnRenamed("guid", "user_id")

In [73]:
interactions_filtered_df.printSchema()

root
 |-- full_url: string (nullable = true)
 |-- dt: string (nullable = true)
 |-- os_code: integer (nullable = true)
 |-- loc_id: integer (nullable = true)
 |-- path: string (nullable = true)
 |-- referer: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- devCode: string (nullable = true)
 |-- id: long (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at_ts: long (nullable = true)
 |-- teaser: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- time: long (nullable = true)
 |-- url: string (nullable = true)
 |-- top: long (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- publish_ts: long (nullable = true)



### Splitting sessions

In [74]:
'''
schema = T.StructType([
    T.StructField("userId", T.StringType()),
    T.StructField("min_ts", T.IntegerType())
])

@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def split_sessions(df):

    result_df = df[['userId']]
    result_df['min_ts'] = df['time'].min()
    
    return result

%%time
tmp = interactions_filtered_df.groupBy('userId').apply(split_sessions)
tmp.show(100)
'''

'\nschema = T.StructType([\n    T.StructField("userId", T.StringType()),\n    T.StructField("min_ts", T.IntegerType())\n])\n\n@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)\ndef split_sessions(df):\n\n    result_df = df[[\'userId\']]\n    result_df[\'min_ts\'] = df[\'time\'].min()\n    \n    return result\n\n%%time\ntmp = interactions_filtered_df.groupBy(\'userId\').apply(split_sessions)\ntmp.show(100)\n'

In [75]:
def hash_str_to_int(encoded_bytes_text, digits):
    """Derive a short integer from the MD5 hash of ``encoded_bytes_text``.

    The first 8 hex characters of the MD5 digest are read as an integer, and
    the leading ``digits`` decimal digits of that number are returned as an int.
    """
    hex_prefix = hashlib.md5(encoded_bytes_text).hexdigest()[:8]
    decimal_repr = str(int(hex_prefix, 16))
    return int(decimal_repr[:digits])

In [76]:
# Longest allowed gap between two consecutive interactions of one session: 30 minutes, in milliseconds.
MAX_SESSION_IDLE_TIME_MS = 1000 * 60 * 30

def close_session(session):
    """Package a list of click dicts into a single session Row.

    The session id is artificial: the first click's timestamp multiplied by
    100, plus a hash of the user id computed with ``hash_str_to_int(..., 3)``.
    ``session_hour`` is the elapsed time between the first click and
    ``first_timestamp_ts`` expressed in hours and truncated with ``int()``.
    """
    opening_click = session[0]
    start_ts = opening_click['timestamp']
    owner_id = opening_click['user_id']

    user_hash = hash_str_to_int(owner_id.encode(), 3)
    ms_per_hour = 1000 * 60 * 60

    #TODO add 'view' here
    # Each click becomes a Row as well, so the RDD can be converted back to a DataFrame.
    return T.Row(
        session_id=(int(start_ts) * 100) + user_hash,
        session_hour=int((start_ts - first_timestamp_ts) / ms_per_hour),
        session_size=len(session),
        session_start=start_ts,
        user_id=owner_id,
        clicks=[T.Row(**click) for click in session]
    )
        
def transform_interaction(interaction):
    """Convert one interaction row into the click dict used to build sessions.

    ``loc_id`` and ``os_code`` are encoded with the 'city' and 'os' encoders
    from ``encoders_dict``. The country, region, device and referrer_class
    features are not produced here (their encoders are commented out in
    ``encoders_dict``).
    """
    click = dict(
        article_id=interaction['article_id'],
        url=interaction['full_url'],
        user_id=interaction['user_id'],
        # `time` is multiplied by 1000 so it is comparable with the *_MS constants.
        timestamp=interaction['time'] * 1000,
        active_time_secs=interaction['top'],
        city=encode_cat_feature(interaction['loc_id'], encoders_dict['city']),
        os=encode_cat_feature(interaction['os_code'], encoders_dict['os'])
    )
    return click

def split_sessions(group):
    """Split one user's interactions into sessions.

    Args:
        group: a ``(user, interactions)`` pair, as produced by ``groupByKey``.

    The interactions are sorted by ``time`` and converted with
    ``transform_interaction``. A new session starts whenever the gap to the
    previous interaction exceeds ``MAX_SESSION_IDLE_TIME_MS``. Within a session
    an article is kept only the first time it appears. Sessions with fewer than
    2 clicks are discarded (2 is the minimum for next-click prediction).

    Returns:
        list of ``(session_id, session_row)`` tuples, one per kept session.
    """
    _user, interactions = group
    # Ensuring items are sorted by time
    interactions_sorted_by_time = sorted(interactions, key=lambda x: x['time'])
    interactions_transformed = [transform_interaction(item) for item in interactions_sorted_by_time]

    # Nothing to split for an empty group (avoids an IndexError below).
    if not interactions_transformed:
        return []

    sessions = []
    session = []
    # Article ids already present in the current session; a set keeps the
    # repeated-item check O(1) instead of rescanning the session per click.
    seen_article_ids = set()
    last_timestamp = interactions_transformed[0]['timestamp']

    for interaction in interactions_transformed:
        delta_ms = interaction['timestamp'] - last_timestamp
        interaction['_elapsed_ms_since_last_click'] = delta_ms

        if delta_ms <= MAX_SESSION_IDLE_TIME_MS:
            # Ignoring repeated items in session
            if interaction['article_id'] not in seen_article_ids:
                seen_article_ids.add(interaction['article_id'])
                session.append(interaction)
        else:
            # Idle gap exceeded: close the current session if it is long enough
            # and start a new one with this interaction.
            if len(session) >= 2:
                sessions.append(close_session(session))
            session = [interaction]
            seen_article_ids = {interaction['article_id']}

        # Updated for every interaction, including skipped repeats.
        last_timestamp = interaction['timestamp']

    # Flush the trailing session.
    if len(session) >= 2:
        sessions.append(close_session(session))

    return [(session_row['session_id'], session_row) for session_row in sessions]

In [77]:
'''
#To debug
%%time
sessions_rdd = interactions_filtered_df.limit(1000).rdd.map(lambda x: (x['userId'], x)).groupByKey() \
                    .collect()

for row in sessions_rdd:
    print(split_sessions(row))
    print()
'''

"\n#To debug\n%%time\nsessions_rdd = interactions_filtered_df.limit(1000).rdd.map(lambda x: (x['userId'], x)).groupByKey()                     .collect()\n\nfor row in sessions_rdd:\n    print(split_sessions(row))\n    print()\n"

In [78]:
interactions_filtered_df.cache()

DataFrame[full_url: string, dt: string, os_code: int, loc_id: int, path: string, referer: string, user_id: string, category: string, devCode: string, id: bigint, content: string, created_at_ts: bigint, teaser: string, title: string, keywords: string, time: bigint, url: string, top: bigint, article_id: int, publish_ts: bigint]

In [None]:
# interactions_filtered_df.select("guid").show(1)

In [None]:
# strn = "7508097301953125944"
# hash_str_to_int(strn.encode(), 3)

In [None]:
# interactions_filtered_df.printSchema()

In [None]:
# interactions_filtered_df.select("guid").show()

In [79]:
interactions_filtered_df = interactions_filtered_df.orderBy("time")

In [80]:
# interactions_filtered_df.select("time").show(5)

+----------+
|      time|
+----------+
|1554051934|
|1554052612|
|1554054411|
|1554054481|
|1554054967|
+----------+
only showing top 5 rows



In [81]:
%%time
sessions_rdd = interactions_filtered_df.rdd.map(lambda x: (x['user_id'], x)).groupByKey() \
                            .flatMap(split_sessions) \
                            .sortByKey() \
                            .map(lambda x: x[1])

CPU times: user 196 ms, sys: 39.6 ms, total: 236 ms
Wall time: 9min 22s


In [None]:
# sessions_rdd

#### Exporting sessions to JSON lines

In [None]:
# sessions_rdd.toDF().select("session_hour").orderBy("session_hour").distinct().show()

In [82]:
sessions_sdf = sessions_rdd.toDF()

In [84]:
%%time
sessions_sdf.write.partitionBy("session_hour").json(os.path.join(ROOT_PATH,"sessions_processed_by_spark/"))

CPU times: user 33.2 ms, sys: 6.36 ms, total: 39.5 ms
Wall time: 7.03 s


In [None]:
sessions_sdf.count()

In [85]:
def serialize(filename, obj):
    """Pickle ``obj`` (default protocol) and write it to ``filename`` in binary mode."""
    payload = pickle.dumps(obj)
    with open(filename, mode='wb') as out_file:
        out_file.write(payload)

In [86]:
NAR_ENCODERS_PATH  = 'nar_encoders_cafebiz.pickle'
serialize(NAR_ENCODERS_PATH, encoders_dict)

In [87]:
!cp {NAR_ENCODERS_PATH} {ROOT_PATH}/pickles/