# Section 1 Initialization 

In [None]:
%%bash
# Upload each city's home-antenna CSV from the local datasets folder into that
# city's January 2020 directory on HDFS (London, Birmingham, Liverpool, in that order).
for entry in London:home_ldn.csv Birmingham:home_birm.csv Liverpool:home_lpool.csv; do
    city="${entry%%:*}"      # text before the colon
    csv_name="${entry##*:}"  # text after the colon
    hdfs dfs -copyFromLocal "datasets/012020/${csv_name}" "QoE/${city}/${city}_Jan_2020/"
done

## Section 1.1 PySpark and Python modules

In [1]:
import os

# Environment for PySpark: location of the Spark install, and the arguments used
# to launch it (local master using all cores, client deploy mode, spark-csv package).
# Both are set here before pyspark is imported.
# NOTE(review): the stock Spark 1.6.2 binaries are built against Scala 2.10, while
# this requests the _2.11 artifact of spark-csv -- confirm the suffix matches the
# Scala version of the Spark build installed under SPARK_HOME.
os.environ["SPARK_HOME"] = '/usr/local/spark/spark-1.6.2-bin-hadoop2.6'
os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[*] --deploy-mode client --packages com.databricks:spark-csv_2.11:1.3.0 pyspark-shell"

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark import sql
from pyspark.sql import HiveContext, Window, SQLContext
from pyspark.sql.types import DoubleType, StructType, StructField, StringType
# NOTE: `sum` and `max` imported here shadow the Python builtins of the same name
# for the rest of the notebook; later cells rely on them being the Spark column functions.
from pyspark.sql.functions import col, lit, count, sum, avg, max, array

print('starting')
conf = SparkConf().setAppName('UserPerf')
# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel; calling SparkContext(conf=conf) a second time raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# we need HiveContext to use Hive builtin functions:
# hive builtin functions : https://support.treasuredata.com/hc/en-us/articles/360001457367-Hive-Built-in-Aggregate-Functions
sqlContext = HiveContext(sc)

print('finished')

starting
finished


## Section 1.2 Define datasets we are going to use

In [2]:
# you can read .gz files directly:
# https://stackoverflow.com/questions/16302385/is-gzip-format-supported-in-spark

# The three lists below are positionally aligned (one entry per city, same order
# in each list): the aggregation cell walks them together with zip().

# Per-city HDFS directory holding the input data; outputs are written under it too.
data_dirs = ['QoE/Liverpool/Liverpool_Jan_2020/',
             'QoE/London/London_Jan_2020/',
             'QoE/Birmingham/Birmingham_Jan_2020/']

# Per-city base name of the aggregated output (appended to the matching data_dir).
output_files = ['Liverpool_userperf_Jan_2020',
                'London_userperf_Jan_2020',
                'Birmingham_userperf_Jan_2020']

# Per-city home-antenna CSV file name inside the matching data_dir.
home_antenna_files = ['home_lpool.csv',
                      'home_ldn.csv',
                      'home_birm.csv']

# zip() silently stops at the shortest list, so a missing entry would drop a
# city without any error; fail loudly instead.
assert len(data_dirs) == len(output_files) == len(home_antenna_files), \
    'data_dirs, output_files and home_antenna_files must have one entry per city'

def _webanalytics_v1_field_names():
    """Return the 275 column names of the Webanalytics v1 schema, in order.

    Layout:
      * 'userid' (the first field keeps this name here; the aggregation cell
        joins it against the home-antenna `device_id` column and then drops it),
      * for each technology suffix (2g, 3g, 4g, all): ten statistics for each of
        rtt / thput / retx / bytes, followed by 'reqs<tech>' and 'bytes<tech>',
      * byte and request totals per protocol (http, https, tcp),
      * a '_reqs' / '_bytes' pair for every application category.
    """
    stat_suffixes = ('p10', 'p25', 'p50', 'p75', 'p90',
                     'min', 'max', 'avg', 'stdev', 'samples')
    app_categories = ('unknown', 'tech', 'advert', 'pshop', 'busi', 'o2pc', 'cmc',
                      'mail', 'pnews', 'stream', 'photo', 'cdn', 'search', 'enter',
                      'ptrav', 'game', 'pspor', 'porn', 'ppers', 'chat', 'email',
                      'ref', 'heal', 'portal', 'gamb', 'kidz', 'bm_unks', 'pfina',
                      'realty', 'food', 'host', 'auto', 'blog', 'mobe', 'music',
                      'pjobs', 'edu', 'place', 'pglam', 'hobby', 'voip', 'pets',
                      'gov', 'peer', 'forum', 'reli', 'law', 'noprof', 'o2bw',
                      'artmus')

    names = ['userid']

    for tech in ('2g', '3g', '4g', 'all'):
        for metric in ('rtt', 'thput', 'retx', 'bytes'):
            for stat in stat_suffixes:
                names.append(metric + tech + '_' + stat)
        names.append('reqs' + tech)
        names.append('bytes' + tech)

    # The 2g bytes standard-deviation column is the one irregular name in the
    # schema: it is spelled '_std' rather than '_stdev'. Kept as-is.
    names[names.index('bytes2g_stdev')] = 'bytes2g_std'

    for quantity in ('bytes', 'reqs'):
        for protocol in ('http', 'https', 'tcp'):
            names.append(quantity + protocol)

    for category in app_categories:
        names.append('app_' + category + '_reqs')
        names.append('app_' + category + '_bytes')

    return names


# Every column is declared as a nullable string.
Webanalytics_v1_schema = StructType(list(
    StructField(field_name, StringType(), True)
    for field_name in _webanalytics_v1_field_names()
))

# Feature columns averaged per user for the ML dataset: for 3g and then 4g,
# the nine distribution statistics (no sample counts) of rtt, thput, retx and bytes.
# Order: technology, then metric, then statistic.
userperf_features = list(
    metric + tech + '_' + stat
    for tech in ('3g', '4g')
    for metric in ('rtt', 'thput', 'retx', 'bytes')
    for stat in ('p10', 'p25', 'p50', 'p75', 'p90', 'min', 'max', 'avg', 'stdev')
)

# Section 2 Aggregated UserPerf Metrics for whole dataset
## Section 2.1 Data gathering and transformation

In [16]:
# For every city (data_dir / output_file / home_antenna are aligned by position):
#   1. load the home-antenna CSV (header row, inferred column types);
#   2. read the raw UserPerf part files as text and clean each line:
#        - u''  -> null            (empty values become the string 'null')
#        - strip [ ] brackets, the u' prefix and remaining single quotes
#        - split on ','
#      (presumably the files hold Python-2 list reprs -- verify against the producer)
#   3. apply Webanalytics_v1_schema to the cleaned rows;
#   4. left-join to the home-antenna table on userid == device_id, drop userid and
#      keep only rows that matched an antenna (antenna_id not null);
#   5. write two per-user aggregates as CSV under data_dir.
#
# `sum` and `avg` here are the pyspark.sql.functions versions imported in the setup cell.

for data_dir, output_file, home_antenna in zip(data_dirs, output_files, home_antenna_files):
    home_antenna_df = (sqlContext.read.format("com.databricks.spark.csv")
                       .option("header", "true")
                       .option("inferSchema", "true")
                       .load(data_dir + home_antenna))

    raw_df = (sc.textFile(data_dir + 'UserPerf/*/part-*')
              .map(lambda x: x.replace("u''", "null").replace("[", "").replace("]", "").replace("u'", "").replace("'", '').split(','))
              .toDF(schema=Webanalytics_v1_schema))

    # Keep the joined, per-record frame under its own name: both aggregations
    # below need its raw metric columns. Reassigning data_df to the first
    # aggregate before computing the second one made the second groupby fail,
    # because the userperf_features columns no longer existed at that point.
    joined_df = (raw_df
                 .join(home_antenna_df, [home_antenna_df['device_id'] == raw_df['userid']], 'left')
                 .drop('userid')
                 .filter(col('antenna_id').isNotNull()))

    # These key columns are not part of Webanalytics_v1_schema, so they come
    # from the home-antenna CSV.
    group_keys = ['device_id', 'geometry', 'antenna_id']

    # Per-user traffic totals and average rtt / retransmission figures.
    data_df = joined_df.groupby(group_keys).agg(sum('bytesall'), sum('bytes4g'), sum('bytes3g'), avg('rtt4g_avg'), avg('retx4g_avg'), avg('rtt3g_avg'), avg('retx3g_avg'), avg('rttall_avg'), avg('retxall_avg'))

    # Per-user averages of every 3g/4g feature column, for the ML dataset.
    exprs = [avg(x) for x in userperf_features]
    UserMLDataset = joined_df.groupby(group_keys).agg(*exprs)

    # coalesce(1) yields a single part file per write.
    # NOTE(review): mode('append') adds to any existing output, so re-running this
    # cell leaves duplicate data in the same directories -- confirm that is intended.
    UserMLDataset.coalesce(1).write.mode('append').format('com.databricks.spark.csv').option('header', 'true').save(data_dir + output_file + '_MLdataset')
    data_df.coalesce(1).write.mode('append').format('com.databricks.spark.csv').option('header', 'true').save(data_dir + output_file)