# Section 1 Initialization
## Section 1.1 Copy results from "UK_HomeDetection_IMC20" and "IMC2020_AntennaIMDAssignment" to HDFS

In [1]:
%%bash
# Upload the antenna/IMD tables and the home-detection results from the local
# filesystem to their per-city HDFS directories.
#
# Stop at the first failing command: without this, %%bash only reports the exit
# status of the last command and an earlier failed upload goes unnoticed.
set -euo pipefail

# Make sure every destination directory exists (-p: no error if it already does;
# parent directories QoE/<City>/ are created as needed).
hdfs dfs -mkdir -p QoE/London/London_Jan_2020/ QoE/Birmingham/Birmingham_Jan_2020/ QoE/Liverpool/Liverpool_Jan_2020/

# Antenna -> IMD assignment tables, one per city.
# -f overwrites an existing destination so the cell can be re-run after the
# local files have been regenerated.
hdfs dfs -copyFromLocal -f datasets/Telefonica_Antenna/XG/telefonica_antenna_london_imd_pd.csv QoE/London/
hdfs dfs -copyFromLocal -f datasets/Telefonica_Antenna/XG/telefonica_antenna_birmingham_imd_pd.csv QoE/Birmingham/
hdfs dfs -copyFromLocal -f datasets/Telefonica_Antenna/XG/telefonica_antenna_liverpool_imd_pd.csv QoE/Liverpool/

# Home-detection results for January 2020, one per city.
hdfs dfs -copyFromLocal -f datasets/012020/home_ldn.csv QoE/London/London_Jan_2020/
hdfs dfs -copyFromLocal -f datasets/012020/home_birm.csv QoE/Birmingham/Birmingham_Jan_2020/
hdfs dfs -copyFromLocal -f datasets/012020/home_lpool.csv QoE/Liverpool/Liverpool_Jan_2020/

## Section 1.2 PySpark and Python modules

In [1]:
import os

# Environment for the PySpark session, set in a single update:
#   SPARK_HOME           - location of the Spark installation
#   PYSPARK_SUBMIT_ARGS  - local master on all cores, client deploy mode, and
#                          the spark-csv package used to read/write CSV files
# NOTE(review): the spark-csv artifact is the Scala 2.11 build (_2.11) — confirm
# that it matches the Scala version of the Spark distribution in SPARK_HOME.
os.environ.update({
    "SPARK_HOME": '/usr/local/spark/spark-1.6.2-bin-hadoop2.6',
    'PYSPARK_SUBMIT_ARGS': "--master local[*] --deploy-mode client --packages com.databricks:spark-csv_2.11:1.3.0 pyspark-shell",
})

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark import sql
from pyspark.sql import HiveContext, Window, SQLContext
from pyspark.sql.types import DoubleType, StructType, StructField, StringType
from pyspark.sql.functions import col, lit, count, sum, avg, max, array

print('starting')
conf = SparkConf().setAppName('ML_datasets')
# getOrCreate returns the SparkContext already running in this kernel, if any,
# and only builds a new one from `conf` otherwise. Calling SparkContext(conf=conf)
# directly raises "Cannot run multiple SparkContexts at once" when this cell is
# executed a second time without restarting the kernel.
sc = SparkContext.getOrCreate(conf)

# we need HiveContext to use Hive builtin functions:
# hive builtin functions : https://support.treasuredata.com/hc/en-us/articles/360001457367-Hive-Built-in-Aggregate-Functions
sqlContext = HiveContext(sc)

print('finished')

starting
finished


## Section 1.3 Define the datasets we are going to use

In [2]:
# One record per city, in processing order:
# (HDFS data directory, output-file prefix, home-detection CSV, antenna/IMD CSV).
city_datasets = [
    ('QoE/Liverpool/Liverpool_Jan_2020/',
     'lpool_012020',
     'home_lpool.csv',
     'QoE/Liverpool/telefonica_antenna_liverpool_imd_pd.csv'),
    ('QoE/London/London_Jan_2020/',
     'lndn_012020',
     'home_ldn.csv',
     'QoE/London/telefonica_antenna_london_imd_pd.csv'),
    ('QoE/Birmingham/Birmingham_Jan_2020/',
     'birm_012020',
     'home_birm.csv',
     'QoE/Birmingham/telefonica_antenna_birmingham_imd_pd.csv'),
]

# Split the records into four index-aligned lists (entry i of each list
# belongs to the same city).
data_dirs, output_files, home_antenna_files, antenna_info_files = [
    list(column) for column in zip(*city_datasets)
]

# Day sub-directories (two-digit strings) read under each data directory.
days = ['01', '02', '03']

# Schema of the flattened records: one row per (device, antenna, day) with the
# time spent. Every field is nullable.
data_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('device_id', StringType()),
        ('antenna_id', StringType()),
        ('time_spent', DoubleType()),
        ('dt', StringType()),
    )
])

# Mobility/CDR features

In [3]:
# Build time-spent features per resident (device) and per antenna for each
# configured city, and write them as CSV next to the input data.
#
# NOTE(review): the [:1] slices restrict the run to the first configured city
# only; drop them to process every entry of data_dirs — confirm which is intended.
for data_dir,output_file,home_antenna,antenna_info_file in zip(data_dirs[:1],output_files[:1],home_antenna_files[:1],antenna_info_files[:1]):
    # Home-detection table for this city. Its antenna_id column is renamed so
    # it does not collide with the visited antenna_id in the joins below.
    home_antenna_df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(data_dir + home_antenna)
    home_antenna_df = home_antenna_df.withColumnRenamed('antenna_id', 'antenna_idd')
    home_antenna_df = home_antenna_df.drop('geometry')

    # Antenna metadata; 'lkey' becomes the join key 'antenna_id'.
    antenna_info = sqlContext.read.format('com.databricks.spark.csv').option('header', 'true').option('inferSchema', 'true').load(antenna_info_file)
    antenna_info = antenna_info.drop('geometry_voronoi')
    antenna_info = antenna_info.drop('geometry')
    antenna_info = antenna_info.withColumnRenamed('lkey', 'antenna_id')

    # Accumulate all days into one DataFrame, starting from an empty frame
    # that carries data_schema.
    data_df = sqlContext.createDataFrame(sc.emptyRDD(), data_schema)
    for day in days:
        # Each input line is tab-separated. Field 0 is used as the device id and
        # fields from index 5 onwards are read as alternating
        # (antenna_id, time_spent) pairs.
        data_temp = sc.textFile(data_dir + day + '/hour=*/part-*').map(lambda x: x.split('\t'))
        # The range stops at len(x) - 1 so that a record with a dangling last
        # field (an antenna id without a time value) is skipped rather than
        # raising IndexError on x[i+1] and failing the whole job.
        data_temp_df = data_temp.flatMap(lambda x: [(x[0], x[i], x[i+1]) for i in range(5, len(x) - 1, 2)])\
                                .toDF(('device_id', 'antenna_id', 'time_spent'))\
                                .select('device_id',
                                        'antenna_id',
                                        # split() yields strings; cast so the column matches data_schema
                                        col('time_spent').cast(DoubleType()).alias('time_spent'),
                                        lit(day).alias('dt'))
        # unionAll matches columns by position; the select above keeps data_schema's order.
        data_df = data_df.unionAll(data_temp_df)

    # Records of devices that have a home antenna, enriched with the metadata
    # of the antenna they were seen on. All three aggregations below start from
    # this same join, so it is built once and cached instead of being
    # recomputed for every pivot and every write.
    resident_visits_df = data_df.join(home_antenna_df,'device_id')\
                                .join(antenna_info, 'antenna_id')\
                                .cache()

    # Per device and day: total time spent per antenna 'generation' value,
    # then averaged over days per device.
    ResidentSumGenTimeSpent_df = resident_visits_df.groupby('device_id','dt')\
                                                   .pivot('generation')\
                                                   .agg(sum('time_spent').alias('sum(time_spent)'))\
                                                   .fillna(0)
    ResidentSumGenTimeSpent_df = ResidentSumGenTimeSpent_df.groupby('device_id').mean()

    # Per device and day: total time spent per 'IMDDecil' value, averaged over
    # days per device and joined with the per-generation features.
    ResidentSumGenImdTimeSpent_df = resident_visits_df.groupby('device_id','dt')\
                                                      .pivot('IMDDecil')\
                                                      .agg(sum('time_spent'))\
                                                      .fillna(0)
    ResidentSumGenImdTimeSpent_df = ResidentSumGenImdTimeSpent_df.groupby('device_id').mean()\
                                                                 .join(ResidentSumGenTimeSpent_df,'device_id')

    # Per antenna and day: total time spent and number of device records per
    # 'IMDDecil' value, averaged over days per antenna.
    AntennatSumImdTimeSpentCntDev_df = resident_visits_df.groupby('antenna_id','dt')\
                                                         .pivot('IMDDecil')\
                                                         .agg(sum('time_spent'),count('device_id'))\
                                                         .fillna(0)
    AntennatSumImdTimeSpentCntDev_df = AntennatSumImdTimeSpentCntDev_df.groupby('antenna_id').mean()

    # Write each feature table as a single CSV part file with a header row.
    # NOTE(review): mode('append') adds a second copy of the rows to the same
    # output directory every time this cell is re-run — confirm this is
    # intended, or switch to 'overwrite'.
    ResidentSumGenImdTimeSpent_df.coalesce(1).write.mode('append').format('com.databricks.spark.csv').option('header', 'true').save(data_dir + output_file + '_' + 'ResidentSumGenImdTimeSpent')
    AntennatSumImdTimeSpentCntDev_df.coalesce(1).write.mode('append').format('com.databricks.spark.csv').option('header', 'true').save(data_dir + output_file + '_' + 'AntennatSumImdTimeSpentCntDev')

    # Release the cached join before moving on to the next city.
    resident_visits_df.unpersist()