## Imports

In [1]:
import argparse
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from hops import hdfs
from pyspark.sql.functions import udf
from pyspark.sql.types import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1944,application_1512575073636_0472,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## Constants

In [2]:
# Locations of the raw sensor CSV files under the project's HAR_Dataset directory.
project_path = "/Projects/" + hdfs.project_name()
PA_DATA = project_path + "/HAR_Dataset/Phones_accelerometer.csv"
PG_DATA = project_path + "/HAR_Dataset/Phones_gyroscope.csv"
WA_DATA = project_path + "/HAR_Dataset/Watch_accelerometer.csv"
WG_DATA = project_path + "/HAR_Dataset/Watch_gyroscope.csv"

# Column layout applied when reading the CSV files; every column is nullable.
schema_flow = StructType([
    StructField('Index', IntegerType(), True),
    StructField('Arrival_Time', LongType(), True),
    StructField('Creation_Time', LongType(), True),
    StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('User', StringType(), True),
    StructField('Model', StringType(), True),
    StructField('Device', StringType(), True),
    StructField('gt', StringType(), True),
])

# Handles derived from the session that the notebook environment provides as `spark`.
sc = spark.sparkContext
sql = SQLContext(sc)

## Read Data

In [3]:
def parse_args(argv=None):
    """Parse the job options.

    Args:
        argv: Optional list of argument strings. When None, argparse falls
            back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``num_partitions`` (int, default 10),
        ``output`` (str, default ``<project_path>/HAR_Dataset/cleaned_data``)
        and ``cluster`` (bool, default False).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--num-partitions", help="Number of output partitions", type=int, default=10)
    parser.add_argument("-o", "--output", help="HDFS directory to save examples in parallelized format", default=project_path + "/HAR_Dataset/cleaned_data")
    parser.add_argument("-c", "--cluster", help="run on cluster master or local master", action="store_true")
    # Inside a notebook sys.argv belongs to the kernel process (for example
    # "-f <connection file>"). parse_args() would abort on those unknown
    # arguments, so they are ignored instead.
    args, _unknown = parser.parse_known_args(argv)
    return args

In [4]:
def read_raw_data(num_partitions):
    """Read the phone accelerometer CSV (PA_DATA) into a DataFrame.

    The file is read with a header row and the explicit ``schema_flow`` schema.

    Args:
        num_partitions: Kept for interface compatibility; currently unused.
            The previous code passed it to the CSV reader as the option
            "numPartitons", which is misspelled and is not a CSV reader
            option, so it never had any effect. The save step repartitions
            its outputs with ``args.num_partitions``.

    Returns:
        DataFrame with the columns declared in ``schema_flow``.
    """
    # TODO: also read PG_DATA, WA_DATA and WG_DATA with the same schema and
    # union the four frames; only the phone accelerometer file is loaded today.
    paRaw = spark.read.schema(schema_flow).option("header", "true").csv(PA_DATA)
    return paRaw

### Read Raw Data

In [5]:
# Parse the job options, load the raw rows without the "Index" column,
# and record how many rows were read.
args = parse_args()
raw_data = read_raw_data(args.num_partitions).drop("Index")
raw_data_size = raw_data.count()

### Split into Train & Test

In [6]:
# Fixed seed so that re-running the notebook on the same input produces the
# same train/test split (assuming the input partitioning is unchanged).
SPLIT_SEED = 42
# Each split is trimmed to a whole multiple of this many rows.
# NOTE(review): presumably chosen for the fixed-length sequence step - confirm.
ROW_MULTIPLE = 1000


def _truncate_to_multiple(df, row_count, multiple=ROW_MULTIPLE):
    """Limit `df` to the largest multiple of `multiple` not exceeding `row_count`.

    Returns:
        (limited DataFrame, number of rows kept)
    """
    kept = row_count - (row_count % multiple)
    return df.limit(kept), kept


# 80/20 random split of the raw rows.
train_raw, test_raw = raw_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
train_raw, train_data_size = _truncate_to_multiple(train_raw, train_raw.count())
test_raw, test_data_size = _truncate_to_multiple(test_raw, test_raw.count())
# No repartitioning here; the save step repartitions its outputs.

### Convert Categorical Features into Numerical

In [7]:
def _index_distinct(df, column):
    """Collect a dict mapping each distinct value of `column` to an integer index."""
    distinct_values = df.select(column).distinct().rdd.map(lambda row: row[0])
    return distinct_values.zipWithIndex().collectAsMap()


# Lookup tables built from the full data set, so both splits share one encoding.
classes = _index_distinct(raw_data, "gt")
devices = _index_distinct(raw_data, "Device")
models = _index_distinct(raw_data, "Model")
users = _index_distinct(raw_data, "User")


def _encode_label(row):
    """Translate the row's `gt` value into its class index."""
    return classes[row.gt]


def _encode_features(row):
    """Build the feature tuple, replacing User/Model/Device with their indices."""
    return (
        row.Arrival_Time,
        row.Creation_Time,
        row.x,
        row.y,
        row.z,
        users[row.User],
        models[row.Model],
        devices[row.Device],
    )


labels_train = train_raw.select("gt").rdd.map(_encode_label)
features_train = train_raw.drop("gt").rdd.map(_encode_features)
labels_test = test_raw.select("gt").rdd.map(_encode_label)
features_test = test_raw.drop("gt").rdd.map(_encode_features)

### Normalize Data

In [8]:
# TODO: normalization is not implemented yet; the features are saved un-normalized.

### Convert To Fixed Length Sequences

In [9]:
# TODO: conversion to fixed-length sequences is not implemented yet.

### Save Cleaned Data

In [10]:
def _csv_line(values):
    """Render one feature tuple as a comma-separated text line."""
    return ','.join(str(value) for value in values)


def _save_lines(rdd, to_line, path):
    """Repartition `rdd` to args.num_partitions, format each element and write it as text."""
    rdd.repartition(args.num_partitions).map(to_line).saveAsTextFile(path)


# Labels are single integers, so they are written with str(). The previous
# ','.join(str(label)) put a comma between the digits of any label >= 10
# (for example 12 was written as "1,2").
# NOTE(review): features and labels are repartitioned and written
# independently, so this cell does not establish a line-by-line correspondence
# between the two outputs - confirm what the downstream reader assumes.
_save_lines(features_train, _csv_line, args.output + "/train/features")
_save_lines(labels_train, str, args.output + "/train/labels")
_save_lines(features_test, _csv_line, args.output + "/test/features")
_save_lines(labels_test, str, args.output + "/test/labels")

In [11]:
# Persist each value -> index lookup table as text, one (value, index) tuple per line.
for subdir, lookup in (
    ("/classes", classes),
    ("/devices", devices),
    ("/models", models),
    ("/users", users),
):
    sc.parallelize(list(lookup.items())).saveAsTextFile(args.output + subdir)