In [7]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pysparkling import *
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import h2o
from pyspark.sql.functions import lit

In [8]:
ss = SparkSession.builder.getOrCreate()

In [9]:
wrist2 = ss.read.csv('Wrist/S_2.csv', header=True, inferSchema=True)

In [10]:
def clean_df(df, sub_id):
    df = df.filter(df.TEMP.isNotNull())
    df = df.withColumn('sub_id', lit(f'S_{sub_id}'))
    return df

In [11]:
wrists = clean_df(wrist2, 2)
nums = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]
for i in nums[1:]:
    wrist = ss.read.csv(f"Wrist/S_{i}.csv", header=True, inferSchema=True)
    wrist = clean_df(wrist, i)
    wrists = wrists.union(wrist)
wrists.cache()

DataFrame[_c0: double, label: double, ACC_0: double, ACC_1: double, ACC_2: double, BVP: double, EDA: double, TEMP: double, sub_id: string]

In [12]:
wrists = wrists.filter(wrists.label.isin([1, 2, 3]))

In [13]:
wrists.count()

132608

# Register and load from HDFS

In [11]:
wrists.write.option('path', './wristdat').saveAsTable('wristdat')

In [4]:
ss.sql("select * from parquet.`./wristdat`").count()

AnalysisException: 'Incomplete HDFS URI, no host: hdfs://./wristdat; line 1 pos 14'

# Scratch Work

In [18]:
wrist = wrist.filter(wrist.TEMP != 'null')

In [21]:
wrist = wrist.withColumn('sub_id', lit('S_2')).show(5)

+----+-----+-----+-----+-----+-------+--------+-----+------+
| _c0|label|ACC_0|ACC_1|ACC_2|    BVP|     EDA| TEMP|sub_id|
+----+-----+-----+-----+-----+-------+--------+-----+------+
| 0.0|  0.0| 62.0|-21.0|107.0| -59.37|1.138257|35.41|   S_2|
|0.25|  0.0| 51.0| 16.0| 35.0|   43.5|1.125444|35.41|   S_2|
| 0.5|  0.0| 53.0| 21.0| -6.0|  53.56|1.011405|35.41|   S_2|
|0.75|  0.0| 55.0| 17.0| 34.0|  54.64|1.033188|35.41|   S_2|
| 1.0|  0.0| 48.0| 24.0| 15.0|-117.88|0.935807|35.41|   S_2|
+----+-----+-----+-----+-----+-------+--------+-----+------+
only showing top 5 rows



In [29]:
wrist2 = clean_df(wrist2, 2)
wrist3 = clean_df(wrist3, 3)

In [31]:
wrist2.show(5)

+----+-----+-----+-----+-----+-------+--------+-----+------+
| _c0|label|ACC_0|ACC_1|ACC_2|    BVP|     EDA| TEMP|sub_id|
+----+-----+-----+-----+-----+-------+--------+-----+------+
| 0.0|  0.0| 62.0|-21.0|107.0| -59.37|1.138257|35.41|   S_2|
|0.25|  0.0| 51.0| 16.0| 35.0|   43.5|1.125444|35.41|   S_2|
| 0.5|  0.0| 53.0| 21.0| -6.0|  53.56|1.011405|35.41|   S_2|
|0.75|  0.0| 55.0| 17.0| 34.0|  54.64|1.033188|35.41|   S_2|
| 1.0|  0.0| 48.0| 24.0| 15.0|-117.88|0.935807|35.41|   S_2|
+----+-----+-----+-----+-----+-------+--------+-----+------+
only showing top 5 rows



In [36]:
df = wrist2.union(wrist3)

In [40]:
df.count()

50288

In [41]:
wrist2.count() + wrist3.count()

50288