In [1]:
# All Includes

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf  # Version 1.0.0 (some previous versions are used in past commits)
from sklearn import metrics

import os

In [2]:
# Useful Constants

# File-name stems of the nine separately normalised input signals used as
# features for the neural network: three sensors, each on the x / y / z axis.
INPUT_SIGNAL_TYPES = [
    "body_acc_x_", "body_acc_y_", "body_acc_z_",
    "body_gyro_x_", "body_gyro_y_", "body_gyro_z_",
    "total_acc_x_", "total_acc_y_", "total_acc_z_",
]

# Output classes the classifier learns to tell apart
LABELS = [
    "WALKING",
    "WALKING_UPSTAIRS",
    "WALKING_DOWNSTAIRS",
    "SITTING",
    "STANDING",
    "LAYING",
]

In [3]:
# Base DBFS directory of the dataset files; load_X / load_y prepend the
# "/dbfs" mount prefix to the paths built from it.
DATASET_PATH = "/FileStore/tables/"
# Sub-directory name for the training split.
# NOTE(review): not referenced by the path-building code visible in this
# notebook (train files are read directly from DATASET_PATH) — confirm.
TRAIN = "train/"
# Sub-directory name for the test split; appended to DATASET_PATH for test files.
TEST = "test/"


# Load "X" (the neural network's training and testing inputs)

def load_X(X_signals_paths, dbfs_root="/dbfs"):
    """Load several signal files and stack them into a single 3-D array.

    Every file is read as whitespace-separated text with one series per
    line. All files must contain the same number of rows and the same
    number of values per row, otherwise the final stacking fails.

    Parameters
    ----------
    X_signals_paths : list of str
        One DBFS path per signal (e.g. "/FileStore/tables/body_acc_x_train.txt").
    dbfs_root : str, optional
        Local mount point prepended to every path. Defaults to "/dbfs".

    Returns
    -------
    numpy.ndarray
        float32 array indexed as (series, timestep, signal).
    """
    X_signals = []

    for signal_type_path in X_signals_paths:
        # Join with exactly one separator regardless of how the two parts
        # are written ("/dbfs" + "/FileStore/..." or "/dbfs/" + "FileStore/...").
        local_path = dbfs_root.rstrip("/") + "/" + signal_type_path.lstrip("/")

        # `with` guarantees the handle is closed even when parsing raises.
        with open(local_path, 'r') as signal_file:
            X_signals.append([
                # split() without arguments tolerates any run of spaces/tabs
                # and leading/trailing whitespace.
                np.array(row.split(), dtype=np.float32)
                for row in signal_file
                if row.strip()  # skip blank lines (e.g. a trailing newline)
            ])

    # Files are stacked as (signal, series, timestep); reorder the axes so
    # that the series index comes first and the signal index last.
    return np.transpose(np.array(X_signals), (1, 2, 0))

# One file per input signal and per split. Train files are looked up directly
# under DATASET_PATH, test files under the TEST sub-directory.
X_train_signals_paths = [
    "".join([DATASET_PATH, signal, "train.txt"])
    for signal in INPUT_SIGNAL_TYPES
]
X_test_signals_paths = [
    "".join([DATASET_PATH, TEST, signal, "test.txt"])
    for signal in INPUT_SIGNAL_TYPES
]

# Read both splits into arrays
X_train = load_X(X_train_signals_paths)
X_test = load_X(X_test_signals_paths)


In [4]:
# Load "y" (the neural network's training and testing outputs)

def load_y(y_path, dbfs_root="/dbfs"):
    """Load a label file and return the class ids shifted to start at 0.

    The file is read as whitespace-separated text with one row per line.

    Parameters
    ----------
    y_path : str
        DBFS path of the label file (e.g. "/FileStore/tables/y_train.txt").
    dbfs_root : str, optional
        Local mount point prepended to ``y_path``. Defaults to "/dbfs".

    Returns
    -------
    numpy.ndarray
        int32 array with one row per non-blank line of the file, holding the
        values read from the file minus 1.
    """
    # Join with exactly one separator however the two parts are written.
    local_path = dbfs_root.rstrip("/") + "/" + y_path.lstrip("/")

    # `with` guarantees the handle is closed even when parsing raises.
    with open(local_path, 'r') as label_file:
        y_ = np.array(
            # split() without arguments tolerates any run of whitespace;
            # blank lines (e.g. a trailing newline) are skipped.
            [row.split() for row in label_file if row.strip()],
            dtype=np.int32
        )

    # Subtract 1 from each output class for friendly 0-based indexing
    return y_ - 1

# Label files: train labels are looked up directly under DATASET_PATH,
# test labels under the TEST sub-directory.
y_train_path = "".join([DATASET_PATH, "y_train.txt"])
y_test_path = "".join([DATASET_PATH, TEST, "y_test.txt"])

# Read both label sets (class ids come back 0-based)
y_train = load_y(y_train_path)
y_test = load_y(y_test_path)

In [5]:
# Input Data

# Dimensions are read straight from the loaded arrays, which are indexed as
# (series, timestep, input signal).
# Expected here: 7352 training series (with 50% overlap between each serie),
# 128 timesteps per series, 9 input parameters per timestep.
training_data_count, n_steps, n_input = X_train.shape
test_data_count = X_test.shape[0]  # 2947 testing series

In [6]:
# Flatten every (n_steps, n_input) window into a single row so that each
# series becomes one DataFrame record; labels become a one-column frame.
# The row width is derived from the dimensions computed in the "Input Data"
# cell instead of hard-coding 1152 (= 128 * 9), so the cell keeps working if
# the window length or the number of signals changes.
n_flat_features = n_steps * n_input
xtrain_df = pd.DataFrame(X_train.reshape(len(X_train), n_flat_features))
ytrain_df = pd.DataFrame(y_train.reshape(len(y_train), 1))
xtest_df = pd.DataFrame(X_test.reshape(len(X_test), n_flat_features))
ytest_df = pd.DataFrame(y_test.reshape(len(y_test), 1))

In [7]:
# Dataset metadata tables, read as whitespace-delimited text without a header
# row. The directory is derived from DATASET_PATH (same "/dbfs" mount prefix
# used by the loaders) rather than repeated as a literal.
# The separator is a raw string: '\s' in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
metadata_dir = "/dbfs" + DATASET_PATH
feature561_df = pd.read_csv(metadata_dir + 'features.txt', header=None, sep=r'\s+')
subjectTrain_df = pd.read_csv(metadata_dir + 'subject_train.txt', header=None, sep=r'\s+')
activity_lables = pd.read_csv(metadata_dir + 'activity_labels.txt', header=None, sep=r'\s+')

In [8]:
# Replace the default positional column names (0, 1, ...) with meaningful
# ones before the frames are merged.
# 'features' holds the 0-based activity class produced by load_y.
ytrain_df = ytrain_df.rename(columns = {0:'features'})
# 'users' — presumably the subject id of each training series (the frame was
# read from subject_train.txt); verify against the file.
subjectTrain_df = subjectTrain_df.rename(columns = {0:'users'})
# 'key' / 'value' — presumably the activity id and its name; verify against
# activity_labels.txt.
activity_lables = activity_lables.rename(columns = {0:'key',1:'value'})

In [9]:
# Column-wise concatenation: flattened signals, then the 'features' label,
# then 'users'. Rows are aligned on the index of the three frames.
train_df=pd.concat([xtrain_df,ytrain_df,subjectTrain_df], axis=1)

In [10]:
# Display the key -> value lookup table loaded from activity_labels.txt
activity_lables

In [11]:
# Attach the activity name ('value') to every training row.
# load_y() shifted the class ids in 'features' down by 1 (0-based), while the
# keys read from activity_labels.txt were never shifted (presumably 1-based,
# like the raw label files — verify against the file). Shift the lookup keys
# by the same amount before joining; joining on the unshifted keys would tag
# every row with the name of the previous class and leave class 0 unmatched.
activity_lookup = activity_lables.assign(key=activity_lables['key'] - 1).set_index('key')
train_df = train_df.join(activity_lookup, on='features')

In [12]:
# 1-based row number; used further down as the graph vertex id
train_df["id"] = train_df.index+1 

In [13]:
# Show only the bookkeeping columns, leaving out the flattened signal columns
train_df[['users','value','features',"id"]]

In [14]:
import graphframes
from graphframes import *

In [15]:
from pyspark.sql.types import *
# String copy of the user id, done with pandas astype because train_df is a
# pandas frame (built with pd.concat), not a Spark DataFrame.
# NOTE(review): 's_users' is not referenced by the cells visible here — confirm it is still needed.
train_df['s_users'] = train_df['users'].astype(str)

In [16]:
# Inspect the column dtypes of the merged training frame
train_df.dtypes


In [17]:
# Vertex table: one vertex per training row, keyed by the 1-based row id and
# carrying the 0-based activity class as its only attribute.
# NOTE(review): the edge table built further down uses user ids as src and
# activity classes as dst, not these row ids — confirm that this is the
# intended graph, since edge endpoints normally refer to vertex ids.
localvertices = train_df[["id","features"]]
# Convert the pandas frame into a Spark DataFrame with columns "id" / "Features"
v = sqlContext.createDataFrame(localvertices, ["id","Features"])

In [18]:
# Render the vertex DataFrame in the notebook
display(v)

In [19]:
# Edge table: one edge per training row, going from the user (src) to the
# 0-based activity class (dst); the row id is kept as the attribute "id_users".
localedges = train_df[["users","features","id"]]
# Convert the pandas frame into a Spark DataFrame with the renamed columns
e = sqlContext.createDataFrame(localedges, ["src","dst","id_users"])

In [20]:
# Render the edge DataFrame in the notebook
display(e)

In [21]:
# Assemble the graph from the vertex and edge DataFrames
g = GraphFrame(v,e)

In [22]:
# Incoming-edge count per destination id; dst holds the activity class, so
# this is the number of training rows per activity.
display(g.inDegrees)

In [23]:
# Outgoing-edge count per source id; src holds the user id, so this is the
# number of training rows per user.
display(g.outDegrees)

In [24]:
# Number of edges whose source is user 1, i.e. training rows recorded for that user
user_count = g.edges.filter("src = '1'").count()
# print(...) with a single argument behaves the same on Python 2 and Python 3;
# the former `print "...", x` statement is a syntax error on Python 3.
print("Activities by user 1: " + str(user_count))

In [25]:
# Number of edges pointing at the WALKING class, over all users.
# Class ids on the edges are 0-based (load_y subtracts 1), so WALKING is
# looked up by its position in LABELS rather than hard-coded; filtering on 1
# would count WALKING_UPSTAIRS instead.
walking_class = LABELS.index("WALKING")
activity_count = g.edges.filter("dst = %d" % walking_class).count()
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("Walking count by all users: " + str(activity_count))