In [1]:
import os
import json

In [2]:
# Build a sample Spark DataFrame from the example payload stored in the
# service schema, so the scoring script can be exercised locally.
with open('../service_schema.json') as f:
    schema = json.load(f)

# Named `example_rows` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
example_rows = schema['input']['input_df']['swagger']['example']

# NOTE(review): `sqlContext` is not defined in this notebook; presumably it is
# injected by the PySpark kernel -- confirm.
input_df = sqlContext.createDataFrame(example_rows)

input_df.printSchema()



root
 |-- ambient_pressure: double (nullable = true)
 |-- ambient_temperature: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- vibration: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [3]:
%%writefile ../score.py

import os
import numpy as np
from pyspark.sql.types import DoubleType, ArrayType, ShortType, LongType, IntegerType
from functools import reduce
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.sql.functions import udf, mean, lit, stddev, col, expr, when

# Size of the FFT frequency grid used by run(): the amplitude spectrum is divided
# by this value and the frequency axis is built from it.
# NOTE(review): presumably the number of vibration samples per second (Hz) and
# also the expected length of each vibration array -- confirm.
sample_rate = 8000

def init():
    """Load the persisted pipeline model that sits next to this script.

    The model is read from the ``model`` directory located beside this file
    and stored in the module-level ``pipeline`` global, which ``run`` uses
    for scoring.
    """
    global pipeline
    # Imported lazily so pyspark.ml is only pulled in when the model is loaded.
    from pyspark.ml import PipelineModel

    script_dir = os.path.dirname(os.path.realpath(__file__))
    model_path = os.path.join(script_dir, 'model')
    pipeline = PipelineModel.load(model_path)

def run(input_df):
    """Score a DataFrame of machine telemetry with the loaded pipeline.

    Derives spectral features from the ``vibration`` column (the three
    strongest frequencies ``f0..f2`` and their amplitudes ``a0..a2``), adds a
    ``<name>_n`` column dividing each dependent feature by ``speed``, assembles
    all feature columns into a ``features`` vector and transforms it with the
    global ``pipeline`` (set by ``init()``).

    Args:
        input_df: Spark DataFrame with a ``vibration`` array column and a
            ``speed`` column. Every remaining column other than ``machineID``
            and ``EnqueuedTimeUtc`` is fed to the assembler.
            Each vibration array is assumed to hold ``sample_rate`` samples
            -- TODO confirm.

    Returns:
        str: the ``prediction`` value of every scored row, comma-separated.

    Raises:
        NameError: if ``init()`` has not been called, because the global
            ``pipeline`` does not exist yet.
    """
    frequency_features = 3
    spectrum_bins = sample_rate // 2 + 1

    # The frequency axis is identical for every row, so build it once here
    # instead of once per UDF invocation. `1.0 /` keeps the sample spacing a
    # true division even under Python 2 integer semantics.
    # NOTE(review): for an even sample_rate the last kept bin is the Nyquist
    # bin, which numpy reports as a negative frequency; left untouched so the
    # features match whatever the model was trained on -- confirm.
    frequency_axis = np.fft.fftfreq(sample_rate, d=1.0 / sample_rate)[:spectrum_bins].tolist()

    def amplitude_spectrum(samples):
        # Magnitude of the FFT scaled by sample_rate, first half of the bins only.
        amplitudes = np.abs(np.fft.fft(samples)) / sample_rate
        return amplitudes[:spectrum_bins].tolist()

    def extract_dominant(index):
        # index 0 selects frequencies, index 1 selects amplitudes; both are
        # ordered by descending amplitude.
        def top_values(spectrum):
            ranked = sorted(zip(frequency_axis, spectrum), key=lambda pair: pair[1], reverse=True)
            # Only the first `frequency_features` entries are read downstream,
            # so avoid shipping the whole sorted spectrum back to Spark.
            return [float(pair[index]) for pair in ranked[:frequency_features]]

        return udf(top_values, ArrayType(DoubleType()))

    spectrum_udf = udf(amplitude_spectrum, ArrayType(DoubleType()))
    with_fft = input_df.withColumn("fft", spectrum_udf(input_df.vibration))
    dfa = (with_fft
           .withColumn("dominant_frequencies", extract_dominant(0)(with_fft.fft))
           .withColumn("dominant_frequencies_amplitudes", extract_dominant(1)(with_fft.fft)))

    # Flatten the top entries into scalar columns f0..f2 and a0..a2.
    for i in range(frequency_features):
        dfa = dfa.withColumn('f{0}'.format(i), dfa.dominant_frequencies[i])
    for i in range(frequency_features):
        dfa = dfa.withColumn('a{0}'.format(i), dfa.dominant_frequencies_amplitudes[i])
    dfa = dfa.drop('vibration', 'fft', 'dominant_frequencies', 'dominant_frequencies_amplitudes')

    identifier_columns = ['machineID', 'EnqueuedTimeUtc']
    independent_columns = identifier_columns + ['speed', 'ambient_temperature', 'ambient_pressure']

    # Every other column gets a speed-normalized companion named '<column>_n'.
    dependent_features = [c for c in dfa.columns if c not in independent_columns]
    for name in dependent_features:
        dfa = dfa.withColumn('{0}_n'.format(name), col(name) / col('speed'))

    # Sorted so the assembled vector has a deterministic column order.
    features = sorted(c for c in dfa.columns if c not in identifier_columns)

    # assemble features
    va = VectorAssembler(inputCols=features, outputCol='features')
    feat_data = va.transform(dfa)

    predictions = pipeline.transform(feat_data).collect()

    # Get each scored result
    return ",".join(str(row['prediction']) for row in predictions)


Overwriting ../score.py


In [4]:
# Execute the script written above in this kernel so that init() and run()
# become available to the following cells.
%run ../score.py

In [5]:
# Load the persisted pipeline model into the global that run() scores with.
init()   

In [6]:
# Score the example DataFrame; the cell output is the comma-separated
# predictions string returned by run().
run(input_df)

'1.0,2.0,2.0'