# SKFlow Simple Linear Regression on Parkinson's Disease Speech Dataset

https://archive.ics.uci.edu/ml/datasets/Parkinson+Speech+Dataset+with++Multiple+Types+of+Sound+Recordings

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow.contrib.learn.python.learn as skflow
from sklearn import datasets, metrics
import numpy as np
import os.path
import pickle

In [2]:
### Convert DataFrame columns into float32 NumPy arrays
def convert_dataframe(dataframe, colnames, labelcol):
    """Extract feature and label data from a DataFrame as float32 arrays.

    Args:
        dataframe: pandas DataFrame containing both feature and label columns.
        colnames: list of column names to use as features.
        labelcol: name of the column to use as the label.

    Returns:
        Tuple ``(X, Y)`` of ``np.float32`` arrays built from the selected
        feature columns and from the label column respectively.
    """
    # Features first, then labels, each cast to float32 in a fresh array.
    features = np.asarray(dataframe[colnames]).astype(np.float32)
    labels = np.asarray(dataframe[labelcol]).astype(np.float32)
    return features, labels

### load or generate random ids for test and training sets
def slice_ids(length,run_id):
    """Return a persistent random 70/30 train/test split of ``range(length)``.

    The first time a ``run_id`` is seen, a random permutation of ``length``
    ids is generated and pickled to ``models/<run_id>_ids``. Later calls with
    the same ``run_id`` load that file, so a re-run reuses the same split.

    Args:
        length: number of samples to split.
        run_id: string identifying the run; selects the id cache file.

    Returns:
        Tuple ``(trainidx, testidx)``: the first ``int(.7*length)`` permuted
        ids and the remaining ids.

    Raises:
        ValueError: if the cached permutation was created for a different
            number of samples than ``length``.
    """
    idfile = "models/"+run_id+"_ids"
    if os.path.isfile(idfile):
        ### Re-run model
        # NOTE(review): pickle.load can execute arbitrary code. This is only
        # safe as long as the id files are written by this notebook alone.
        with open(idfile, 'rb') as fp:
            randomInd = pickle.load(fp)
        # A cache built for another dataset size would yield out-of-range or
        # incomplete indices, so fail loudly instead of using it.
        if len(randomInd) != length:
            raise ValueError(
                "Cached ids in '{0}' cover {1} samples but {2} were requested; "
                "use a new run_id or delete the file.".format(
                    idfile, len(randomInd), length))
    else:
        ### First run
        # Create the cache directory on demand so a fresh checkout works.
        iddir = os.path.dirname(idfile)
        if iddir and not os.path.isdir(iddir):
            os.makedirs(iddir)
        randomInd = np.random.permutation(length)
        with open(idfile, 'wb') as fp:
            pickle.dump(randomInd, fp)

    ###  70% of ids randomly selected for training
    ###  30% of ids randomly selected for testing
    mid = int(.7*length)
    trainidx = randomInd[:mid]
    testidx = randomInd[mid:]
    return trainidx, testidx

def run_LinearRegressor(run_id, steps = 100):
    """Fit a skflow LinearRegressor on a train split and score it on the test split.

    Reads the module-level arrays ``X`` (features) and ``Y`` (labels), which
    must be defined before this function is called.

    Args:
        run_id: string identifying the run. It selects both the train/test id
            file used by ``slice_ids`` and the model directory
            ``models/SLR/<run_id>``; reuse the same id to re-run a model.
        steps: value passed to ``model.fit`` as ``steps``.

    Returns:
        ``np.linalg.norm(y_true - y_pred) / n_test``. Note that this is the L2
        norm of the residuals divided by the number of test samples (i.e.
        RMSE / sqrt(n_test)), not a mean absolute error.
    """
    ### Get Training and Testing indexes
    trainidx, testidx = slice_ids(len(X),run_id)

    ### Define classifier - Simple Linear Regression
    ### model_dir - this is where the model is saved. To re-run use the same runIdentifier
    feature_columns = skflow.infer_real_valued_columns_from_input(X)
    model = skflow.LinearRegressor(
        feature_columns=feature_columns,
        model_dir="models/SLR/"+run_id,
        enable_centered_bias=False)

    model.fit(X[trainidx,:],
              Y[trainidx],
              steps=steps)

    y_t = Y[testidx]
    # predict() warns that its default for as_iterable will change to True.
    # Pin the current default explicitly so the result can still be
    # subtracted from y_t (assumes as_iterable=False returns an array, as
    # the current default does).
    y_p = model.predict(X[testidx,:], as_iterable=False)
    # Force predictions into the label shape so a possible (n, 1) output
    # cannot silently broadcast against (n,) labels.
    y_p = np.asarray(y_p).reshape(y_t.shape)
    return np.linalg.norm(y_t-y_p)/y_t.shape[0]

# Load data and convert to NumPy arrays

In [3]:
# Names assigned, in order, to the columns of the headerless training file.
COLUMN_NAMES = [
    "subject_id",
    "jitter_local", "jitter_local_absolute", "jitter_rap", "jitter_ppq5", "jitter_ddp",
    "shimmer_local", "shimmer_local_db", "shimmer_apq3", "shimmer_apq5",
    "shimmer_apq11", "shimmer_dda",
    "ac", "nth", "htn",
    "pitch_median", "pitch_mean", "pitch_stddev", "pitch_min", "pitch_max",
    "number_of_pulses", "number_of_periods", "period_mean", "period_stddev",
    "locally_unvoiced_frames_fraction", "number_of_voice_breaks",
    "degree_of_voice_breaks",
    "updrs", "class_information",
]

datafile = "data/train_data.txt"
df_data = pd.read_csv(datafile, header=None, names=COLUMN_NAMES)

### Feature columns are every column except the subject id, the
### class_information column and the updrs column.
featcol = df_data.columns.drop(["subject_id", "class_information", "updrs"]).tolist()

In [4]:
# Name of the column used as the regression label.
labelcol = "updrs" 
# X (features) and Y (labels) are float32 arrays; they are read as module-level
# globals by run_LinearRegressor (X and Y) and predictValues (X).
X, Y = convert_dataframe(df_data, featcol, labelcol)

# Train and evaluate on several random train/test splits

In [5]:
%%time
print "Average Error: {0}".format(run_LinearRegressor("SLR_UPDRS_001", 10000))

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


Average Error: 0.83665622809
CPU times: user 30.5 s, sys: 2.97 s, total: 33.5 s
Wall time: 30 s


In [6]:
%%time
print "Average Error: {0}".format(run_LinearRegressor("SLR_UPDRS_002", 10000))

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


Average Error: 0.857258429894
CPU times: user 31.4 s, sys: 3.03 s, total: 34.4 s
Wall time: 29.7 s


In [7]:
%%time
print "Average Error: {0}".format(run_LinearRegressor("SLR_UPDRS_003", 10000))

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


Average Error: 0.910520309057
CPU times: user 31.7 s, sys: 3.06 s, total: 34.7 s
Wall time: 30 s


In [8]:
%%time
print "Average Error: {0}".format(run_LinearRegressor("SLR_UPDRS_004", 10000))

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


Average Error: 0.861911969307
CPU times: user 31.6 s, sys: 2.98 s, total: 34.5 s
Wall time: 29.8 s


# Use the trained models to predict UPDRS scores for the test patients (the test file has no UPDRS column, so the predictions cannot be verified)

In [10]:
def predictValues(run_id, testX):
    """Predict labels for ``testX`` with the model stored under ``run_id``.

    Builds a skflow LinearRegressor pointing at ``models/SLR/<run_id>`` and
    calls ``predict`` on it without fitting, so a model is expected to have
    been saved there already (e.g. by ``run_LinearRegressor`` with the same
    ``run_id``).

    NOTE(review): the feature columns are inferred from the module-level
    training array ``X``, not from ``testX``; ``X`` must therefore be defined
    and is presumably expected to have the same number of columns as
    ``testX`` -- confirm.

    Args:
        run_id: string identifying the run whose model directory is used.
        testX: feature array to predict on.

    Returns:
        The value returned by ``model.predict`` for ``testX``.
    """
    ### Define classifier - Simple Linear Regression
    ### model_dir - this is where the model is saved. To re-run use the same runIdentifier
    feature_columns = skflow.infer_real_valued_columns_from_input(X)
    model = skflow.LinearRegressor(
        feature_columns=feature_columns,
        model_dir="models/SLR/"+run_id,
        enable_centered_bias=False)

    # predict() warns that its default for as_iterable will change to True.
    # Pin the current default so callers keep getting the same return type.
    y_p = model.predict(testX, as_iterable=False)
    return y_p    

In [11]:
# Names assigned, in order, to the columns of the headerless test file.
# Unlike the training layout there is no "updrs" column.
# NOTE: this cell rebinds COLUMN_NAMES and df_data, which earlier held the
# training layout and the training frame.
COLUMN_NAMES = [
    "subject_id",
    "jitter_local", "jitter_local_absolute", "jitter_rap", "jitter_ppq5", "jitter_ddp",
    "shimmer_local", "shimmer_local_db", "shimmer_apq3", "shimmer_apq5",
    "shimmer_apq11", "shimmer_dda",
    "ac", "nth", "htn",
    "pitch_median", "pitch_mean", "pitch_stddev", "pitch_min", "pitch_max",
    "number_of_pulses", "number_of_periods", "period_mean", "period_stddev",
    "locally_unvoiced_frames_fraction", "number_of_voice_breaks",
    "degree_of_voice_breaks",
    "class_information",
]

datafile = "data/test_data.txt"
df_data = pd.read_csv(datafile, header=None, names=COLUMN_NAMES)

# Feature columns are every column except the subject id and class_information.
featcol = df_data.columns.drop(["subject_id", "class_information"]).tolist()
# float32 feature matrix for the test patients.
testData = np.asarray(df_data[featcol]).astype(np.float32)

In [12]:
%%time
# Predict on the test-file features using the model directory models/SLR/SLR_UPDRS_004.
predictValues("SLR_UPDRS_004", testData)

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


CPU times: user 73.3 ms, sys: 5.11 ms, total: 78.5 ms
Wall time: 78.4 ms


array([  8.90001965,  10.18336105,   9.72231007,  11.58966446,
        11.84992886,  11.24682903,   9.94858074,   9.10394382,
         8.50317764,  11.6996727 ,  10.37724972,  10.47740078,
        12.37574959,  15.5326376 ,   8.77845764,   4.49829292,
         8.22646809,   7.98786545,   7.09722519,  10.29768753,
        12.8442812 ,   6.39231968,   8.904603  ,  12.9677515 ,
         5.53757668,   6.96958733,  13.39861012,  15.51123333,
         7.22864199,  11.38257885,   6.52732658,   8.09049129,
         6.89475155,   9.11819363,   9.36974335,   9.87164593,
         8.98575687,   8.71587276,   7.75513363,   9.21525097,
         9.19638348,   8.3857851 ,  10.92054272,   9.62572861,
        10.94377995,  10.36156845,   9.21939754,  10.76749516,
        11.46788883,   9.12350082,   9.79336262,  10.44120312,
         9.26424694,   9.49556065,   9.54784966,   9.54784966,
        12.55902958,   9.83314228,   9.14494324,   8.38248444,
         8.90461063,   9.20493031,   9.37521267,   9.82

In [13]:
%%time
# Predict on the test-file features using the model directory models/SLR/SLR_UPDRS_001.
predictValues("SLR_UPDRS_001", testData)

Instructions for updating:
The default behavior of predict() is changing. The default value for
as_iterable will change to True, and then the flag will be removed
altogether. The behavior of this flag is described below.


CPU times: user 74.4 ms, sys: 3.21 ms, total: 77.6 ms
Wall time: 77.2 ms


array([ 10.12992859,  11.63660049,  11.09392166,  13.16503429,
        13.42690182,  12.46626568,  10.84868622,  10.25620079,
         9.4499588 ,  12.97682571,  11.42319679,  12.10482979,
         9.93821716,  14.40267754,   9.49258995,   5.4149971 ,
         9.38891888,   9.04293442,   7.18389368,   9.06473827,
        12.30064964,   7.25803614,   9.63912201,  13.01989937,
         7.32932711,   8.5105505 ,  14.1109972 ,  17.27428818,
         9.01781178,  12.36394596,   6.94590712,   9.48592758,
         7.17932749,   9.92714405,   9.95744991,  10.93894196,
         8.80636501,   8.88370228,   8.12825394,   9.94732857,
         9.78580475,   8.99827385,  12.59344959,  11.19831467,
        12.72230339,  11.521451  ,  10.15033054,  12.2167635 ,
        11.33331394,   9.79965401,  10.81504154,  11.7452507 ,
        10.30544376,  10.53654099,  10.52593803,  10.52593803,
        12.29662895,   9.38390732,  10.44731426,   8.86032486,
         9.6654377 ,   9.56851387,   9.69787216,  10.63