In [8]:
import numpy as np
import pandas as pd
import glob
from datetime import timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
def is_nan(x):
    '''
    Checks if x is nan or not.
    Taken from https://stackoverflow.com/questions/944700/how-can-i-check-for-nan-values.

    :param x: any type of item to check for if it is nan type

    Returns True if it is nan type, false if not.
    '''

    return (x is np.nan or x != x)

In [77]:
filename_patient_1 = './patientData/patientAS14.01.csv'
filename_patient_2 = './patientData/patientAS14.02.csv'

df_1 = create_df_per_patient(filename_patient_1)
df_2 = create_df_per_patient(filename_patient_2)
df

Unnamed: 0_level_0,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.other,appCat.social,appCat.travel,appCat.utilities,call,screen,sms
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014-03-16,0.0,0.0,0.0,6.333333,2437.046,3151.829,88.392,408.106,181.591,0.0,0.0,4.0,4902.627,1.0
2014-03-17,0.168068,0.5,0.5,6.75,1229.347,4518.116,2511.178,103.699,103.028,0.0,0.0,4.0,10159.769001,0.0
2014-03-18,0.219484,0.6,0.8,8.2,2604.521,3933.081,88.943,88.722,212.364,0.0,0.0,15.0,4512.471001,1.0
2014-03-19,0.315632,0.4,0.4,6.8,1554.558,2596.989,196.139,105.536,47.815,567.165,0.0,5.0,8182.757,1.0
2014-03-20,0.239025,-0.5,0.75,7.25,1777.977,2154.371,148.18,86.857,131.598,0.0,0.0,10.0,4360.334001,1.0
2014-03-21,0.289628,-0.75,0.75,7.5,2230.303,2858.446,459.604,37.944,388.021,442.296,0.0,10.0,11155.850999,1.0
2014-03-22,0.446044,-1.0,-0.5,6.5,1930.266,1852.256,662.911,43.204,408.667,780.617,0.0,25.0,7589.562003,1.0
2014-03-23,0.269697,-1.0,0.0,6.25,894.506,1839.899,388.098,61.865,171.994,1263.189,0.0,8.0,4528.527001,1.0
2014-03-24,0.156125,0.2,-0.2,6.4,1073.496,3012.181,532.013,81.808,232.801,0.0,0.0,5.0,2766.648999,0.0
2014-03-25,0.230514,-1.25,0.75,7.75,2806.931,2788.751,459.589,50.375,148.447,0.0,0.0,7.0,4637.388,0.0


In [78]:
instances_1, labels_1 = generate_instances(df_2, columns=['activity', 'circumplex.arousal'])
instances_2, labels_2 = generate_instances(df_2, columns=['activity', 'circumplex.arousal'])
instances = instances_1 + instances_2
labels = labels_1 + labels_2
len(instances)
score = train_and_eval(instances, labels, model='svr')
score

0.6179630949022203

In [66]:
def create_df_per_patient(filename):
    df_patient = pd.read_csv(filename)
    # Edit dataframe
    #df_patient['month'] = df_patient.apply(lambda row: row.time[5:7], axis=1)
    #df_patient['day'] = df_patient.apply(lambda row: row.time[8:10], axis=1)
    #df_patient['call'].fillna(0, inplace=True)

    # Remove rows with no value for mood for whole day
    df_patient = df_patient[is_nan(df_patient['mood']) != True]
    df_patient = df_patient[is_nan(df_patient['screen']) != True]
    df_patient.fillna(0, inplace=True)
    df_patient['time'] = pd.to_datetime(df_patient['time'])
    df_patient.set_index(['time'], inplace=True)
    return df_patient

def generate_instances(df, columns=['mood']):
    number_of_days = 6
    all_instances = []
    all_labels = []
    for date_0, row_0 in df_patient.iterrows():
        valid_data_count = 0
        instance = []
        for date, row in df_patient.iterrows():
            for delta_days in range(1, number_of_days):
                #print(delta_days)
                if date_0 - timedelta(days=delta_days) == date:
                    selected_day_info = list(row[columns])
                    instance += selected_day_info
                    if not is_nan(row['mood']):
                        valid_data_count += 1
                    
        if valid_data_count == number_of_days-1:
            all_labels.append(row_0['mood'])
            all_instances.append(instance)

    assert len(all_labels)==len(all_instances), 'Amount of instances and amount of labels differ.'
    assert len(all_instances) > 0, 'No instances were created.'

    return all_instances, all_labels

def train_and_eval(instances, labels, model='svr'):
    X_train, X_test, y_train, y_test = train_test_split(instances, labels, test_size=0.33, random_state=42)
    if model=='svr':
        # Make instance of model svm.SVR
        clf = svm.SVR().fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    error = mean_squared_error(y_test, y_pred)
    return error

def find_best_features(feature_selection_list, df, filepath, model_selection_list=['svr']):
    # set scoring dict
    score_dict = dict()
    # Loop through feature combinations and models
    for features in feature_selection_list:
        for model in model_selection_list:
            # Generate training instances
            all_instances, all_labels = generate_instances(df, columns=features)
            # Train and evaluate
            score = train_and_eval(all_instances, all_labels, model=model)
            # Write to dictionary
            key = tuple([tuple(features), model])
            score_dict[key] = score
            # Sort dictionary
            sorted_dict = sorted(score_dict.items(), key=lambda x:x[1])
            # Write to file
            with open(filepath, 'w') as outfile:
                for item in sorted_dict:
                    outfile.write(str(item)+'\n')

In [67]:
# # if __name__ == "__main__":
# df_patient = create_df_per_patient(filename_patient)
# possible_features = df_patient.columns
# feature_selection_list = []
# for i in range(1, 3): 
#     feature_selection_list_i = [list(x) for x in itertools.combinations(possible_features, i)]
#     feature_selection_list += feature_selection_list_i
# print(len(feature_selection_list))
# find_best_features(feature_selection_list[:5], df_patient, 'sorted_svr_scores_v4.txt')

In [90]:
filepath = 'sorted_features_total_population_selected.txt'
errors = []
all_instances = []
all_labels = []
model = 'svr'
score_dict = dict()
patient_filenames = glob.glob('./patientData/*')
possible_features = ['circumplex.arousal', 'circumplex.valence', 'mood', 'sms']
feature_selection_list = []
for i in range(1, len(possible_features)+1): 
    feature_selection_list_i = [list(x) for x in itertools.combinations(possible_features, i)]
    feature_selection_list += feature_selection_list_i
print(len(feature_selection_list))
for feature_selection in feature_selection_list:
    print(feature_selection)
    for filename_patient in patient_filenames:
        if feature_selection == ['appCat.utilities'] and filename_patient == './patientData\patientAS14.06.csv':
            continue
        print(filename_patient)
        df_patient = create_df_per_patient(filename_patient)
        #print(df_patient)
        # Generate instances
        all_instances = []
        all_labels = []
        patient_instances, patient_labels = generate_instances(df_patient, columns=feature_selection)
        all_instances += patient_instances
        all_labels += patient_labels

    # Train and evaluate
    score = train_and_eval(all_instances, all_labels, model=model)
    # Write to dictionary
    key = tuple([tuple(feature_selection), model])
    score_dict[key] = score
    print()
# Sort dictionary
sorted_dict = sorted(score_dict.items(), key=lambda x:x[1])
# Write to file
with open(filepath, 'w') as outfile:
    for item in sorted_dict:
        outfile.write(str(item)+'\n')
    
# errors.append(error)
# errors


15
['circumplex.arousal']
./patientData\patientAS14.01.csv
./patientData\patientAS14.02.csv
./patientData\patientAS14.03.csv
./patientData\patientAS14.05.csv
./patientData\patientAS14.06.csv
./patientData\patientAS14.07.csv
./patientData\patientAS14.08.csv
./patientData\patientAS14.09.csv
./patientData\patientAS14.12.csv
./patientData\patientAS14.13.csv
./patientData\patientAS14.14.csv
./patientData\patientAS14.15.csv
./patientData\patientAS14.16.csv
./patientData\patientAS14.17.csv
./patientData\patientAS14.19.csv
./patientData\patientAS14.20.csv
./patientData\patientAS14.23.csv
./patientData\patientAS14.24.csv
./patientData\patientAS14.25.csv
./patientData\patientAS14.26.csv
./patientData\patientAS14.27.csv
./patientData\patientAS14.28.csv
./patientData\patientAS14.29.csv
./patientData\patientAS14.30.csv
./patientData\patientAS14.31.csv
./patientData\patientAS14.32.csv
./patientData\patientAS14.33.csv

['circumplex.valence']
./patientData\patientAS14.01.csv
./patientData\patientAS14.

./patientData\patientAS14.32.csv
./patientData\patientAS14.33.csv

['mood', 'sms']
./patientData\patientAS14.01.csv
./patientData\patientAS14.02.csv
./patientData\patientAS14.03.csv
./patientData\patientAS14.05.csv
./patientData\patientAS14.06.csv
./patientData\patientAS14.07.csv
./patientData\patientAS14.08.csv
./patientData\patientAS14.09.csv
./patientData\patientAS14.12.csv
./patientData\patientAS14.13.csv
./patientData\patientAS14.14.csv
./patientData\patientAS14.15.csv
./patientData\patientAS14.16.csv
./patientData\patientAS14.17.csv
./patientData\patientAS14.19.csv
./patientData\patientAS14.20.csv
./patientData\patientAS14.23.csv
./patientData\patientAS14.24.csv
./patientData\patientAS14.25.csv
./patientData\patientAS14.26.csv
./patientData\patientAS14.27.csv
./patientData\patientAS14.28.csv
./patientData\patientAS14.29.csv
./patientData\patientAS14.30.csv
./patientData\patientAS14.31.csv
./patientData\patientAS14.32.csv
./patientData\patientAS14.33.csv

['circumplex.arousal', 'c

In [91]:
import winsound
frequency = 1700  # Set Frequency To 2500 Hertz
duration = 500  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)