In [4]:
import numpy as np
import pandas as pd
import glob
from datetime import timedelta
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
import itertools
import warnings
import math
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
def is_nan(x):
    '''
    Checks if x is nan or not.
    Taken from https://stackoverflow.com/questions/944700/how-can-i-check-for-nan-values.

    :param x: any type of item to check for if it is nan type

    Returns True if it is nan type, false if not.
    '''

    return (x is np.nan or x != x)

In [6]:
filename_patient_1 = './patientData/patientAS14.01.csv'
filename_patient_2 = './patientData/patientAS14.02.csv'

df_1 = create_df_per_patient(filename_patient_1)
df_2 = create_df_per_patient(filename_patient_2)
df_1

Unnamed: 0_level_0,activity,circumplex.arousal,circumplex.valence,mood,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,screen,sms
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2014-03-21,0.13405,0.2,0.2,6.2,3139.218,6280.89,1007.456,49.544,0.0,172.206,239.751,4508.5,915.445,0.0,598.754,0.0,6.0,17978.907,0.0
2014-03-22,0.23688,0.6,0.5,6.4,731.429,4962.918,93.324,21.076,0.0,0.0,98.143,439.632,37.305,0.0,117.621,0.0,3.0,6142.161,1.0
2014-03-23,0.142741,0.2,0.8,6.8,1286.246,5237.319,94.346,43.403,0.0,0.0,72.823,900.839,0.0,0.0,30.086,30.386,0.0,6773.832001,0.0
2014-03-24,0.078961,0.8,0.0,6.0,866.956,9270.629,976.971,34.106,0.0,3.01,66.558,3223.626,419.805,0.0,178.732,0.0,10.0,15047.351001,0.0
2014-03-25,0.098374,0.5,0.5,6.75,1032.768,10276.751,68.206,43.054,0.0,0.0,178.819,1919.471,0.0,235.223,222.893,0.0,0.0,21475.354999,1.0
2014-03-26,0.101308,-0.2,0.6,6.6,1167.497,8988.753,910.479,52.331,0.0,0.0,97.498,4592.059,0.0,0.0,33.365,0.0,0.0,16423.801,0.0
2014-03-27,0.159511,0.2,0.8,7.0,1229.327,6936.512,639.339,42.219,0.0,182.451,58.532,935.381,47.314,0.0,179.029,0.0,2.0,17442.149999,1.0
2014-03-28,0.095698,-0.6,0.6,6.4,10062.595,866.833,1005.403,89.166,233.036,0.0,225.951,512.741,1133.009,0.0,301.717,0.0,5.0,4923.489,0.0
2014-03-29,0.068203,0.2,1.0,8.0,1952.63,2720.692,1627.429,0.0,0.0,0.0,169.594,472.888,52.435,0.0,600.637,0.0,4.0,8322.622,1.0
2014-03-30,0.049093,-0.5,0.75,7.5,414.365,1298.505,1535.677,0.0,0.0,0.0,74.003,167.685,0.0,66.477,38.296,0.0,0.0,4523.214001,0.0


In [78]:
instances_1, labels_1 = generate_instances(df_2, columns=['activity', 'circumplex.arousal'])
instances_2, labels_2 = generate_instances(df_2, columns=['activity', 'circumplex.arousal'])
instances = instances_1 + instances_2
labels = labels_1 + labels_2
len(instances)
score = train_and_eval(instances, labels, model='svr')
score

0.6179630949022203

In [13]:
def create_df_per_patient(filename):
    df_patient = pd.read_csv(filename)
    # Edit dataframe
    #df_patient['month'] = df_patient.apply(lambda row: row.time[5:7], axis=1)
    #df_patient['day'] = df_patient.apply(lambda row: row.time[8:10], axis=1)
    #df_patient['call'].fillna(0, inplace=True)

    # Remove rows with no value for mood for whole day
    df_patient = df_patient[is_nan(df_patient['mood']) != True]
    df_patient = df_patient[is_nan(df_patient['screen']) != True]
    df_patient.fillna(0, inplace=True)
    df_patient['time'] = pd.to_datetime(df_patient['time'])
    df_patient.set_index(['time'], inplace=True)
    return df_patient

def generate_instances(df, columns=['mood']):
    number_of_days = 6
    all_instances = []
    all_labels = []
    for date_0, row_0 in df.iterrows():
        valid_data_count = 0
        instance = []
        for date, row in df.iterrows():
            for delta_days in range(1, number_of_days):
                #print(delta_days)
                if date_0 - timedelta(days=delta_days) == date:
                    selected_day_info = list(row[columns])
                    instance += selected_day_info
                    if not is_nan(row['mood']):
                        valid_data_count += 1
                    
        if valid_data_count == number_of_days-1:
            all_labels.append(row_0['mood'])
            all_instances.append(instance)

    assert len(all_labels)==len(all_instances), 'Amount of instances and amount of labels differ.'
    assert len(all_instances) > 0, 'No instances were created.'

    return all_instances, all_labels

def train_and_eval(instances, labels, model='svr'):
    X_train, X_test, y_train, y_test = train_test_split(instances, labels, test_size=0.33, random_state=42, shuffle=False)
    if model=='svr':
    # Make instance of model svm.SVR
        clf = svm.SVR().fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    error = math.sqrt(mean_squared_error(y_test, y_pred))
    print('rmse:', error)
    #print('accuracy:', accuracy_score(y_test, y_pred))
    return error

def find_best_features(feature_selection_list, df, filepath, model_selection_list=['svr']):
    # set scoring dict
    score_dict = dict()
    # Loop through feature combinations and models
    for features in feature_selection_list:
        for model in model_selection_list:
            # Generate training instances
            all_instances, all_labels = generate_instances(df, columns=features)
            # Train and evaluate
            score = train_and_eval(all_instances, all_labels, model=model)
            # Write to dictionary
            key = tuple([tuple(features), model])
            score_dict[key] = score
            # Sort dictionary
            sorted_dict = sorted(score_dict.items(), key=lambda x:x[1])
            # Write to file
            with open(filepath, 'w') as outfile:
                for item in sorted_dict:
                    outfile.write(str(item)+'\n')

In [9]:
# # if __name__ == "__main__":
# df_patient = create_df_per_patient(filename_patient)
# possible_features = df_patient.columns
# feature_selection_list = []
# for i in range(1, 3): 
#     feature_selection_list_i = [list(x) for x in itertools.combinations(possible_features, i)]
#     feature_selection_list += feature_selection_list_i
# print(len(feature_selection_list))
# find_best_features(feature_selection_list[:5], df_patient, 'sorted_svr_scores_v4.txt')

In [10]:
# For all patients
errors = []
all_instances = []
all_labels = []
model = 'svr'
score_dict = dict()
patient_filenames = glob.glob('./patientData/*')
features = ['circumplex.arousal', 'circumplex.valence', 'mood', 'sms']

for filename_patient in patient_filenames:
    print(filename_patient)
    df_patient = create_df_per_patient(filename_patient)
    #print(df_patient)
    # Generate instances
    all_instances = []
    all_labels = []
    patient_instances, patient_labels = generate_instances(df_patient, columns=features)
    all_instances += patient_instances
    all_labels += patient_labels

# Train and evaluate
score = train_and_eval(all_instances, all_labels, model=model)
score

    
# errors.append(error)
# errors


./patientData\patientAS14.01.csv
./patientData\patientAS14.02.csv
./patientData\patientAS14.03.csv
./patientData\patientAS14.05.csv
./patientData\patientAS14.06.csv
./patientData\patientAS14.07.csv
./patientData\patientAS14.08.csv
./patientData\patientAS14.09.csv
./patientData\patientAS14.12.csv
./patientData\patientAS14.13.csv
./patientData\patientAS14.14.csv
./patientData\patientAS14.15.csv
./patientData\patientAS14.16.csv
./patientData\patientAS14.17.csv
./patientData\patientAS14.19.csv
./patientData\patientAS14.20.csv
./patientData\patientAS14.23.csv
./patientData\patientAS14.24.csv
./patientData\patientAS14.25.csv
./patientData\patientAS14.26.csv
./patientData\patientAS14.27.csv
./patientData\patientAS14.28.csv
./patientData\patientAS14.29.csv
./patientData\patientAS14.30.csv
./patientData\patientAS14.31.csv
./patientData\patientAS14.32.csv
./patientData\patientAS14.33.csv
rmse: 0.6498125151119458


0.42225630479611287

In [14]:
errors = []
# For each individual patient
for filename_patient in patient_filenames:
    print(filename_patient)
    df_patient = create_df_per_patient(filename_patient)
    #print(df_patient)
    # Generate instances
    all_instances = []
    all_labels = []
    patient_instances, patient_labels = generate_instances(df_patient, columns=features)

    # Train and evaluate
    score = train_and_eval(patient_instances, patient_labels, model=model)
    errors.append(score)
    print(score)

np.mean(errors)    
# errors.append(error)

./patientData\patientAS14.01.csv
rmse: 0.7517670470815608
0.7517670470815608
./patientData\patientAS14.02.csv
rmse: 0.603261429755035
0.603261429755035
./patientData\patientAS14.03.csv
rmse: 0.3823690645287424
0.3823690645287424
./patientData\patientAS14.05.csv
rmse: 0.45845785481705414
0.45845785481705414
./patientData\patientAS14.06.csv
rmse: 0.5934452008262131
0.5934452008262131
./patientData\patientAS14.07.csv
rmse: 1.1353873656187814
1.1353873656187814
./patientData\patientAS14.08.csv
rmse: 0.6246241880307185
0.6246241880307185
./patientData\patientAS14.09.csv
rmse: 0.6844435684301842
0.6844435684301842
./patientData\patientAS14.12.csv
rmse: 0.6191704145510807
0.6191704145510807
./patientData\patientAS14.13.csv
rmse: 1.2848724912488334
1.2848724912488334
./patientData\patientAS14.14.csv
rmse: 0.48719726434924077
0.48719726434924077
./patientData\patientAS14.15.csv
rmse: 0.632326853750161
0.632326853750161
./patientData\patientAS14.16.csv
rmse: 0.8647420833079895
0.8647420833079895

0.6159976169093043

In [15]:
import winsound
frequency = 1700  # Set Frequency To 2500 Hertz
duration = 500  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)