In [1]:
import pandas as pd
import numpy as np
import pprint
import os

In [2]:
def get_closest_row(file_path, columns, dt):
    """Return the values of the row whose timestamp is nearest to ``dt``.

    The CSV at ``file_path`` must have a ``time`` column holding epoch
    milliseconds. Rows are ordered by time and, when several rows share a
    timestamp, only the one appearing first in the file is kept.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read (first line is the header).
    columns : sequence of str
        Names of the columns to return, in the order they should appear.
    dt : datetime-like
        Reference instant; anything ``pd.Timestamp`` accepts.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with the values of ``columns`` for the nearest row.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    KeyError
        If ``time`` or any of ``columns`` is missing from the file.
    """
    frame = pd.read_csv(file_path, header=0)
    if frame.empty:
        raise ValueError(f"no rows to search in {file_path!r}")

    frame['time'] = pd.to_datetime(frame['time'], unit='ms')
    frame = (
        frame
        # A stable sort keeps file order among equal timestamps, so that
        # keep="first" below is deterministic.
        .sort_values('time', kind='mergesort')
        .drop_duplicates(subset='time', keep="first")
        .set_index('time')
    )

    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer is the supported equivalent and needs the unique,
    # monotonic index prepared above.
    target = pd.DatetimeIndex([pd.Timestamp(dt)])
    nearest_pos = frame.index.get_indexer(target, method='nearest')[0]

    # Restrict to the requested columns so the result always lines up with
    # the caller's column list, even if the file has extra columns.
    closest = frame[list(columns)].iloc[[nearest_pos]].values[0]
    return np.asarray(closest)

In [3]:
# Context CSV file name -> feature columns taken from that file.
# Insertion order is significant: it defines the column order of the merged frame.
file_dict = {
    'activities.csv': [
        'in_veichle',  # (sic) this exact spelling is the column name used downstream
        'on_bicycle',
        'on_foot',
        'running',
        'still',
        'tilting',
        'walking',
        'unknown',
    ],
    'audio.csv': [
        'ringer_mode',
        'alarm_volume',
        'music_volume',
        'notifications_volume',
        'ring_volume',
        'bt_sco_connected',
        'mic_muted',
        'music_active',
        'speaker_on',
        'headset_connected',
    ],
    'battery.csv': ['level', 'charging'],
    'display.csv': ['state', 'rotation'],
}

In [4]:
# Root folder holding one `user_<id>` sub-directory per user.
data_path = 'Datasets/MDF/'

# Package names dropped from the running-apps log before merging.
ignored_apps = [
    'it.cnr.iit.sensapp',
    'com.android.systemui',
    'com.sec.android.app.launcher',
    'com.android.settings',
    'com.android.vending',
    'com.android.captiveportallogin',
    'com.google.android.packageinstaller',
    'com.teslacoilsw.launcher',
    'com.android.packageinstaller',
    'com.samsung.android.MtpApplication',
    'com.sec.android.emergencylauncher',
    'com.wssyncmldm',
    'com.huawei.android.launcher',
    'com.huawei.systemmanager',
    'com.asus.launcher',
    'android',
]

In [5]:
# Build one frame holding, for every running-app record of every user, the
# nearest-in-time context features from each file listed in `file_dict`.
user_frames = []  # one merged frame per user; concatenated once after the loop
for user in range(10):
    print(f"working on user {user}...")
    user_dir = os.path.join(data_path, 'user_' + str(user))

    # Running-apps log is the backbone: one output row per remaining record.
    apps = pd.read_csv(os.path.join(user_dir, 'running_apps.csv'), header=0)
    # .copy() so the assignments below act on an independent frame rather
    # than a slice of the raw one (avoids SettingWithCopyWarning).
    apps = apps[~apps['app'].isin(ignored_apps)].copy()  # ignore system apps like launcher, updates, MTP...
    apps['time'] = pd.to_datetime(apps['time'], unit='ms')  # convert date from ms to datetime
    apps = (
        apps
        # stable sort keeps file order among equal timestamps, so keep="first" is deterministic
        .sort_values('time', kind='mergesort')
        .drop_duplicates(subset='time', keep="first")
        .reset_index(drop=True)
    )
    apps.insert(1, 'user', user)  # insert user ID column

    # Each context file is read and indexed once per user; the nearest row
    # for all app timestamps is then looked up in a single vectorized call.
    context_parts = []
    for filename, columns in file_dict.items():
        context = pd.read_csv(os.path.join(user_dir, filename), header=0)
        context['time'] = pd.to_datetime(context['time'], unit='ms')
        context = (
            context
            .sort_values('time', kind='mergesort')
            .drop_duplicates(subset='time', keep="first")
            .set_index('time')
        )
        # get_indexer(method='nearest') needs the unique, sorted index built above.
        nearest_pos = context.index.get_indexer(apps['time'], method='nearest')
        selected = context[list(columns)].iloc[nearest_pos]
        # Going through plain Python lists keeps the column dtypes identical
        # to building the rows one at a time.
        context_parts.append(pd.DataFrame(selected.to_numpy().tolist(), columns=columns))

    # All parts share the same 0..n-1 index, so this is a column-wise join.
    user_frames.append(pd.concat([apps] + context_parts, axis=1))

# The index is intentionally not reset: it restarts at 0 for every user.
df = pd.concat(user_frames, axis=0)
print("done!")

working on user 0...
working on user 1...
working on user 2...
working on user 3...
working on user 4...
working on user 5...
working on user 6...
working on user 7...
working on user 8...
working on user 9...
done!


In [6]:
# Cap the rendered table at 10 rows, then show the merged frame.
pd.set_option('display.max_rows', 10)
df

Unnamed: 0,time,user,app,category,in_veichle,on_bicycle,on_foot,running,still,tilting,...,ring_volume,bt_sco_connected,mic_muted,music_active,speaker_on,headset_connected,level,charging,state,rotation
0,2018-04-24 06:07:19.466,0,com.android.chrome,COMMUNICATION,8,9,14,8,23,0,...,1.0,False,False,False,False,False,0.82,0.0,2,0
1,2018-04-24 06:12:19.487,0,com.twitter.android,NEWS_AND_MAGAZINES,8,7,35,7,9,0,...,1.0,False,False,False,False,False,0.80,0.0,2,0
2,2018-04-24 06:17:19.502,0,com.whatsapp,COMMUNICATION,27,8,8,8,11,0,...,0.0,False,False,False,False,False,0.77,0.0,1,0
3,2018-04-24 06:57:19.368,0,com.whatsapp,COMMUNICATION,7,8,13,5,39,0,...,0.0,False,False,False,False,False,0.74,0.0,1,0
4,2018-04-24 07:42:19.457,0,com.spotify.music,MUSIC_AND_AUDIO,16,10,10,7,18,0,...,0.0,False,False,False,False,False,0.72,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,2018-04-24 06:35:00.851,9,com.android.launcher3,,25,8,8,8,8,0,...,0.0,False,False,True,False,True,0.88,0.0,2,0
69,2018-04-24 06:40:00.853,9,com.spotify.musid,,27,9,14,5,17,0,...,0.0,False,False,True,False,True,0.87,0.0,1,0
70,2018-04-24 06:50:00.697,9,com.spotify.musid,,1,6,88,0,2,0,...,0.0,False,False,True,False,True,0.86,0.0,2,0
71,2018-04-24 07:12:25.843,9,com.android.launcher3,,6,3,22,0,65,0,...,0.0,False,False,False,False,False,0.84,0.0,2,0


In [8]:
# Column dtypes and non-null counts; memory_usage='deep' makes pandas
# inspect object columns to report their actual memory footprint.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7921 entries, 0 to 72
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   time                  7921 non-null   datetime64[ns]
 1   user                  7921 non-null   int64         
 2   app                   7921 non-null   object        
 3   category              7024 non-null   object        
 4   in_veichle            7921 non-null   int64         
 5   on_bicycle            7921 non-null   int64         
 6   on_foot               7921 non-null   int64         
 7   running               7921 non-null   int64         
 8   still                 7921 non-null   int64         
 9   tilting               7921 non-null   int64         
 10  walking               7921 non-null   int64         
 11  unknown               7921 non-null   int64         
 12  ringer_mode           7921 non-null   int64         
 13  alarm_volume        