In [None]:
import pandas as pd
import numpy as np
import pprint
import os

In [None]:
def get_closest_row(file_path, columns, dt):
    """Return the values of the CSV row whose timestamp is nearest to ``dt``.

    The file is read with its first line as header, the ``time`` column is parsed
    as epoch milliseconds, rows are sorted by time and only the first row per
    timestamp is kept before the lookup.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read; it must contain a ``time`` column.
    columns : list of str
        Currently not used by the lookup: the values of every non-``time`` column
        are returned, in file order.
        NOTE(review): callers pass the expected column names here; confirm whether
        the result should be restricted to (or validated against) them.
    dt : datetime-like
        Timestamp to look up.

    Returns
    -------
    numpy.ndarray
        1-D array holding the values of all non-``time`` columns of the nearest row.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    df = pd.read_csv(file_path, header=0)
    df['time'] = pd.to_datetime(df['time'], unit='ms')
    # A unique, ascending index is required for a nearest-neighbour lookup.
    df = (df.sort_values('time')
            .drop_duplicates(subset='time', keep='first')
            .set_index('time'))
    if df.empty:
        raise ValueError(f"no rows to match against in {file_path}")

    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer performs the same nearest lookup and works on old and new versions.
    position = df.index.get_indexer(pd.DatetimeIndex([dt]), method='nearest')[0]
    closest = df.iloc[[position]].values[0]
    return np.asarray(closest)

In [None]:
# Context CSV files of each user, mapped to the names given to their feature columns.
# Insertion order matters: it fixes the column order of the assembled dataset.
file_dict = {
    "activities.csv": [
        "in_veichle",
        "on_bicycle",
        "on_foot",
        "running",
        "still",
        "tilting",
        "walking",
        "unknown",
    ],
    "audio.csv": [
        "ringer_mode",
        "alarm_volume",
        "music_volume",
        "notifications_volume",
        "ring_volume",
        "bt_sco_connected",
        "mic_muted",
        "music_active",
        "speaker_on",
        "headset_connected",
    ],
    "battery.csv": ["level", "charging"],
    "display.csv": ["state", "rotation"],
}

In [None]:
# Root folder that holds one `user_<id>` directory per user
data_path = 'Datasets/MDF/'

# Package names filtered out of the running-apps data (launchers, settings, installers, ...)
ignored_apps = [
    'it.cnr.iit.sensapp',
    'com.android.systemui',
    'com.sec.android.app.launcher',
    'com.android.settings',
    'com.android.vending',
    'com.android.captiveportallogin',
    'com.google.android.packageinstaller',
    'com.teslacoilsw.launcher',
    'com.android.packageinstaller',
    'com.samsung.android.MtpApplication',
    'com.sec.android.emergencylauncher',
    'com.wssyncmldm',
    'com.huawei.android.launcher',
    'com.huawei.systemmanager',
    'com.asus.launcher',
    'android',
]

In [None]:
def _load_context(file_path):
    """Read one context CSV into a frame indexed by unique, ascending timestamps."""
    context = pd.read_csv(file_path, header=0)
    context['time'] = pd.to_datetime(context['time'], unit='ms')  # epoch ms -> datetime
    return (context.sort_values('time')
                   .drop_duplicates(subset='time', keep='first')
                   .set_index('time'))


# Build one frame per user (running apps + nearest-in-time context features),
# then stack all users into `df`.
user_frames = []
for user in range(10):
    print(f"working on user {user}...")
    user_dir = data_path + 'user_' + str(user)

    # read running apps dataframe and use it as a starting point
    df1 = pd.read_csv(user_dir + '/running_apps.csv', header=0)
    # ignore system apps like launcher, updates, MTP...; copy so the assignments
    # below act on an independent frame instead of a slice of the raw one
    df1 = df1[~df1['app'].isin(ignored_apps)].copy()
    df1['time'] = pd.to_datetime(df1['time'], unit='ms')  # convert date from ms to datetime
    df1 = (df1.sort_values('time')
              .drop_duplicates(subset='time', keep='first')
              .reset_index(drop=True))
    df1.insert(1, 'user', user)  # insert user ID column

    # For every context file: load it ONCE, then find for all app timestamps at once
    # the position of the nearest context row (instead of re-reading the CSV per row).
    app_times = pd.DatetimeIndex(df1['time'])
    context_parts = []
    for filename, columns in file_dict.items():
        context = _load_context(user_dir + '/' + filename)
        nearest = context.index.get_indexer(app_times, method='nearest')
        matched = context.iloc[nearest].reset_index(drop=True)
        # Columns are labelled by position with the names from file_dict;
        # a count mismatch raises here.
        matched.columns = list(columns)
        context_parts.append(matched)

    df2 = pd.concat(context_parts, axis=1)  # all context features, one row per app row
    df3 = pd.concat([df1, df2], axis=1)     # concat by column
    user_frames.append(df3)

# Single concat by row; the index restarts at 0 for every user (not reset on purpose).
df = pd.concat(user_frames, axis=0)
print("done!")

In [None]:
# Cap rendered frames at 10 rows, then show the assembled dataset as the cell output
pd.set_option('display.max_rows', 10)
df

In [None]:
# Summarise the assembled dataset; memory_usage='deep' asks pandas for the deep
# (object-introspecting) memory estimate rather than the default one
df.info(memory_usage='deep')