In [68]:
import pandas as pd
import numpy as np

### Context functions

Functions to extract context features from raw files

In [66]:
def get_activity(user_dir, dt):
    """
    Return the activity vector logged closest in time to ``dt``.

    Reads ``<user_dir>/activities.csv``, parses its ``time`` column as
    milliseconds since the Unix epoch, sorts by time, drops rows with a
    repeated timestamp and selects the row whose timestamp is nearest to ``dt``.

    Parameters:
        user_dir: path of the user's directory containing ``activities.csv``
        dt: reference timestamp (e.g. a ``pd.Timestamp``)

    Returns: normalized vector of activities (numpy array holding every
        non-time column of the selected row divided by 100)
    Features: in_vehicle, on_bicycle, on_foot, running, still, tilting, walking, unknown
    """
    df = (
        pd.read_csv(user_dir + '/activities.csv', header=0)
        .assign(time=lambda frame: pd.to_datetime(frame['time'], unit='ms'))
        .sort_values('time')
        .drop_duplicates(subset='time', keep="first")
        .set_index('time')
    )

    # Index.get_loc(..., method='nearest') is no longer accepted by pandas 2.x;
    # get_indexer performs the same nearest-neighbour lookup. It needs a
    # sorted, unique index, which the steps above guarantee.
    nearest_pos = df.index.get_indexer([dt], method='nearest')[0]

    # The max value of each feature is 100
    return df.iloc[[nearest_pos]].values[0] / 100.0


def get_audio_status(user_dir, dt):
    """
    Return the audio-status vector logged closest in time to ``dt``.

    Reads ``<user_dir>/audio.csv``, parses its ``time`` column as
    milliseconds since the Unix epoch, sorts by time, drops rows with a
    repeated timestamp and selects the row whose timestamp is nearest to ``dt``.

    Parameters:
        user_dir: path of the user's directory containing ``audio.csv``
        dt: reference timestamp (e.g. a ``pd.Timestamp``)

    Returns: audio features (numpy array with the non-time columns of the
        selected row, boolean values converted to 0/1)
    Features: ringer_mode, alarm_volume, music_volume, notifications_volume, ring_volume, bt_sco_connected, mic_muted, music_active, speaker_on, headset_connected
    """
    df = (
        pd.read_csv(user_dir + '/audio.csv', header=0)
        .assign(time=lambda frame: pd.to_datetime(frame['time'], unit='ms'))
        .sort_values('time')
        .drop_duplicates(subset='time', keep="first")
        .set_index('time')
    )

    # Find the closest row.
    # Index.get_loc(..., method='nearest') is no longer accepted by pandas 2.x;
    # get_indexer performs the same nearest-neighbour lookup on the sorted,
    # de-duplicated index built above.
    nearest_pos = df.index.get_indexer([dt], method='nearest')[0]
    closest = df.iloc[[nearest_pos]].values[0].tolist()

    # Convert boolean into int, if necessary. The equality comparisons are
    # deliberate: they also map numeric 0/1 values onto the ints 0/1 and leave
    # every other value untouched.
    closest = [0 if e == False else 1 if e == True else e for e in closest]

    return np.asarray(closest)

### Process users' data

In [2]:
DATA_SOURCE = '../Datasets/MDF/'
N_USERS = 31  # user directories are named user_0 ... user_30

# Column names (rebuilt for every user from the CSV headers)
header = []

# One feature table per user, keyed by the user number
features_by_user = {}

for user in range(N_USERS):
    user_dir = DATA_SOURCE + 'user_' + str(user)

    # We use activities log as starting point
    df = pd.read_csv(user_dir + '/activities.csv', header=0)
    df['time'] = pd.to_datetime(df['time'], unit='ms')
    df = df.sort_values('time').drop_duplicates(subset='time', keep="first")

    # Both helpers return every non-time column of their CSV in file order,
    # so the row layout is: activity columns followed by audio columns.
    # nrows=0 reads only the header line of the audio log.
    audio_columns = pd.read_csv(user_dir + '/audio.csv', header=0, nrows=0).columns
    header = (
        [col for col in df.columns if col != 'time']
        + [col for col in audio_columns if col != 'time']
    )

    rows = []

    # For each timestamp, get the closest sensors data.
    # NOTE: get_activity / get_audio_status re-read their CSV on every call,
    # so this loop is slow for long logs.
    for dt in df['time']:
        activities = get_activity(user_dir=user_dir, dt=dt)
        # Look the audio status up at the current timestamp. Using
        # df['time'][0] here would give every row the same audio snapshot.
        audio_status = get_audio_status(user_dir, dt)

        rows.append(activities.tolist() + audio_status.tolist())

    features_by_user[user] = pd.DataFrame(rows, columns=header)