# MDF dataset

In [None]:
import pandas as pd
import numpy as np
import pprint
import os
from tqdm import tqdm

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 20

## Merge dataset

In [None]:
def filter_wifi_scans(folder_path):
    """
    Collapse the wifi scans of one folder to a single row per timestamp.

    Reads <folder_path>/wifi_scans.csv, groups its rows by ``time`` and marks a
    timestamp as connected when at least one of its rows has ``connected`` set.
    The result is written to <folder_path>/wifi_scans2.csv.
    Nothing is done when wifi_scans2.csv is already present.
    """
    target_csv = folder_path + '/wifi_scans2.csv'
    if not os.path.isfile(target_csv):
        scans = pd.read_csv(folder_path + '/wifi_scans.csv')
        per_timestamp = scans[['time', 'connected']].groupby(['time'], as_index=False)
        # any() is True for a group as soon as one of its entries is True
        per_timestamp.any().to_csv(target_csv, index=False)

In [None]:
def get_closest_row(file_path, columns, dt):
    """
    Find the row of a CSV file whose ``time`` column is closest to ``dt``.

    The file's ``time`` column is read as milliseconds and converted to
    datetimes; rows sharing a timestamp are reduced to the one that appears
    first in the file.

    :file_path: CSV file location on disk
    :columns: list of columns to read when opening the file; the returned
              values follow the order of this list
    :dt: datetime-like value to look up (compared against the converted
         ``time`` column, so it must be a timestamp rather than raw ms)
    :return: values of ``columns`` for the closest row, as a numpy array
    """
    columns = list(columns)
    df = pd.read_csv(file_path, header=0, usecols=['time'] + columns)  # read only selected CSV columns + time column
    df['time'] = pd.to_datetime(df['time'], unit='ms')  # convert from ms to date
    # Stable sort, so that keep="first" always keeps the row that comes first in the file.
    df = df.sort_values('time', kind='mergesort')
    df = df.drop_duplicates(subset='time', keep="first")
    # read_csv ignores the order of `usecols` and keeps the file's column order;
    # select explicitly so the values line up with `columns`.
    df = df.set_index('time')[columns]
    # Index.get_loc(..., method=...) was removed in pandas 2.0; get_indexer is
    # the supported way to do a nearest-neighbour lookup on the index.
    position = df.index.get_indexer([dt], method='nearest')[0]
    closest = df.iloc[[position]].values[0]
    return np.asarray(closest)

In [None]:
# Context sources to merge, structured as {csv file name: columns to read from it}.
# The insertion order of files and columns is the column order of the merged dataframe.
file_dict = {
    'activities.csv': [
        'in_vehicle', 'on_bicycle', 'on_foot', 'running',
        'still', 'tilting', 'walking', 'unknown',
    ],
    'audio.csv': [
        'ringer_mode', 'alarm_volume', 'music_volume', 'notifications_volume', 'ring_volume',
        'bt_sco_connected', 'mic_muted', 'music_active', 'speaker_on', 'headset_connected',
    ],
    'battery.csv': ['level', 'charging'],
    'display.csv': ['state', 'rotation'],
    'weather.csv': [
        'weather_id', 'temp', 'temp_min', 'temp_max', 'humidity', 'pressure', 'wind_speed',
        'wind_deg', 'clouds', 'rain_last3h', 'snow_last3h', 'sunrise_time', 'sunset_time',
    ],
    'wifi_scans2.csv': ['connected'],
    'location.csv': ['label', 'place_type'],
}

In [None]:
# Root folder holding one `user_<id>` sub-folder per user
data_path = 'Datasets/MDF/'

# Package names of system apps (launchers, package managers, settings, OTA updaters, ...)
# that are filtered out of the running-apps data.
ignored_apps = [
    'it.cnr.iit.sensapp',
    'com.android.systemui',
    'com.sec.android.app.launcher',
    'com.android.settings',
    'com.android.vending',
    'com.android.captiveportallogin',
    'com.google.android.packageinstaller',
    'com.teslacoilsw.launcher',
    'com.android.packageinstaller',
    'com.samsung.android.MtpApplication',
    'com.sec.android.emergencylauncher',
    'com.wssyncmldm',
    'com.huawei.android.launcher',
    'com.huawei.systemmanager',
    'com.asus.launcher',
    'android',
    'com.asus.ime',
    'com.asus.dm',
    'com.cyanogenmod.trebuchet',
    'org.cyanogenmod.resolver',
    'com.android.launcher3',
    'com.oneplus.ota',
    'com.samsung.android.game.gametools',
    'com.samsung.android.app.galaxyfinder',
    'com.huawei.gamebox.global',
    'com.sec.android.inputmethod',
    'com.android.phone',
    'com.samsung.android.scloud',
    'com.huawei.android.internal.app',
]

In [None]:
# Build wifi_scans2.csv inside each of the 31 user folders (user_0 ... user_30)
for user in range(31):
    user_dir = f"{data_path}user_{user}"
    filter_wifi_scans(user_dir)

In [None]:
def _load_context(file_path, columns):
    """Load one context CSV once: unique, time-sorted datetime index, columns in the requested order."""
    ctx = pd.read_csv(file_path, header=0, usecols=['time'] + list(columns))
    ctx['time'] = pd.to_datetime(ctx['time'], unit='ms')  # convert from ms to datetime
    # Stable sort so keep="first" keeps the row that appears first in the file.
    ctx = ctx.sort_values('time', kind='mergesort').drop_duplicates(subset='time', keep="first")
    # read_csv keeps the file's column order regardless of `usecols`; select explicitly.
    return ctx.set_index('time')[list(columns)]


# Merge, for every user, the running-apps log with the context row of each
# file in file_dict that is closest in time to the app record.
user_frames = []
for user in range(2):  # foreach user folder
    print(f"working on user {user}...")
    user_dir = data_path + 'user_' + str(user)

    # The running apps dataframe is the starting point of the merge.
    df1 = pd.read_csv(user_dir + '/running_apps.csv', header=0)
    df1 = df1[~df1['app'].isin(ignored_apps)].copy()  # ignore system apps; copy to avoid assigning on a slice
    df1['time'] = pd.to_datetime(df1['time'], unit='ms')  # convert date from ms to datetime
    df1.sort_values('time', inplace=True)
    df1.drop_duplicates(subset='time', keep="first", inplace=True)
    df1.reset_index(drop=True, inplace=True)
    df1.insert(1, 'user', user)  # insert user ID column

    # Each context file is read once per user and all app timestamps are
    # matched in a single vectorised nearest lookup, instead of re-reading
    # every CSV for every running-apps row.
    context_parts = []
    for filename, columns in tqdm(file_dict.items(), total=len(file_dict)):
        ctx = _load_context(user_dir + '/' + filename, columns)
        nearest = ctx.index.get_indexer(df1['time'], method='nearest')
        context_parts.append(ctx.iloc[nearest].reset_index(drop=True))

    df2 = pd.concat(context_parts, axis=1)  # all the context features, one row per app record
    df3 = pd.concat([df1, df2], axis=1)  # concat by column
    user_frames.append(df3)

# Concatenate once at the end: concatenating inside the loop copies the
# accumulated frame on every iteration and starts from an empty DataFrame.
df = pd.concat(user_frames, axis=0) if user_frames else pd.DataFrame()  # concat by row
df.reset_index(drop=True, inplace=True)
print("done!")

In [None]:
# Show the merged dataframe (a bare expression at the end of a notebook cell is rendered as output)
df

In [None]:
# df.info(memory_usage='deep')

In [None]:
# common_apps = """com.whatsapp com.instagram.android com.facebook.katana com.android.chrome com.google.android.youtube""".split()
# a = pd.read_csv(data_path+'user_18/running_apps.csv')
# a = a[~a['app'].isin(ignored_apps+common_apps)]
# a[a.category.isnull()]

## Extract new features

In [111]:
def daytime_from_date(date):
    """
    Map a date to the part of the day its ``hour`` attribute falls in.

    Hours 5-12 give 'morning', 13-18 'afternoon', 19-22 'evening';
    any other hour gives 'night'.
    """
    hour = date.hour
    # (first hour, last hour, label), both bounds inclusive
    periods = (
        (5, 12, 'morning'),
        (13, 18, 'afternoon'),
        (19, 22, 'evening'),
    )
    for first_hour, last_hour, label in periods:
        if first_hour <= hour <= last_hour:
            return label
    return 'night'
    
def weekday_from_date(date):
    """Return the full weekday name of ``date`` (``%A`` format directive, e.g. 'Saturday')."""
    weekday_name = f"{date:%A}"
    return weekday_name

In [112]:
# Derive the categorical time features from the timestamp of each row,
# then show the last 20 rows to check the result.
for feature_name, extractor in (('daytime', daytime_from_date), ('weekday', weekday_from_date)):
    df[feature_name] = df['time'].apply(extractor)
df.tail(20)

Unnamed: 0,time,user,app,category,in_vehicle,on_bicycle,on_foot,running,still,tilting,walking,unknown,ringer_mode,alarm_volume,music_volume,notifications_volume,ring_volume,bt_sco_connected,mic_muted,music_active,speaker_on,headset_connected,level,charging,state,rotation,weather_id,temp,temp_min,temp_max,humidity,pressure,wind_speed,wind_deg,clouds,rain_last3h,snow_last3h,sunrise_time,sunset_time,connected,label,place_type,daytime,weekday
553,2018-04-21 18:47:50.143,1,com.twitter.android,NEWS_AND_MAGAZINES,0,0,0,0,100,0,0,0,0,0.733333,0.466667,0.0,0.0,False,False,False,False,False,0.1,0.0,1,0,800.0,12.0,12.0,12.0,100.0,1018.0,1.5,130.0,0.0,0.0,0.0,1524371000.0,1524421000.0,False,home,home,afternoon,Saturday
554,2018-04-21 19:02:50.150,1,com.android.mms,,0,0,0,0,100,0,0,0,0,0.733333,0.466667,0.0,0.0,False,False,False,False,False,0.11,1.0,1,0,800.0,12.0,12.0,12.0,100.0,1018.0,1.5,130.0,0.0,0.0,0.0,1524371000.0,1524421000.0,False,home,home,evening,Saturday
555,2018-04-22 08:57:50.630,1,com.twitter.android,NEWS_AND_MAGAZINES,0,0,0,0,100,0,0,0,0,0.733333,0.466667,0.0,0.0,False,False,False,False,False,0.95,0.0,2,0,800.0,14.0,14.0,14.0,93.0,1018.0,1.5,140.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
556,2018-04-22 09:02:50.635,1,com.twitter.android,NEWS_AND_MAGAZINES,0,0,0,0,100,0,0,0,0,0.733333,0.466667,0.0,0.0,False,False,False,False,False,0.94,0.0,1,0,800.0,14.0,14.0,14.0,93.0,1018.0,1.5,140.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
557,2018-04-22 10:07:50.679,1,com.instagram.android,SOCIAL,0,0,0,0,100,0,0,0,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.92,0.0,1,0,800.0,21.0,21.0,21.0,56.0,1019.0,3.6,270.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
558,2018-04-22 10:12:50.683,1,com.samsung.android.weather,,0,0,0,0,100,0,0,0,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.91,0.0,1,0,800.0,21.0,21.0,21.0,56.0,1019.0,3.6,270.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
559,2018-04-22 10:17:50.686,1,com.sec.android.app.sbrowser,COMMUNICATION,0,0,0,0,100,0,0,0,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.9,0.0,2,0,800.0,21.0,21.0,21.0,56.0,1019.0,3.6,270.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
560,2018-04-22 10:22:50.687,1,com.instagram.android,SOCIAL,0,0,0,0,0,100,0,0,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.9,0.0,1,0,800.0,21.0,21.0,21.0,56.0,1019.0,3.6,270.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
561,2018-04-22 10:27:50.692,1,com.instagram.android,SOCIAL,0,0,0,0,100,0,0,0,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.9,0.0,1,0,800.0,21.0,21.0,21.0,56.0,1019.0,3.6,270.0,0.0,0.0,0.0,1524371000.0,1524421000.0,True,home,home,morning,Sunday
562,2018-04-22 17:32:51.018,1,com.instagram.android,SOCIAL,10,10,10,10,10,0,10,40,0,0.733333,1.0,0.0,0.0,False,False,False,False,False,0.77,0.0,2,0,800.0,20.0,20.0,20.0,68.0,1017.0,4.1,280.0,0.0,0.0,0.0,1524371000.0,1524421000.0,False,home,home,afternoon,Sunday
