# MDF dataset

In [None]:
import pandas as pd
import numpy as np
import pprint
import os
from tqdm import tqdm
import holidays

# Notebook display settings: allow up to 1000 columns when rendering a dataframe,
# and truncate the rendering of dataframes longer than 20 rows
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 20

## Merge datasets

In [None]:
def fix_nan_category(df):
    """
    Assign (or override) the category of some well-known apps, in place.

    Apps are recognised by a substring of their package name. Camera and gallery
    apps are additionally renamed to 'camera' / 'gallery', since the apps shipped
    by the various brands are considered equivalent.

    :df: dataframe with the columns 'app' (package name) and 'category'; modified in place
    :return: None
    """
    def name_has(fragment):
        # Literal substring match: package names are full of '.', which must not be
        # interpreted as a regex wildcard. Rows with a missing app name never match
        # (na=False), instead of producing an unusable NaN mask.
        return df['app'].str.contains(fragment, regex=False, na=False)

    # NOTE: the masks are evaluated one after the other on purpose, an app renamed
    # to 'camera' must not be matched again by the rules that follow.
    is_camera = name_has('camera')
    df.loc[is_camera, 'category'] = 'PHOTOGRAPHY'  # change category from NaN
    df.loc[is_camera, 'app'] = 'camera'  # all camera apps from various brands are equivalent

    # incallui is the interface shown during a call
    df.loc[name_has('com.android.incallui'), 'category'] = 'COMMUNICATION'

    df.loc[name_has('mail'), 'category'] = 'PRODUCTIVITY'
    # change gmail category from communication to productivity
    df.loc[name_has('com.google.android.gm'), 'category'] = 'PRODUCTIVITY'

    is_gallery = name_has('gallery')
    df.loc[is_gallery, 'category'] = 'PHOTOGRAPHY'  # change category from NaN
    df.loc[is_gallery, 'app'] = 'gallery'  # all gallery apps from various brands are equivalent

In [None]:
def filter_wifi_scans(folder_path):
    """
    Collapse wifi_scans.csv to one row per scan time and save it as wifi_scans2.csv.

    Each time value is marked connected == True when at least one of its rows in
    wifi_scans.csv has connected == True. Nothing is done when wifi_scans2.csv is
    already present in the folder.

    :folder_path: folder containing wifi_scans.csv
    """
    target = folder_path+'/wifi_scans2.csv'
    if not os.path.isfile(target):
        scans = pd.read_csv(folder_path+'/wifi_scans.csv')
        per_time = scans[['time', 'connected']].groupby(['time'], as_index=False)
        # any() is True as soon as one entry of the group is True
        per_time.any().to_csv(target, index=False)

In [None]:
def get_closest_row(file_path, columns, dt):
    """
    finds the row in a CSV file whose time column is closest to dt

    The file is read from disk on every call.

    :file_path: CSV file location on disk; must contain a 'time' column expressed in ms
    :columns: columns to read when opening the file (in addition to 'time')
    :dt: target time as a datetime / pandas Timestamp (it is compared against the
         'time' column after that column has been converted from ms to datetime)
    :return: values of `columns` for the closest row, as numpy array
    :raises ValueError: if the file contains no rows
    """
    # read only selected CSV columns + time column
    df = pd.read_csv(file_path, header=0, usecols=['time'] + list(columns))
    if df.empty:
        raise ValueError(f"{file_path} contains no rows")
    df['time'] = pd.to_datetime(df['time'], unit='ms')  # convert from ms to date
    # stable sort, so that keep="first" really keeps the first row of the file
    # among rows sharing the same time
    df = df.sort_values('time', kind='mergesort')
    df = df.drop_duplicates(subset='time', keep="first").set_index('time')
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer is the supported way to do a nearest-neighbour lookup
    position = df.index.get_indexer([dt], method='nearest')[0]
    closest = df.iloc[[position]].values[0]
    return np.asarray(closest)

In [None]:
# context files read from every user folder, mapped to the columns extracted from each one
# (structured as file : columns)
file_dict = {
    'activities.csv': 'in_vehicle on_bicycle on_foot running still tilting walking unknown'.split(),
    'audio.csv': ('ringer_mode alarm_volume music_volume notifications_volume '
                  'ring_volume music_active speaker_on headset_connected').split(),
    'battery.csv': 'level charging'.split(),
    'display.csv': 'state rotation'.split(),
    'weather.csv': 'temp humidity pressure wind_speed wind_deg clouds rain_last3h'.split(),
    'wifi_scans2.csv': 'connected'.split(),
    'location.csv': 'label place_type'.split(),
}

In [None]:
# root folder of the dataset, the user folders are looked up inside it
data_path = 'Datasets/MDF/'

# package names excluded from the dataset:
# system apps like launcher, package manager, settings, ota...
ignored_apps = [
    'it.cnr.iit.sensapp',
    'com.android.systemui',
    'com.sec.android.app.launcher',
    'com.android.settings',
    'com.android.vending',
    'com.android.captiveportallogin',
    'com.google.android.packageinstaller',
    'com.teslacoilsw.launcher',
    'com.android.packageinstaller',
    'com.samsung.android.MtpApplication',
    'com.sec.android.emergencylauncher',
    'com.wssyncmldm',
    'com.huawei.android.launcher',
    'com.huawei.systemmanager',
    'com.asus.launcher',
    'android',
    'com.asus.ime',
    'com.asus.dm',
    'com.cyanogenmod.trebuchet',
    'org.cyanogenmod.resolver',
    'com.android.launcher3',
    'com.oneplus.ota',
    'com.samsung.android.game.gametools',
    'com.samsung.android.app.galaxyfinder',
    'com.huawei.gamebox.global',
    'com.sec.android.inputmethod',
    'com.android.phone',
    'com.samsung.android.scloud',
    'com.huawei.android.internal.app',
    'com.miui.home',
    'com.android.providers.downloads.ui',
    'com.android.printspooler',
    'com.lge.launcher3',
    'com.lge.phonemanagement',
    'com.lge.bluetoothsetting',
    'com.lge.wifisettings',
    'com.lge.homeselector',
    'com.lge.launcher2',
    'com.lge.lockscreensettings',
    'it.cnr.iit.contextlabeler',
    'com.sec.android.preloadinstaller',
    'com.android.server.telecom',
    'com.asus.powersaver',
    'com.android.stk',
    'it.cnr.iit.mymoviedb',
]

In [None]:
# create wifi_scans2.csv inside each of the 31 user folders (user_0 ... user_30)
for user in range(31):
    filter_wifi_scans(f"{data_path}user_{user}")

In [None]:
# Build the merged dataset: for every user, start from the running apps and attach
# to each app record the closest-in-time row of every context file in file_dict.
user_frames = []  # one merged dataframe per user, concatenated once after the loop

# foreach user folder; user 27 is skipped because processing its folder fails
# (the reason has not been investigated)
for user in [u for u in range(31) if u != 27]:
    print(f"working on user {user}...")
    user_dir = data_path + 'user_' + str(user)

    # read running apps dataframe and use it as a starting point
    df1 = pd.read_csv(user_dir + '/running_apps.csv', header=0)
    # ignore system apps; copy() so that the in-place edits below act on an
    # independent dataframe and not on a slice of the one just read
    df1 = df1[~df1['app'].isin(ignored_apps)].copy()
    # must run before the brand filter: brand camera/gallery apps are renamed to
    # 'camera' / 'gallery' here and therefore survive the filter below
    fix_nan_category(df1)
    df1 = df1[~df1.app.str.contains("samsung|huawei|lge|asus|xiaomi|cyanogenmod", na=False)]  # ignore brand apps
    df1 = df1[~df1.category.isnull()].copy()  # ignore apps with NaN category
    df1['time'] = pd.to_datetime(df1['time'], unit='ms')  # convert date from ms to datetime
    df1.sort_values('time', inplace=True)
    df1.drop_duplicates(subset='time', keep="first", inplace=True)
    df1.reset_index(drop=True, inplace=True)
    df1.insert(1, 'user', user)  # insert user ID column

    # foreach row in running apps dataframe find the closest row in all other
    # dataframes using datetime
    # NOTE: get_closest_row reads the CSV file on every call, so each context
    # file is parsed once per app record
    rows = []
    for dt in tqdm(df1['time']):
        row = []  # single row with all the context features
        for filename, columns in file_dict.items():  # foreach csv file in user folder
            file_path = user_dir + '/' + filename
            row.extend(get_closest_row(file_path, columns, dt).tolist())
        rows.append(row)

    # column names in the same order in which the values were appended to each row
    context_columns = [name for columns in file_dict.values() for name in columns]
    df2 = pd.DataFrame(rows, columns=context_columns)  # from list of list to dataframe
    # concat by column; both frames share the same 0..n-1 index
    df3 = pd.concat([df1, df2], axis=1)
    user_frames.append(df3)

# concat by row, once, instead of growing a dataframe at every iteration
df = pd.concat(user_frames, axis=0)
df.reset_index(drop=True, inplace=True)
print("done!")

## Extract new features

In [None]:
def daytime_from_date(date):
    """
    Part of the day a datetime falls in, based on its hour only.

    :date: object exposing an `hour` attribute
    :return: 'morning' (5-12), 'afternoon' (13-18), 'evening' (19-22) or 'night' (any other hour)
    """
    # (first hour, last hour, label), both bounds included
    day_parts = (
        (5, 12, 'morning'),
        (13, 18, 'afternoon'),
        (19, 22, 'evening'),
    )
    current_hour = date.hour
    for first_hour, last_hour, label in day_parts:
        if first_hour <= current_hour <= last_hour:
            return label
    return 'night'
    
def weekday_from_date(date):
    """
    English name of the day of the week of a date ('Monday' ... 'Sunday').

    The names are hard-coded instead of being produced by strftime("%A"), whose
    output depends on the locale of the process; is_weekend compares the result
    against the English names 'Saturday' and 'Sunday'.

    :date: object with a weekday() method returning 0 for Monday ... 6 for Sunday
           (datetime.date, datetime.datetime, pandas Timestamp)
    :return: day name as string
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    return day_names[date.weekday()]

def is_weekend(weekday:str):
    """Tell whether an English day name is 'Saturday' or 'Sunday' (exact, case-sensitive match)."""
    return weekday in ('Saturday', 'Sunday')

# Italian holiday calendar from the `holidays` package, created once and reused by is_holiday
it_holidays = holidays.Italy()

def is_holiday(date):
    """Return whether `date` is contained in the Italian holiday calendar `it_holidays`."""
    return date in it_holidays

In [None]:
# time-derived features, computed element by element from the 'time' column
timestamps = df['time']
df['daytime'] = timestamps.map(daytime_from_date)
df['weekday'] = timestamps.map(weekday_from_date)
df['is_weekend'] = df['weekday'].map(is_weekend)  # derived from the day name computed above
df['is_holiday'] = timestamps.map(is_holiday)

In [39]:
df

Unnamed: 0,time,user,item,category,in_vehicle,on_bicycle,on_foot,running,still,tilting,walking,unknown,ringer_mode,alarm_volume,music_volume,notifications_volume,ring_volume,music_active,speaker_on,headset_connected,level,charging,state,rotation,temp,humidity,pressure,wind_speed,wind_deg,clouds,rain_last3h,connected,label,place_type,daytime,weekday,is_weekend,is_holiday,temp_cold,temp_coldish,temp_hot,temp_very cold,temp_warm
0,2018-04-24 06:07:19.466,0,0,COMMUNICATION,0.08,0.09,0.14,0.08,0.23,0.0,0.14,0.31,2,0.733333,0.0,0.0,1.0,False,False,False,charged,0.0,2,0,coldish,93.0,1019.00,1.00,0.000,0.0,0.0,False,free_time,outdoors,morning,Tuesday,False,False,0,1,0,0,0
1,2018-04-24 06:12:19.487,0,1,NEWS_AND_MAGAZINES,0.08,0.07,0.35,0.07,0.09,0.0,0.35,0.28,2,0.733333,0.0,0.0,1.0,False,False,False,charged,0.0,2,0,coldish,93.0,1019.00,1.00,0.000,0.0,0.0,False,free_time,outdoors,morning,Tuesday,False,False,0,1,0,0,0
2,2018-04-24 06:17:19.502,0,2,COMMUNICATION,0.27,0.08,0.08,0.08,0.11,0.0,0.08,0.31,1,0.733333,0.0,0.0,0.0,False,False,False,quite charged,0.0,1,0,coldish,93.0,1019.00,1.00,0.000,0.0,0.0,False,free_time,outdoors,morning,Tuesday,False,False,0,1,0,0,0
3,2018-04-24 06:57:19.368,0,2,COMMUNICATION,0.07,0.08,0.13,0.05,0.39,0.0,0.13,0.21,1,0.733333,0.0,0.0,0.0,False,False,False,quite charged,0.0,1,0,coldish,93.0,1019.00,1.00,0.000,0.0,0.0,False,external_school,workplace,morning,Tuesday,False,False,0,1,0,0,0
4,2018-04-24 07:42:19.457,0,3,MUSIC_AND_AUDIO,0.16,0.10,0.10,0.07,0.18,0.0,0.10,0.28,1,0.733333,0.0,0.0,0.0,False,False,False,quite charged,0.0,1,0,coldish,93.0,1019.00,1.00,0.000,0.0,0.0,True,external_school,workplace,morning,Tuesday,False,False,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26948,2018-05-28 11:24:19.348,30,7,SOCIAL,0.10,0.01,0.04,0.00,0.72,0.0,0.04,0.03,2,1.000000,0.6,0.0,0.0,False,False,False,half charged,0.0,1,0,hot,87.0,1019.26,2.87,231.502,68.0,0.0,False,workplace,workplace,morning,Monday,False,False,0,0,1,0,0
26949,2018-05-28 11:29:19.366,30,7,SOCIAL,0.00,0.00,0.00,0.00,1.00,0.0,0.00,0.00,2,1.000000,0.6,0.0,0.0,False,False,False,half charged,0.0,1,0,hot,87.0,1019.26,2.87,231.502,68.0,0.0,False,workplace,workplace,morning,Monday,False,False,0,0,1,0,0
26950,2018-05-28 12:11:22.297,30,2,COMMUNICATION,0.00,0.00,0.00,0.00,1.00,0.0,0.00,0.00,2,1.000000,0.6,0.0,0.0,False,False,False,half charged,0.0,2,0,hot,87.0,1019.26,2.87,231.502,68.0,0.0,False,workplace,workplace,morning,Monday,False,False,0,0,1,0,0
26951,2018-05-28 12:17:08.615,30,7,SOCIAL,0.00,0.00,0.00,0.00,1.00,0.0,0.00,0.00,2,1.000000,0.6,0.0,0.0,False,False,False,half charged,0.0,2,0,hot,87.0,1019.26,2.87,231.502,68.0,0.0,False,workplace,workplace,morning,Monday,False,False,0,0,1,0,0


## Encoding
### Fix labels

**place type**: group similar labels under a more general label (e.g. restaurant, bar, cafe and food under the food_and_drink label)

In [None]:
# general place_type label -> original labels that are folded into it
place_type_groups = {
    'food_and_drink': ['restaurant', 'bar', 'cafe', 'food'],
    'outdoors': ['route', 'street', 'park', 'tourist_attraction'],
    'public_transport_station': ['transit_station', 'bus_station', 'taxi_stand'],
    'store': ['supermarket', 'home_goods', 'bakery', 'shopping_mall', 'library', 'book_store', 'florist'],
    'health': ['health', 'doctor'],
    'service': [
        'finance', 'gas_station', 'general_contractor', 'bank', 'premise', 'lawyer',
        'insurance_agency', 'hair_care', 'city_hall', 'plumber', 'pharmacy', 'police',
        'veterinary', 'laundry', 'place_of_worship', 'university', 'moving_company',
        'post_office', 'car_repair', 'real_estate_agency', 'painter', 'car_wash',
        'local_government_office', 'beauty_salon', 'electrician', 'car_rental',
        'funeral_home', 'fire_station', 'travel_agency',
    ],
}
# applied in the order listed above; labels not listed in any group are left as they are
for general_label, original_labels in place_type_groups.items():
    df.loc[df['place_type'].isin(original_labels), 'place_type'] = general_label

**category**: group all GAME subcategories under the GAME label

In [None]:
# remove stray whitespace around every category name; this covers the known case
# ' COMMUNICATION' (space at the beginning) as well as any similar one
df['category'] = df['category'].str.strip()
# every category whose name contains 'GAME' becomes plain 'GAME';
# na=False keeps rows with a missing category out of the mask instead of failing
df.loc[df['category'].str.contains('GAME', na=False), 'category'] = 'GAME'

### App
Convert **app** from package name to unique IDs and rename to item

In [None]:
# replace each package name with an integer ID (one ID per distinct name, numbered in
# order of first appearance), then rename the column to 'item'
app_ids, _app_names = pd.factorize(df['app'])
df['app'] = app_ids
df = df.rename(columns={'app': 'item'})

### Activities
**in_vehicle, on_bicycle, on_foot, running, still, tilting, walking, unknown** represent the probability, from 0 to 100, that the user is doing that activity. We rescale these features to the [0, 1] range by dividing them by 100.

In [None]:
# activity columns, rescaled by dividing every value by 100
activities = ['in_vehicle', 'on_bicycle', 'on_foot', 'running',
              'still', 'tilting', 'walking', 'unknown']
df[activities] = df[activities] / 100

### Battery
Battery **level** goes from 0 to 1, where 1 is fully charged; we encode it as a categorical variable

In [None]:
def get_battery_status(lv):
    """
    Turn a battery level into a categorical status.

    :lv: battery level, multiplied by 100 before being compared with the thresholds
    :return: 'charged' (>= 80), 'quite charged' (>= 60), 'half charged' (>= 40),
             'low' (>= 20) or 'very low' (anything else, including values that
             fail every comparison such as NaN)
    """
    percent = lv * 100
    # thresholds in decreasing order: the first one reached decides the status
    status_floors = ((80, 'charged'), (60, 'quite charged'), (40, 'half charged'), (20, 'low'))
    for floor, status in status_floors:
        if percent >= floor:
            return status
    return 'very low'

# replace the numeric battery level with its categorical status, value by value
df['level'] = df['level'].map(get_battery_status)

### Weather
- Temperature **temp** is encoded as a categorical variable
- **humidity, pressure, wind_speed, wind_deg** are normalized
- **clouds**, **rain_last3h**: these are numeric values, so it is still unclear how they should be encoded (TODO)

In [None]:
def get_temperature(degree):
    """
    Turn a temperature into a categorical label.

    :degree: temperature value
    :return: 'very_cold' (<= 5), 'cold' (<= 10), 'coldish' (<= 15), 'warm' (<= 20),
             'hot' (<= 30) or 'very_hot' (anything else, including values that
             fail every comparison such as NaN)
    """
    # upper bounds in increasing order: the first one not exceeded decides the label
    label_ceilings = ((5, 'very_cold'), (10, 'cold'), (15, 'coldish'), (20, 'warm'), (30, 'hot'))
    for ceiling, label in label_ceilings:
        if degree <= ceiling:
            return label
    return 'very_hot'
        
# replace the numeric temperature with its label, then append one indicator column
# per label (named 'temp_<label>') to the right of the existing columns
df['temp'] = df['temp'].map(get_temperature)
temp_dummies = pd.get_dummies(df['temp'], prefix='temp')
df = pd.concat([df, temp_dummies], axis=1)

## Plots

In [None]:
# bar chart with the number of records for each place_type value
df['place_type'].value_counts().plot(kind='bar', figsize=(20,5))


In [None]:
# bar chart with the number of records for each app category
df['category'].value_counts().plot(kind='bar', figsize=(20,5))

In [None]:
# bar chart with the number of records for each location label
df['label'].value_counts().plot(kind='bar', figsize=(20,5))

## Save to CSV

In [None]:
# write the final dataframe to MDF_final.csv in the working directory, without the index column
df.to_csv('MDF_final.csv', index=False)