# MDF dataset

In [52]:
import pandas as pd
import numpy as np
import pprint
import os
from tqdm import tqdm
import holidays
from sklearn.preprocessing import MinMaxScaler
import random

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 20

## Merge datasets

In [53]:
def fix_nan_category(df):
    """
    System apps like camera and gallery have NaN category, and different name for the same app (ex. samsung camera and huawei camera)
    This function fix the category and assign a common name to system apps
    """
    df.loc[df['app'].str.contains('camera'), 'category'] = 'PHOTOGRAPHY' # change category from NaN
    df.loc[df['app'].str.contains('camera'), 'app'] = 'camera'  # change app name, all camera apps from various brands are equivalent
    
    df.loc[df['app'].str.contains('com.android.incallui'), 'category'] = 'COMMUNICATION' # incallui is the interface during a call
    
    df.loc[df['app'].str.contains('mail'), 'category'] = 'PRODUCTIVITY'
    df.loc[df['app'].str.contains('com.google.android.gm'), 'category'] = 'PRODUCTIVITY' # change gmail category from communication to productivity
    
    df.loc[df['app'].str.contains('gallery'), 'category'] = 'PHOTOGRAPHY' # change category from NaN
    df.loc[df['app'].str.contains('gallery'), 'app'] = 'gallery'  # change app name, all gallery apps from various brands are equivalent

In [54]:
def filter_wifi_scans(folder_path):
    """
    Opens wifi_scans.csv file
    group by time and assign true to a group only if there is at least one row with connected == true
    skips the process if the file wifi_scans2.csv already exists
    """
    if os.path.isfile(folder_path+'/wifi_scans2.csv'):
        return
    a = pd.read_csv(folder_path+'/wifi_scans.csv')
    b = a[['time', 'connected']].groupby(['time'], as_index=False).any() # any() returns true if at least one entry is true
    b.to_csv(folder_path+'/wifi_scans2.csv', index=False)

In [55]:
def get_closest_row(file_path, columns, dt):
    """
    finds the row in a dataframe whose time column is closest to dt

    :file_path: CSV file location on disk
    :columns: columns to read when opening the file
    :dt: time in ms
    :return: closest row as numpy array
    """ 
    df = pd.read_csv(file_path, header=0, usecols=['time']+columns) # read only selected CSV columns + time column
    df['time'] = pd.to_datetime(df['time'], unit='ms') # convert from ms to date
    df.sort_values('time', inplace=True)
    df.drop_duplicates(subset='time', keep="first", inplace=True)
    df.set_index('time', inplace=True)
    closest = df.iloc[[df.index.get_loc(dt, method='nearest')]].values[0] # find nearest row to time dt
    return np.asarray(closest)

In [56]:
# dictionary structured as file : columns
file_dict = {'activities.csv': ['in_vehicle', 'on_bicycle', 'on_foot', 'running', 'still', 'tilting', 'walking', 'unknown'], 
             'audio.csv': ['ringer_mode', 'alarm_volume', 'music_volume', 'notifications_volume', 'ring_volume', 'music_active', 'speaker_on', 'headset_connected'],
             'battery.csv': ['level', 'charging'],
             'display.csv': ['state', 'rotation'],
             'weather.csv': ['temp', 'humidity', 'pressure', 'wind_speed', 'wind_deg',  'clouds', 'rain_last3h'],
             'wifi_scans2.csv': ['connected'],
             'location.csv': ['label', 'place_type']}

In [57]:
data_path = 'Datasets/MDF/'
# system apps like launcher,package manager, settings, ota...
ignored_apps = """it.cnr.iit.sensapp com.android.systemui com.sec.android.app.launcher com.android.settings com.android.vending
                  com.android.captiveportallogin com.google.android.packageinstaller com.teslacoilsw.launcher com.android.packageinstaller
                  com.samsung.android.MtpApplication com.sec.android.emergencylauncher com.wssyncmldm com.huawei.android.launcher
                  com.huawei.systemmanager com.asus.launcher android com.asus.ime com.asus.dm com.cyanogenmod.trebuchet
                  org.cyanogenmod.resolver com.android.launcher3 com.oneplus.ota com.samsung.android.game.gametools
                  com.samsung.android.app.galaxyfinder com.huawei.gamebox.global com.sec.android.inputmethod com.android.phone 
                  com.samsung.android.scloud com.huawei.android.internal.app com.miui.home com.android.providers.downloads.ui
                  com.android.printspooler com.lge.launcher3 com.lge.phonemanagement com.lge.bluetoothsetting com.lge.wifisettings
                  com.lge.homeselector com.lge.launcher2 com.lge.lockscreensettings it.cnr.iit.contextlabeler
                  com.sec.android.preloadinstaller com.android.server.telecom com.asus.powersaver com.android.stk
                  it.cnr.iit.mymoviedb """.split() 

In [58]:
for user in range(31): # foreach user folder
    user_dir = data_path + 'user_' + str(user)
    filter_wifi_scans(user_dir)

In [59]:
df = pd.DataFrame()   
for user in list(range(0,27)) + list(range(28,31)): # foreach user folder, skip user 27 it doesn't works for some reasons
    print(f"working on user {user}...")
    user_dir = data_path + 'user_' + str(user)
    
    df1 = pd.read_csv(user_dir + '/running_apps.csv', header=0) # read running apps dataframe and use it as a starting point
    df1 = df1[~df1['app'].isin(ignored_apps)]  # ignore system apps
    fix_nan_category(df1)  # fix gallery, camera...
    df1 = df1[~df1.app.str.contains("samsung|huawei|lge|asus|xiaomi|cyanogenmod")] # ignore brand apps
    df1 = df1[~df1.category.isnull()]  # ignore apps with NaN category
    df1['time'] = pd.to_datetime(df1['time'], unit='ms') # convert date from ms to datetime
    df1.sort_values('time', inplace=True)
    # df1.drop_duplicates(subset='time', keep="first", inplace=True) # drop time duplicate
    df1.reset_index(drop=True, inplace=True)
    df1.insert(1,'user',user) # insert user ID column
    
    rows = []
    for dt in tqdm(df1['time']): # foreach row in running apps dataframe find the closest row in all other dataframes using datetime
        row = []
        for filename, columns in file_dict.items(): # foreach csv file in user folder
            file_path = user_dir + '/' + filename
            row = row + get_closest_row(file_path, columns, dt).tolist() # single row with all the context features
        rows.append(row)

    df2 = pd.DataFrame(rows, columns=np.concatenate(list(file_dict.values()))) # from list of list to dataframe
    df3 = pd.concat([df1, df2], axis=1) # concat by column
    df = pd.concat([df, df3], axis=0) # concat by row
    
df.reset_index(drop=True, inplace=True)
print("done!")

  8%|██████▉                                                                            | 1/12 [00:00<00:01,  9.93it/s]

working on user 0...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 22.43it/s]
  0%|                                                                                          | 0/918 [00:00<?, ?it/s]

working on user 1...


100%|████████████████████████████████████████████████████████████████████████████████| 918/918 [01:20<00:00, 11.35it/s]
  1%|▋                                                                                 | 3/356 [00:00<00:13, 25.44it/s]

working on user 2...


100%|████████████████████████████████████████████████████████████████████████████████| 356/356 [00:14<00:00, 25.12it/s]
  0%|                                                                                 | 2/2557 [00:00<02:33, 16.67it/s]

working on user 3...


100%|██████████████████████████████████████████████████████████████████████████████| 2557/2557 [02:40<00:00, 15.93it/s]
  0%|                                                                                 | 1/3879 [00:00<06:28,  9.97it/s]

working on user 4...


100%|██████████████████████████████████████████████████████████████████████████████| 3879/3879 [05:34<00:00, 11.60it/s]
  0%|                                                                                 | 2/2746 [00:00<03:11, 14.30it/s]

working on user 5...


100%|██████████████████████████████████████████████████████████████████████████████| 2746/2746 [03:34<00:00, 12.79it/s]
  0%|▎                                                                                 | 3/879 [00:00<00:48, 18.18it/s]

working on user 6...


100%|████████████████████████████████████████████████████████████████████████████████| 879/879 [00:44<00:00, 19.86it/s]
  0%|                                                                                 | 2/1330 [00:00<01:27, 15.17it/s]

working on user 7...


100%|██████████████████████████████████████████████████████████████████████████████| 1330/1330 [01:33<00:00, 14.27it/s]
  0%|                                                                                 | 1/2992 [00:00<05:56,  8.39it/s]

working on user 8...


100%|██████████████████████████████████████████████████████████████████████████████| 2992/2992 [06:01<00:00,  8.27it/s]
  2%|█▉                                                                                | 3/131 [00:00<00:04, 28.82it/s]

working on user 9...


100%|████████████████████████████████████████████████████████████████████████████████| 131/131 [00:04<00:00, 28.80it/s]
  0%|                                                                                 | 2/1793 [00:00<01:52, 15.95it/s]

working on user 10...


100%|██████████████████████████████████████████████████████████████████████████████| 1793/1793 [01:56<00:00, 15.41it/s]
  0%|▏                                                                                | 2/1143 [00:00<01:08, 16.59it/s]

working on user 11...


100%|██████████████████████████████████████████████████████████████████████████████| 1143/1143 [01:10<00:00, 16.14it/s]
  0%|                                                                                 | 1/4056 [00:00<08:29,  7.96it/s]

working on user 12...


100%|██████████████████████████████████████████████████████████████████████████████| 4056/4056 [07:18<00:00,  9.24it/s]
  1%|▍                                                                                 | 2/389 [00:00<00:20, 18.77it/s]

working on user 13...


100%|████████████████████████████████████████████████████████████████████████████████| 389/389 [00:18<00:00, 21.04it/s]
  1%|▍                                                                                 | 3/539 [00:00<00:24, 21.54it/s]

working on user 14...


100%|████████████████████████████████████████████████████████████████████████████████| 539/539 [00:32<00:00, 16.82it/s]
  1%|█                                                                                 | 3/240 [00:00<00:10, 22.44it/s]

working on user 15...


100%|████████████████████████████████████████████████████████████████████████████████| 240/240 [00:10<00:00, 23.48it/s]
  0%|                                                                                 | 2/1925 [00:00<02:18, 13.84it/s]

working on user 16...


100%|██████████████████████████████████████████████████████████████████████████████| 1925/1925 [02:10<00:00, 14.71it/s]
  0%|                                                                                 | 1/5624 [00:00<11:11,  8.37it/s]

working on user 17...


100%|██████████████████████████████████████████████████████████████████████████████| 5624/5624 [09:33<00:00,  9.80it/s]
  1%|▍                                                                                 | 2/365 [00:00<00:18, 19.65it/s]

working on user 18...


100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:14<00:00, 24.38it/s]
  0%|                                                                                         | 0/1994 [00:00<?, ?it/s]

working on user 19...


100%|██████████████████████████████████████████████████████████████████████████████| 1994/1994 [02:36<00:00, 12.75it/s]
  0%|                                                                                 | 1/3218 [00:00<05:32,  9.67it/s]

working on user 20...


100%|██████████████████████████████████████████████████████████████████████████████| 3218/3218 [05:10<00:00, 10.35it/s]
  0%|▏                                                                                | 2/1039 [00:00<00:57, 18.11it/s]

working on user 21...


100%|██████████████████████████████████████████████████████████████████████████████| 1039/1039 [00:58<00:00, 17.63it/s]
  0%|▎                                                                                 | 2/448 [00:00<00:22, 19.63it/s]

working on user 22...


100%|████████████████████████████████████████████████████████████████████████████████| 448/448 [00:21<00:00, 21.03it/s]
  0%|▏                                                                                 | 2/969 [00:00<00:48, 19.94it/s]

working on user 23...


100%|████████████████████████████████████████████████████████████████████████████████| 969/969 [00:46<00:00, 20.73it/s]
  0%|                                                                                 | 1/3206 [00:00<07:49,  6.82it/s]

working on user 24...


100%|██████████████████████████████████████████████████████████████████████████████| 3206/3206 [05:53<00:00,  9.08it/s]
  0%|                                                                                 | 2/1336 [00:00<01:46, 12.47it/s]

working on user 25...


100%|██████████████████████████████████████████████████████████████████████████████| 1336/1336 [01:42<00:00, 13.07it/s]
  0%|▎                                                                                 | 3/782 [00:00<00:37, 20.50it/s]

working on user 26...


100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [00:33<00:00, 23.63it/s]
  0%|                                                                                         | 0/2815 [00:00<?, ?it/s]

working on user 28...


100%|██████████████████████████████████████████████████████████████████████████████| 2815/2815 [04:17<00:00, 10.94it/s]
  0%|                                                                                 | 2/2808 [00:00<02:55, 16.01it/s]

working on user 29...


100%|██████████████████████████████████████████████████████████████████████████████| 2808/2808 [02:52<00:00, 16.29it/s]
  0%|                                                                                         | 0/2529 [00:00<?, ?it/s]

working on user 30...


100%|██████████████████████████████████████████████████████████████████████████████| 2529/2529 [04:02<00:00, 10.41it/s]

done!





In [60]:
df.to_csv('MDF_not_encoded.csv', index=False)

df = pd.read_csv('MDF_not_encoded.csv')
df['time'] = pd.to_datetime(df['time'])

## Extract new features

In [None]:
def daytime_from_date(date):
    hour = date.hour
    if hour >= 5 and hour <= 12:
        return 'morning'
    elif hour >= 13 and hour <= 18:
        return 'afternoon'
    elif hour >= 19 and hour <= 22:
        return 'evening'
    else:
        return 'night'
    
def weekday_from_date(date):
    return date.strftime("%A")

def is_weekend(weekday:str):
    return True if weekday == 'Saturday' or weekday == 'Sunday' else False

it_holidays = holidays.Italy()

def is_holiday(date):
    return date in it_holidays

In [None]:
df['daytime'] = df['time'].apply(daytime_from_date)
df['weekday'] = df['time'].apply(weekday_from_date)
df['is_weekend'] = df['weekday'].apply(is_weekend)
df['is_holiday'] = df['time'].apply(is_holiday)

## Encoding
### Fix labels

**place type**: group similar labels under a more general labels (es. food, restaurant and bar under food label)

In [62]:
df.loc[df['place_type'].isin(['restaurant', 'bar', 'cafe', 'food']), 'place_type'] = 'food_and_drink'
df.loc[df['place_type'].isin(['route', 'street', 'park', 'tourist_attraction']), 'place_type'] = 'outdoors'
df.loc[df['place_type'].isin(['transit_station', 'bus_station', 'taxi_stand']), 'place_type'] = 'public_transport_station'
df.loc[df['place_type'].isin(['supermarket', 'home_goods', 'bakery', 'shopping_mall', 'library', 'book_store', 'florist']), 'place_type'] = 'store'
df.loc[df['place_type'].isin(['health', 'doctor']), 'place_type'] = 'health'
df.loc[df['place_type'].isin(['finance', 'gas_station', 'general_contractor', 'bank', 'premise', 'lawyer', 'insurance_agency', 'hair_care', 'city_hall', 'plumber', 'pharmacy', 'police', 'veterinary', 'laundry', 'place_of_worship', 'university', 'moving_company', 'post_office', 'car_repair', 'real_estate_agency', 'painter', 'car_wash', 'local_government_office', 'beauty_salon', 'electrician', 'car_rental', 'funeral_home', 'fire_station', 'travel_agency']), 'place_type'] = 'service'

**category**: group all GAME subcategories under GAME label

In [63]:
df.loc[df['category'].str.contains('GAME'), 'category'] = 'GAME'
df.loc[df['category'].isin([' COMMUNICATION']), 'category'] = 'COMMUNICATION' # fix communication category with space at the beginning

### App
Convert **app** from package name to unique IDs and rename to item

In [64]:
df.app = pd.factorize(df.app)[0]
df = df.rename(columns={'app': 'item'})

### Category
**Category** is one hot encoded

In [None]:
df = pd.concat((df, pd.get_dummies(df['category'], prefix='category')), axis=1)
df.pop('category')

### Activities
**in_vehicle, on_bicycle, on_foot, running, still, tilting, walking, unknown** represent the probability from 0 to 100 that the user is doing an activity. These features are normalized

In [None]:
activities = 'in_vehicle on_bicycle on_foot running still tilting walking unknown'.split()
df[activities] = df[activities].apply(lambda x: x/100)

### Volume
- **ringer_mode** is one hot encoded
- **alarm_volume, music_volume, notifications_volume, ring_volume, music_active, speaker_on, headset_connected** are already normalized

In [None]:
df = pd.concat((df, pd.get_dummies(df['ringer_mode'], prefix='ringer_mode')), axis=1)
df.pop('ringer_mode')

### Battery
- Battery **level** goes from 0 to 1, where 1 is full charged, it is converted to a categorical variable and then one-hot encoded
- **charging** is boolean

In [None]:
def get_battery_status(lv):
    lv = lv * 100
    if lv >= 80:
        return 'charged'
    elif lv >= 60 and lv < 80:
        return 'quite charged'
    elif lv >= 40 and lv < 60:
        return 'half charged'
    elif lv >= 20 and lv < 40:
        return 'low'
    else:
        return 'very low'

df['level'] = df['level'].apply(get_battery_status)
df = pd.concat((df, pd.get_dummies(df['level'], prefix='battery')), axis=1)
df.pop('level')

### Display
- **state** can be 1,2,3,4
- **rotation** can be 0,1,3

Both variables are one hot encoded

In [None]:
df = pd.concat((df, pd.get_dummies(df['state'], prefix='display_state')), axis=1)
df = pd.concat((df, pd.get_dummies(df['rotation'], prefix='display_rotation')), axis=1)
df.pop('state')
df.pop('rotation')

### Weather
- **temp**, **humidity, pressure, wind_speed, wind_deg**, **clouds** are normalized
- **rain_last3h** is transformed into a boolean

In [None]:
df['rain_last3h'] = df['rain_last3h'].apply(lambda x: 1 if x > 0 else 0) # true if it rained

cols_to_norm = 'temp humidity pressure wind_speed wind_deg clouds'.split()
df[cols_to_norm] = MinMaxScaler().fit_transform(df[cols_to_norm])

### Place and date
**place_type, daytime, weekday** are one hot encoded

In [None]:
cols = 'place_type daytime weekday'.split()
for e in cols:
    df = pd.concat((df, pd.get_dummies(df[e], prefix=e)), axis=1)
    df.pop(e)

### Boolean to int

In [None]:
for col in 'music_active speaker_on headset_connected connected is_weekend is_holiday'.split():
    x[col] = x[col].astype(int)

### Add rating

In [65]:
df.pop('time')
df['rating'] = 1

## Negative sampling

In [66]:
neg_df = pd.DataFrame(columns=df.columns)

all_labels = df.label.unique() # all possible context of a single user

items_labels = {} # dictionary that contains in which contexts an item has been used
for item in df.item.unique():
    items_labels[item] = df[df.item == item]['label'].unique()
    
for index, row in df.iterrows():
    item = row['item']
    pos_labels = items_labels[item]  # contexts in which an item has been used
    neg_labels = list(set(all_labels) - set(pos_labels))  # contexts in which an item has NOT been used
    for neg in neg_labels: # generate a new negative sample foreach negative label
        neg_context = df.loc[(df.item != item) & (df.label == neg)].sample(n=1) # take a random item with negative context
        neg_context = neg_context.iloc[:, 4:] # keep only the context
        item_row = pd.DataFrame(row.iloc[0:4]).transpose() # take user, item, rating
        item_row.reset_index(drop=True, inplace=True) # reset index for concat
        neg_context.reset_index(drop=True, inplace=True)
        neg_row = pd.concat([item_row, neg_context], axis=1)
        neg_row.rating = 0
        neg_df = neg_df.append(neg_row)   

df = df.append(neg_df)
df.sort_values(by=['user'], inplace=True)
df.reset_index(drop=True, inplace=True)
# df.pop('label')

In [67]:
df.to_csv('MDF_not_encoded_neg_sampled.csv')

## Save to CSV

In [None]:
df.to_csv('MDF_final.csv', index=False)