In [2]:
# Imports and display configuration for the whole notebook.
from spmf import Spmf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import ast
# Route DataFrame/Series .plot() calls through plotly.
pd.options.plotting.backend = "plotly"
import plotly.express as px
# from chart_studio import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.subplots as sp
# Do not truncate long cell contents when frames are displayed.
pd.options.display.max_colwidth = None
# Show at most 70 rows when a frame is displayed.
pd.set_option('display.max_rows', 70)

First, the full data set with sleep times is read.

In [3]:
# Read the full data set, including the sleep times, into `data`.
data = pd.read_csv(filepath_or_buffer="sorted_coded_sleep_time.csv")

## Defining sessions with 293 seconds sleep threshold

In [5]:
# Sleep time (seconds) above which a row opens a new session for its user.
SESSION_SLEEP_THRESHOLD = 293

# The threshold test is purely row-wise, so no grouping is needed. A grouped
# apply here returns a result indexed by (user_id, row) on pandas 2.x, which
# no longer aligns with `data` on assignment.
data['new_session'] = data['sleep_time'] > SESSION_SLEEP_THRESHOLD

# Running count of session starts per user gives each row its session number.
data['session'] = data.groupby(['user_id'])['new_session'].cumsum().astype('int')

## Merge repeating apps within same session

In [6]:
# Columns that identify one session of one user.
session_keys = ["user_id", "session"]

# App on the previous / next row of the same session (NaN at the session edges).
data["previous_app_code"] = data.groupby(session_keys)["app_code"].shift(1)
next_app_code = data.groupby(session_keys)["app_code"].shift(-1)

# Keep only rows that sit in a run of identical consecutive apps: the row
# repeats its predecessor, or is repeated by its successor.
in_repeat_run = (data["previous_app_code"] == data["app_code"]) | (data["app_code"] == next_app_code)

# Filter first and copy once: the original labels of `data` are preserved, the
# full-frame copy is avoided, and the column assignments below act on an
# independent frame instead of a slice.
data2 = data[in_repeat_run].copy()

# A run starts wherever the app differs from the previous one; the running
# count of run starts per session numbers the events.
data2['new_event'] = data2['previous_app_code'] != data2['app_code']
data2['event'] = data2.groupby(session_keys)['new_event'].cumsum().astype('int')

Finding the index of the max end time for each event, where an event refers to a group of identical apps that directly follow each other in the usage sequence within the same session.

In [7]:
# Columns that together identify one event.
event_keys = ['user_id', 'session', 'event']

# Latest end_time observed within each event.
data2_grouped = (
    data2.groupby(event_keys)
    .agg({'end_time': 'max'})
    .reset_index()
    .rename(columns={'end_time': 'event_end_time'})
)

# Attach the per-event maximum to every row, then keep the rows that reach it.
data3 = pd.merge(data2, data2_grouped, how='left', on=event_keys)
data3 = data3.loc[data3['end_time'] == data3['event_end_time']]

# Several rows of one event can share the same end time (apps closed at once);
# keep the last of them so there is exactly one row per event.
data3 = data3.drop_duplicates(subset=event_keys, keep='last')

# The merged frame is numbered 0..n-1 in data2's row order, so data3.index
# holds positions in data2; translate them back to data2's own labels.
max_index = data2.iloc[data3.index].index

Similarly finding the index of the min start time for each event.

In [8]:
# Columns that together identify one event.
event_keys = ['user_id', 'session', 'event']

# Earliest start_time observed within each event.
data2_grouped = (
    data2.groupby(event_keys)
    .agg({'start_time': 'min'})
    .reset_index()
    .rename(columns={'start_time': 'event_start_time'})
)

# Attach the per-event minimum to every row, then keep the rows that reach it.
data3 = pd.merge(data2, data2_grouped, how='left', on=event_keys)
data3 = data3.loc[data3['start_time'] == data3['event_start_time']]

# Several rows of one event can share the same start time (apps opened at once);
# keep the last of them so there is exactly one row per event.
data3 = data3.drop_duplicates(subset=event_keys, keep='last')

# The merged frame is numbered 0..n-1 in data2's row order, so data3.index
# holds positions in data2; translate them back to data2's own labels.
min_index = data2.iloc[data3.index].index

Now the observations can be merged.

In [9]:
# Collapse every event (run of identical consecutive apps) into a single row.
# Start from a copy so that `data` itself is left unchanged.
data3=data.copy()
# The row kept for each event (labels in max_index) receives the start_time of
# the event's row in min_index. set_index(max_index) relabels the min rows
# positionally, so the two indexes are paired entry by entry.
# NOTE(review): this presumes min_index and max_index list the events in the
# same order - confirm.
data3.loc[max_index,"start_time"]=data3.loc[min_index].set_index(max_index).start_time
# Remove every row of data2 except the one kept per event.
data3=data3.drop(set(data2.index)-set(max_index))
# Sanity check shown as the cell output: the number of rows removed must equal
# len(data2) - len(max_index).
len(data.index)-(len(data2.index)-len(max_index))==len(data3.index)

True

Finally, the different measures can be updated.

In [10]:
# Duration of each (merged) observation.
data3['time_spend'] = data3['end_time'] - data3['start_time']

# The helper column used for run detection is no longer needed.
data3 = data3.drop(columns='previous_app_code')

# Sleep time = gap between this row's start and the previous row's
# sequence_end_time; the very first row is compared against 0.
# NOTE(review): the shift is not grouped by user_id, so the first row of each
# user is compared with the last row of the preceding user - confirm intended.
previous_sequence_end = data3['sequence_end_time'].shift(1, fill_value=0)
data3['sleep_time'] = data3['start_time'] - previous_sequence_end

# Negative gaps are replaced by zero.
data3.loc[data3['sleep_time'] < 0, 'sleep_time'] = 0

# Renumber the rows 0..n-1 after the drops above.
data3 = data3.reset_index(drop=True)

In [11]:
# Persist the merged observations (row index is not written).
data3.to_csv(path_or_buf="sorted_coded_session_293s_merged.csv", index=False)

In [5]:
# Reload the merged observations from the file written by the cell above.
# NOTE(review): the recorded output of this cell is a MemoryError; explicit
# dtypes or a column subset may be needed to load it - confirm the schema.
data3 = pd.read_csv(filepath_or_buffer="sorted_coded_session_293s_merged.csv")

MemoryError: Unable to allocate 1.70 GiB for an array with shape (228760319,) and data type int64

## Removing infrequent apps

### Calculating individual infrequency
In this section, infrequency is determined by applying an infrequency threshold to each user individually.

The first step is to get the individual app counts.

In [None]:
# Number of rows per (user, app) pair, as a flat frame with the columns
# user_id, app_code and number_of_app_occurences.
app_count = (
    data3.groupby(["user_id", "app_code"])
    .agg({'package_name': 'size'})
    .reset_index()
    .rename(columns={'package_name': 'number_of_app_occurences'})
)

In [None]:
# Total number of rows per user, as a flat frame with the columns
# user_id and number_of_observations.
test = (
    data3.groupby(["user_id"])
    .agg({'package_name': 'size'})
    .reset_index()
    .rename(columns={'package_name': 'number_of_observations'})
)

In [None]:
# Put each user's total next to the per-app counts so a share can be computed.
app_count = app_count.merge(test, on=['user_id'], how='left')

In [None]:
# Persist the per-user app counts (row index is not written).
# The original line had `.index=False` attached to the file name with a dot
# instead of a comma, which is a syntax error.
app_count.to_csv("app_count_individual.csv", index=False)

In [None]:
# Infrequency threshold, expressed in percent: it is compared below against
# (app occurrences / user observations) * 100.
frequency_threshold=0.05

In [None]:
# Share (in percent) of each user's observations taken up by each app.
app_share_pct = app_count['number_of_app_occurences'] / app_count['number_of_observations'] * 100

# For every user, the array of app codes whose share is below the threshold.
# Users without any such app do not appear in the result.
below_threshold = app_count[app_share_pct < frequency_threshold]
infrequent_apps = below_threshold.groupby('user_id')['app_code'].unique()

In [None]:
# Flag every row whose (user_id, app_code) pair is listed in `infrequent_apps`.
# A row-wise DataFrame.apply runs a Python lambda once per row; the same
# membership test is done here with a single vectorised left merge.

# One row per infrequent (user_id, app_code) pair.
infrequent_pairs = infrequent_apps.explode().rename("app_code").reset_index()
# explode() yields object dtype; cast the keys back so they join cleanly.
infrequent_pairs = infrequent_pairs.astype(
    {"user_id": data3["user_id"].dtype, "app_code": data3["app_code"].dtype}
)

# The left merge keeps data3's row order; `validate` guarantees that no row is
# duplicated, so the result lines up with data3 position by position.
pair_lookup = data3[["user_id", "app_code"]].merge(
    infrequent_pairs,
    how="left",
    on=["user_id", "app_code"],
    indicator=True,
    validate="many_to_one",
)

# Assign positionally (to_numpy) so data3's own index plays no role.
data3["infrequent_app"] = (pair_lookup["_merge"] == "both").to_numpy()

In [None]:
# Persist the observations together with their infrequent_app flag.
data3.to_csv(
    path_or_buf="sorted_coded_session_293s_merged_infrequent_0_05_apps_calculated.csv",
    index=False,
)

Now that it is determined if each app is infrequent, the infrequent apps can be removed. Here they are removed directly without changing the session.

In [12]:
# Keep only the rows that are NOT flagged as infrequent. Vectorised negation
# replaces the per-element Python lambda; astype(bool) gives the same
# truthiness that `not` would apply to each value.
data3 = data3[~data3["infrequent_app"].astype(bool)]

In [13]:
# Persist the observations after removing infrequent apps.
# NOTE(review): the file name says 0_01 while frequency_threshold is set to
# 0.05 in this notebook - confirm which threshold produced this file.
data3.to_csv(
    path_or_buf="sorted_coded_session_293s_merged_infrequent_0_01_apps_removed.csv",
    index=False,
)

We can now take away any sessions of length 1.

In [14]:
# Number of rows in each (user_id, session), broadcast back onto every row.
# transform('size') avoids building a second copy of the whole frame through a
# self-merge, and re-running the cell overwrites the column instead of
# creating suffixed duplicates.
session_sizes = data3.groupby(['user_id', 'session'])['app_code'].transform('size')

# reset_index keeps the 0..n-1 row numbering that the merge used to produce.
data3 = data3.assign(session_length=session_sizes).reset_index(drop=True)

In [15]:
# Persist the observations together with their session_length.
# NOTE(review): the file name says 0_01 while frequency_threshold is set to
# 0.05 in this notebook - confirm which threshold produced this file.
data3.to_csv(
    path_or_buf="sorted_coded_session_293s_infrequent_0_01_apps_removed_with_length.csv",
    index=False,
)

In [19]:
# Total number of rows left after removing infrequent apps.
print("number of observation", len(data3))

number of observation 228418079


In [21]:
# Count the rows that belong to sessions holding more than one row, without
# materialising the filtered frame.
rows_in_longer_sessions = int((data3['session_length'] > 1).sum())
print("number of observation form sessions with more than one app", rows_in_longer_sessions)

number of observation form sessions with more than one app 196524065


In [24]:
# Discard every row that belongs to a session of length 1.
data3 = data3.loc[data3['session_length'] > 1]

In [25]:
# Persist the observations after removing sessions of length 1.
# NOTE(review): the file name says 0_01 while frequency_threshold is set to
# 0.05 in this notebook - confirm which threshold produced this file.
data3.to_csv(
    path_or_buf="sorted_coded_session_293s_infrequent_0_01_apps_removed_without_length1.csv",
    index=False,
)

## Creating test file

In [26]:
# Turn every (user_id, session) into one sequence line: the app codes in row
# order, each followed by " -1", with " -2" closing the sequence.
input_string = (
    data3.groupby(['user_id', 'session'])['app_code']
    .apply(list)
    .apply(lambda codes: ' -1 '.join(map(str, codes)) + " -1 -2")
)

In [27]:
# Persist the sequence strings, one per row (row index is not written).
# NOTE(review): the file name says 0_01 while frequency_threshold is set to
# 0.05 in this notebook - confirm which threshold produced this file.
input_string.to_csv(
    path_or_buf="input_string_293s_individual_infrequency_0_01.csv",
    index=False,
)

In [3]:
# Reload the sequence strings; the result is a one-column DataFrame.
input_string = pd.read_csv(filepath_or_buffer="input_string_293s_individual_infrequency_0_01.csv")

In [4]:
# Build the text for the sequence file: one sequence per line.
# to_string() pads every row to the width of the longest row, and the recorded
# output of this cell is a MemoryError. Joining the single column directly
# produces the same lines without the padding.
# lstrip mirrors the leading-whitespace removal that the padded text needed.
dfAsString = "\n".join(input_string.iloc[:, 0].astype(str).str.lstrip())

MemoryError: 

In [None]:
# Remove the whitespace that follows each line break, so every sequence
# starts at the beginning of its line.
dfAsString = re.sub(r'\n\s+', '\n', dfAsString)
#update file name
spmf_input_path = os.path.join(os.getcwd(), '293sec_sessions_individual_infrequency_0_01.txt')
# The context manager closes the file even if the write fails.
with open(spmf_input_path, 'w') as f:
    f.write(dfAsString)

## Testing with VMSP

In [None]:
# A test using their package
# NOTE(review): the paths below are absolute and machine-specific.
vmsp_input_file = "/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_individual_infrequency_0_01.txt"
vmsp_output_file = "/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_293sec_sessions_individual_infrequency_0_01_min_sup_1.txt"
spmf_bin_dir = "/home/s164574/motifs_and_competition_in_app_usage/"

# Configure the VMSP run; 0.01 is the only argument handed to the algorithm.
spmf = Spmf(
    "VMSP",
    input_filename=vmsp_input_file,
    output_filename=vmsp_output_file,
    spmf_bin_location_dir=spmf_bin_dir,
    arguments=[0.01],
)
spmf.run()

# Show the result as a DataFrame, then export it as CSV.
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_293sec_sessions_individual_infrequency_0_01_min_sup_1.csv")

In [None]:
# Lookup from app_code to package_name, built from the coded data set.
# Note that this rebinds `data` to a different file than the one loaded at
# the top of the notebook.
data = pd.read_csv(filepath_or_buffer='sorted_coded_data.csv')
code_name_pairs = zip(data['app_code'], data['package_name'])
app_dict = dict(code_name_pairs)

In [None]:
# Load the VMSP result (semicolon-separated).
output = pd.read_csv(
    "output_VMSP_293sec_sessions_individual_infrequency_0_01_min_sup_1.csv",
    delimiter=';',
)

# Each pattern is stored as the text of a Python literal; parse it and replace
# every app code by its package name.
output['pattern'] = output['pattern'].map(
    lambda raw_pattern: [app_dict[int(code)] for code in ast.literal_eval(raw_pattern)]
)

# Highest support first.
output = output.sort_values(by=['sup'], ascending=False)

# Number of apps in each pattern.
output['pattern_length'] = output['pattern'].map(len)

output.to_csv(
    "output_VMSP_293sec_sessions_individual_infrequency_0_01_min_sup_1_translated.csv",
    index=False,
)

In [None]:
# Summary of the mined patterns: how many have length 1, and the longest length.
single_app_patterns = int((output['pattern_length'] == 1).sum())
longest_pattern = output['pattern_length'].max()
print("Number of frequent patterns of length 1:", single_app_patterns)
print("Max lengths of frequent patterns:", longest_pattern)