In [64]:
from spmf import Spmf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import ast
pd.options.plotting.backend = "plotly"
import plotly.express as px
# from chart_studio import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.subplots as sp
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 70)

In [65]:
data=pd.read_csv("sorted_coded_session_293s_merged.csv")

# keep coverage of data

In [189]:
cumulated_time_spend=data.groupby(["user_id","app_code"]).agg({'time_spend':'sum'})['time_spend'].groupby('user_id', group_keys=False).apply(lambda x: x.sort_values(ascending=False).cumsum()).reset_index()

In [190]:
total_time_spend=data.groupby('user_id').time_spend.sum()

In [191]:
cumulated_time_spend=cumulated_time_spend.join(total_time_spend,on='user_id',rsuffix="_total")

In [192]:
cumulated_time_spend['coverage']=cumulated_time_spend.time_spend/cumulated_time_spend.time_spend_total

In [193]:
cumulated_time_spend=cumulated_time_spend[cumulated_time_spend.coverage.shift(1)<=0.70]

In [194]:
data_coverage=data.merge(cumulated_time_spend,how='inner',on=['user_id','app_code'])

As a sanity check we take a look at how many apps are left for the each user after the apps were removed.

In [195]:
(data_coverage.groupby('user_id').app_code.nunique()==1).sum()

527

In [196]:
data_coverage.groupby('user_id').app_code.nunique()[data_coverage.groupby('user_id').app_code.nunique()<5]

user_id
0       4
2       2
3       3
5       3
6       3
       ..
9992    2
9993    3
9994    4
9995    2
9997    2
Name: app_code, Length: 5707, dtype: int64

In [197]:
print("Number of observation after removing apps:",data_coverage.shape[0])
print("Number of different apps left:",data_coverage.app_code.nunique())

Number of observation after removing apps: 90303962
Number of different apps left: 2268


Initially this results in 11 users only having 1 app left and 3469 users having less than 10 different apps. Therefore the coverage is increased form 90% to 95%. Here 1015 users have less than 10 different apps. for 99% 89 users have less than 10 apps. Only 7 have less than 5 and only one user has three apps, and no user has less than 3 different apps.

## update session length

In [198]:
data_coverage=data_coverage.merge(data_coverage.groupby(['user_id','session']).agg({'app_code':'size'}).rename(columns={'app_code':'session_length'}).reset_index(),on=['user_id','session'])

We are now ready to find the number of observations.

In [199]:
input_string=data_coverage[data_coverage.session_length>1].groupby(['user_id','session'])['app_code'].apply(list).apply(lambda l: ' -1 '.join([str(i) for i in l])+" -1 -2")

In [200]:
input_string.to_csv("input_string_70_coverage.csv",index='False')

In [201]:
input_string=pd.read_csv("input_string_70_coverage.csv")

In [202]:
input_string=input_string.drop(['user_id','session'],axis=1)

In [203]:
dfAsString = input_string.to_string(header=False, index=False).lstrip()

In [204]:
dfAsString=re.sub(r'\n\s+','\n',dfAsString)

In [205]:
#update file name
f = open(os.path.join(os.getcwd(), '293sec_sessions_70_coverage_without_length_1.txt'), 'w')
f.write(dfAsString)
f.close()

In [206]:
spmf = Spmf("VMSP", input_filename="/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_70_coverage_without_length_1.txt",
            output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_293sec_sessions_70_coverage_without_length_1_min_sup_0_75.txt",
            spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
            arguments=[0.0075])
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_293sec_sessions_70_coverage_without_length_1_min_sup_0_75.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 198252 ms
 Frequent sequences count : 68
 Max memory (mb) : 22900.25807189941468
minsup 136450
Intersection count 332 


                                pattern     sup
0                               [53996]  374685
1                               [45261]  209142
2                               [45260]  159030
3                               [36529]  277496
4                               [33357]  319240
5                               [23909]  193550
6                               [23906]  462013
7                               [18676]  249771
8                                [2259]  342373
9                        [50519, 50519]  337960
10                       [53003, 45264]  220910
11                       [45264, 53003]  290301
12                       [53003, 44880]  150934
13                       [44880, 53003]  136774
14                       [45846, 45846]  156208
15                       [45329, 4532

In [86]:
app_dict=dict(zip(data.app_code, data.package_name))

In [207]:
output=pd.read_csv("output_VMSP_293sec_sessions_70_coverage_without_length_1_min_sup_0_75.csv",delimiter=';')
output['pattern']=output.pattern.apply(lambda l: [app_dict[int(i)] for i in ast.literal_eval(l)])
output=output.sort_values(['sup'],ascending=False)
output['pattern_length']=output.pattern.apply(lambda l: len(l))
output.to_csv("output_VMSP_293sec_sessions_70_coverage_without_length_1_min_sup_0_75.csv",index=False)

In [208]:
print("Number of frequent patterns of length 1:",len(output[output.pattern_length==1]))
print("Max lengths of frequent patterns:",output.pattern_length.max())

Number of frequent patterns of length 1: 9
Max lengths of frequent patterns: 5


In [209]:
output

Unnamed: 0,pattern,sup,pattern_length
6,[com.google.android.apps.maps],462013,1
0,[com.yahoo.mobile.client.android.mail],374685,1
8,[bbc.mobile.news.uk],342373,1
9,"[com.twitter.android, com.twitter.android]",337960,2
39,"[com.facebook.katana, com.instagram.android]",330961,2
4,[com.microsoft.office.outlook],319240,1
50,"[com.android.chrome, com.facebook.katana]",317767,2
21,"[com.facebook.katana, com.whatsapp]",316425,2
36,"[com.google.android.youtube, com.google.android.youtube]",300889,2
45,"[com.android.chrome, com.instagram.android]",295732,2


With the overall perspective and the low coverage it is not suprising that it are the well known apps that we see here. it also turns out that we end up with a lot of repeating apps.Since repaeting apps were merged apps were removed based on coverage. It indicates that for instane com.google.android.youtube -> x -> com.google.android.youtube is a common pattern. It is also clear to se that apps where you spend a lot of time are generally favoured. For instance the most common pattern is youtube->youtube which is an app were people could be expected to use more time.

In [91]:
spmf = Spmf("VMSP", input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed.txt",
            output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_test.txt",
            spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
            arguments=[0.0075])
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_test.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 589600 ms
 Frequent sequences count : 107
 Max memory (mb) : 14170.760398864746107
minsup 396290
Intersection count 580 


                  pattern      sup
0                 [65472]   420464
1                 [53996]  1065566
2                 [49042]   523059
3                 [45846]  1581664
4                 [45327]   454480
..                    ...      ...
102  [19968, 7649, 19968]   752957
103   [7649, 23949, 7649]   590476
104   [23949, 7649, 7649]   521353
105   [7649, 19968, 7649]   442857
106    [7649, 7649, 7649]   415624

[107 rows x 2 columns]


With the time coverage a lot more observations need to be removed. Therefore we will instead try to use the coverage of the observations so as to for instance keep 90% of the users observations.
This does however have the disadvantage that we risk more scars observations within a session after they are removed.

In [151]:
app_occurences=data.groupby(["user_id","app_code"]).agg({'package_name':'size'})['package_name'].groupby('user_id', group_keys=False).apply(lambda x: x.sort_values(ascending=False).cumsum()).reset_index()

In [152]:
total_apps=data.groupby('user_id').package_name.size()

In [153]:
app_occurences=app_occurences.join(total_apps,on='user_id',rsuffix="_total")

In [154]:
app_occurences['coverage']=app_occurences.package_name/app_occurences.package_name_total

In [155]:
app_occurences=app_occurences[app_occurences.coverage.shift(1)<=0.75]

In [156]:
data_coverage=data.merge(app_occurences,how='inner',on=['user_id','app_code'])

In [157]:
(data_coverage.groupby('user_id').app_code.nunique()==1).sum()

20

In [158]:
print("Number of observation after removing apps:",data_coverage.shape[0])
print("Number of different apps left:",data_coverage.app_code.nunique())

Number of observation after removing apps: 121813869
Number of different apps left: 2469


In [159]:
data_coverage=data_coverage.merge(data_coverage.groupby(['user_id','session']).agg({'app_code':'size'}).rename(columns={'app_code':'session_length'}).reset_index(),on=['user_id','session'])

In [160]:
input_string=data_coverage[data_coverage.session_length>1].groupby(['user_id','session'])['app_code'].apply(list).apply(lambda l: ' -1 '.join([str(i) for i in l])+" -1 -2")

In [161]:
input_string.to_csv("input_string_75_observation_coverage.csv",index='False')

In [162]:
input_string=pd.read_csv("input_string_75_observation_coverage.csv")

In [163]:
input_string=input_string.drop(['user_id','session'],axis=1)

In [164]:
dfAsString = input_string.to_string(header=False, index=False).lstrip()

In [165]:
dfAsString=re.sub(r'\n\s+','\n',dfAsString)

In [166]:
#update file name
f = open(os.path.join(os.getcwd(), '293sec_sessions_75_observation_coverage_without_length_1.txt'), 'w')
f.write(dfAsString)
f.close()

In [167]:
spmf = Spmf("VMSP", input_filename="/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_75_observation_coverage_without_length_1.txt",
            output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_293sec_sessions_75_observation_coverage_without_length_1_min_sup_0_75.txt",
            spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
            arguments=[0.0075])
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_293sec_sessions_75_observation_coverage_without_length_1_min_sup_0_75.csv")

CalledProcessError: Command '['java', '-jar', '/home/s164574/motifs_and_competition_in_app_usage/spmf.jar', 'run', 'VMSP', '/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_75_observation_coverage_without_length_1.txt', '/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_293sec_sessions_75_observation_coverage_without_length_1_min_sup_0_75.txt', '0.0075']' returned non-zero exit status 1.