In [64]:
from spmf import Spmf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import ast
pd.options.plotting.backend = "plotly"
import plotly.express as px
# from chart_studio import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.subplots as sp
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 70)

In [65]:
data=pd.read_csv("sorted_coded_session_293s_merged.csv")

# keep 90% coverage of data

In [66]:
cumulated_time_spend=data.groupby(["user_id","app_code"]).agg({'time_spend':'sum'})['time_spend'].groupby('user_id', group_keys=False).apply(lambda x: x.sort_values(ascending=False).cumsum()).reset_index()

In [67]:
total_time_spend=data.groupby('user_id').time_spend.sum()

In [68]:
cumulated_time_spend=cumulated_time_spend.join(total_time_spend,on='user_id',rsuffix="_total")

In [69]:
cumulated_time_spend['coverage']=cumulated_time_spend.time_spend/cumulated_time_spend.time_spend_total

In [70]:
cumulated_time_spend=cumulated_time_spend[cumulated_time_spend.coverage.shift(1)<=0.50]

In [71]:
data_coverage=data.merge(cumulated_time_spend,how='inner',on=['user_id','app_code'])

As a sanity check we take a look at how many apps are left for the each user after the apps were removed.

In [72]:
(data_coverage.groupby('user_id').app_code.nunique()==1).sum()

3349

In [73]:
data_coverage.groupby('user_id').app_code.nunique()[data_coverage.groupby('user_id').app_code.nunique()<5]

user_id
0       2
1       2
2       1
3       2
4       3
       ..
9994    3
9996    3
9997    1
9998    2
9999    3
Name: app_code, Length: 8802, dtype: int64

In [74]:
print("Number of observation after removing apps:",data_coverage.shape[0])
print("Number of different apps left:",data_coverage.app_code.nunique())

Number of observation after removing apps: 49471272
Number of different apps left: 1139


Initially this results in 11 users only having 1 app left and 3469 users having less than 10 different apps. Therefore the coverage is increased form 90% to 95%. Here 1015 users have less than 10 different apps. for 99% 89 users have less than 10 apps. Only 7 have less than 5 and only one user has three apps, and no user has less than 3 different apps.

## update session length

In [75]:
data_coverage=data_coverage.merge(data_coverage.groupby(['user_id','session']).agg({'app_code':'size'}).rename(columns={'app_code':'session_length'}).reset_index(),on=['user_id','session'])

We are now ready to find the number of observations.

In [76]:
input_string=data_coverage[data_coverage.session_length>1].groupby(['user_id','session'])['app_code'].apply(list).apply(lambda l: ' -1 '.join([str(i) for i in l])+" -1 -2")

In [77]:
input_string.to_csv("input_string_50_coverage.csv",index='False')

In [78]:
input_string=pd.read_csv("input_string_50_coverage.csv")

In [79]:
input_string=input_string.drop(['user_id','session'],axis=1)

In [80]:
dfAsString = input_string.to_string(header=False, index=False).lstrip()

In [81]:
dfAsString=re.sub(r'\n\s+','\n',dfAsString)

In [82]:
#update file name
f = open(os.path.join(os.getcwd(), '293sec_sessions_50_coverage_without_length_1.txt'), 'w')
f.write(dfAsString)
f.close()

In [84]:
spmf = Spmf("VMSP", input_filename="/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_50_coverage_without_length_1.txt",
            output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_293sec_sessions_50_coverage_without_length_1_min_sup_0_75.txt",
            spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
            arguments=[0.0075])
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_293sec_sessions_50_coverage_without_length_1_min_sup_0_75.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 53523 ms
 Frequent sequences count : 33
 Max memory (mb) : 7283.82916259765633
minsup 68548
Intersection count 115 


                                       pattern     sup
0                                      [53996]   94640
1                                      [45846]  116245
2                                      [36529]  107114
3                                      [23947]  142963
4                                      [23906]  125357
5                                      [18676]   70192
6                                       [2259]  111140
7                               [45329, 45329]   74676
8                               [53003, 27441]   91982
9                               [53003, 19968]   82754
10                              [19968, 53003]  122173
11                               [53003, 7649]  141203
12                               [45264, 7649]   74505
13                              [23967

In [86]:
app_dict=dict(zip(data.app_code, data.package_name))

In [87]:
output=pd.read_csv("output_VMSP_293sec_sessions_50_coverage_without_length_1_min_sup_0_75.csv",delimiter=';')
output['pattern']=output.pattern.apply(lambda l: [app_dict[int(i)] for i in ast.literal_eval(l)])
output=output.sort_values(['sup'],ascending=False)
output['pattern_length']=output.pattern.apply(lambda l: len(l))
output.to_csv("output_VMSP_293sec_sessions_50_coverage_without_length_1_min_sup_0_75.csv",index=False)

In [89]:
print("Number of frequent patterns of length 1:",len(output[output.pattern_length==1]))
print("Max lengths of frequent patterns:",output.pattern_length.max())

Number of frequent patterns of length 1: 7
Max lengths of frequent patterns: 6


In [90]:
output

Unnamed: 0,pattern,sup,pattern_length
13,"[com.google.android.youtube, com.google.android.youtube]",187659,2
21,"[com.facebook.katana, com.android.chrome]",152861,2
3,[com.google.android.gm],142963,1
11,"[com.whatsapp, com.android.chrome]",141203,2
19,"[com.google.android.googlequicksearchbox, com.android.chrome]",131717,2
4,[com.google.android.apps.maps],125357,1
10,"[com.facebook.katana, com.whatsapp]",122173,2
25,"[com.google.android.googlequicksearchbox, com.google.android.googlequicksearchbox, com.google.android.googlequicksearchbox]",121632,3
1,[com.spotify.music],116245,1
15,"[com.facebook.katana, com.instagram.android]",113516,2
