In [1]:
from spmf import Spmf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import ast
pd.options.plotting.backend = "plotly"
import plotly.express as px
# from chart_studio import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.subplots as sp
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 70)
from pandas.plotting import table 
import nltk
import seaborn as sns
%matplotlib inline
import random

In [2]:
# Load the session-coded usage records from CSV into the working frame.
sessions_csv = "sorted_coded_session_293s_merged.csv"
data = pd.read_csv(sessions_csv)

# Removing infrequent apps.
Infrequent apps are defined as those which make up less than 0.01 percent of the total number of observations. These are removed in this section.

In [3]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
data

Unnamed: 0,user_id,package_name,start_time,end_time,app_code,time_spend,sequence_number,sequence_Start_time,sequence_end_time,sleep_time,new_session,session
0,0,com.facebook.katana,0,441,19968,441,1,0,441,0,False,0
1,0,com.android.chrome,441,1012,7649,571,2,441,1012,0,False,0
2,0,com.google.android.apps.photos,1012,1032,23916,20,3,1012,1032,0,False,0
3,0,com.facebook.katana,1042,1502,19968,460,4,1042,1502,10,False,0
4,0,com.whatsapp,1512,1572,53003,60,5,1512,1572,10,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...
228760320,9999,com.google.android.googlequicksearchbox,29207214,29207244,23949,30,14901,29207214,29207244,185,False,5646
228760321,9999,com.android.chrome,29207244,29207284,7649,40,14902,29207244,29207284,0,False,5646
228760322,9999,com.google.android.apps.docs,29207284,29207294,23875,10,14903,29207284,29207294,0,False,5646
228760323,9999,com.microsoft.office.outlook,29213184,29213213,33357,29,14904,29213184,29213213,5890,True,5647


In [4]:
# Number of observations (rows) per app code, largest first.
app_count = (
    data.groupby('app_code')
        .size()
        .sort_values(ascending=False)
)

In [8]:
# An app is "infrequent" when its share of all rows, in percent, is below 0.01.
is_infrequent = app_count / data.shape[0] * 100 < 0.01
infrequent_apps = app_count.loc[is_infrequent].index

In [10]:
# Report how many app codes fall below the 0.01 percent cut-off.
n_infrequent_apps = len(infrequent_apps)
print("number of infrequent apps for 0.01 thresshold:", n_infrequent_apps)

number of infrequent apps for 0.01 thresshold: 66757


In [16]:
# Apps that survive the cut-off = all distinct app codes minus the infrequent ones.
n_apps_total = data['app_code'].nunique()
n_frequent_apps = n_apps_total - len(infrequent_apps)
print("number of frequent apps for 0.01 thresshold:", n_frequent_apps)

number of frequent apps for 0.01 thresshold: 331


In [14]:
# Flag every row whose app code is in the infrequent set.
# Series.isin does the membership test vectorised, instead of calling a Python
# lambda once per row; the result is the same boolean column.
data["infrequent_app"]=data["app_code"].isin(infrequent_apps)

In [14]:
# NOTE(review): `data_sessions` is not assigned in any cell above this one, so on a
# fresh kernel (Restart & Run All) this line raises NameError. It repeats, for a
# different frame, the flagging done on `data` in the preceding cell and carries
# the same execution count, so it looks like a leftover.
# TODO confirm whether `data_sessions` is still needed; otherwise remove this cell.
data_sessions["infrequent_app"]=data_sessions.app_code.apply(lambda l: l in infrequent_apps)

In [17]:
# Drop the rows flagged as belonging to an infrequent app.
is_kept = data["infrequent_app"].ne(True)
data = data.loc[is_kept]

In [20]:
# Row count left after the infrequent-app filter.
n_rows_kept = len(data)
print("Number of observations after infrequent apps have been removed:", n_rows_kept)

Number of observations after infrequent apps have been removed: 214416409


We now remove sessions which only contain one app.

In [25]:
# Count distinct session ids within each user, then sum over all users.
sessions_per_user = data.groupby('user_id')['session'].nunique()
print("Number of sessons including sessions only containing one app:", sessions_per_user.sum())

Number of sessons including sessions only containing one app: 74783651


In [27]:
# Attach to every row the number of rows in its (user_id, session) group.
# transform('size') broadcasts the group size straight back onto the rows, so no
# intermediate per-session table has to be built and joined back, and re-running
# the cell overwrites the column instead of producing _x/_y duplicates.
session_sizes=data.groupby(['user_id','session'])['app_code'].transform('size')
# reset_index reproduces the fresh 0..n-1 index that the merge-based version returned.
data=data.assign(session_length_updated=session_sizes).reset_index(drop=True)

In [31]:
# Keep only rows whose session has more than one row.
is_multi_row_session = data['session_length_updated'].gt(1)
data = data.loc[is_multi_row_session]

In [33]:
# Row count after the session-length filter.
n_rows_remaining = len(data)
print("number of observations:", n_rows_remaining)

number of observations: 182182295


# Preparing the VMSP input
Now that the infrequent apps have been removed, along with sessions containing only one app, we are ready to construct the text file input used by the VMSP algorithm.

In [36]:
def _format_session(app_codes):
    """Join the codes with ' -1 ' and terminate the line with ' -1 -2'."""
    return ' -1 '.join(str(code) for code in app_codes) + " -1 -2"


# One list of app codes per (user_id, session), in row order within the session,
# then one formatted text line per session.
session_app_lists = data.groupby(['user_id','session'])['app_code'].apply(list)
input_string = session_app_lists.apply(_format_session)

In [37]:
# Write one session string per line; fmt='%s' emits each string as-is.
session_lines = input_string.to_numpy()
np.savetxt(
    fname='293sec_sessions_0_01_removed_without_split.txt',
    X=session_lines,
    fmt='%s',
)

In [38]:
# Number of session lines that were produced for the VMSP input.
n_sessions = len(input_string)
print("number of updated sessions:", n_sessions)

number of updated sessions: 42549537


In [40]:
# Absolute support threshold: how many sessions a pattern must occur in to reach
# a relative minimum support of 0.1 percent. The label previously said
# "0.01 percent" although the expression computes 0.1 percent of the sessions.
min_sup_percent=0.1
print("Number of sessions needed for min. sup. of 0.1 percent:",np.ceil(input_string.shape[0]/100*min_sup_percent))

Number of sessions needed for min. sup. of 0.01 percent: 4255.0


# VMSP
Now the VMSP algorithm is run, with a minimum support of 0.1 percent.

In [41]:
# Run the VMSP algorithm through the Spmf wrapper with a single argument of 0.001.
# NOTE(review): the raw output file name says "2293sec" while the CSV written
# below says "293sec"; this looks like a typo. Confirm before renaming.
vmsp_input_path = "/home/s164574/motifs_and_competition_in_app_usage/293sec_sessions_0_01_removed_without_split.txt"
vmsp_raw_output_path = "/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_2293sec_sessions_0_01_removed_without_split_min_sup_0_1.txt"
spmf_bin_dir = "/home/s164574/motifs_and_competition_in_app_usage/"

spmf = Spmf(
    "VMSP",
    input_filename=vmsp_input_path,
    output_filename=vmsp_raw_output_path,
    spmf_bin_location_dir=spmf_bin_dir,
    arguments=[0.001],
)
spmf.run()

# Show the mined patterns as a DataFrame, then export them to CSV.
vmsp_patterns = spmf.to_pandas_dataframe(pickle=True)
print(vmsp_patterns)
spmf.to_csv("output_VMSP_293sec_sessions_0_01_removed_without_split_min_sup_0_1.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 7326463 ms
 Frequent sequences count : 7378
 Max memory (mb) : 20706.6906051635747378
minsup 42550
Intersection count 68808 


                                                                            pattern  \
0                                                                           [66375]   
1                                                                           [65996]   
2                                                                           [65957]   
3                                                                           [65471]   
4                                                                           [65457]   
...                                                                             ...   
7373         [19968, 53003, 19968, 53003, 19968, 53003, 19968, 19968, 53003, 19968]   
7374         [53003, 19968, 53003, 19968, 53003, 19968, 53003, 19968, 19968, 19968]   
7375         [19

## Translating the result

In [42]:
app_dict=dict(zip(data.app_code, data.package_name))

output=pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.csv",delimiter=';')
output['pattern']=output.pattern.apply(lambda l: [app_dict[int(i)] for i in ast.literal_eval(l)])
output=output.sort_values(['sup'],ascending=False)
output['pattern_length']=output.pattern.apply(lambda l: len(l))
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.csv",index=False)

In [43]:
output=pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.csv")

print("number of length one patterns:",output[output.pattern_length==1].shape[0])
print("longest pattern:",output.pattern_length.max())

number of length one patterns: 12
longest pattern: 3
