# Document description
In this notebook we run the VMSP algorithm with a range of minimum-support values in order to tune the minimum support.

# Import

In [3]:
# Python wrapper class used below to run the VMSP algorithm.
from spmf import Spmf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import ast
# Route pandas' .plot() calls to plotly instead of the default backend.
pd.options.plotting.backend = "plotly"
import plotly.express as px
from chart_studio import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.subplots as sp
# Display options: do not truncate column contents and show up to 70 rows.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 70)

# Tests

In [8]:
# VMSP run with minimum-support argument 0.001: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_1.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.001],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_1.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 2700763 ms
 Frequent sequences count : 1123
 Max memory (mb) : 16597.8954467773441123
minsup 52643
Intersection count 11215 


                                       pattern     sup
0                                      [66375]   82713
1                                      [66136]  135309
2                                      [65957]  100274
3                                      [65471]  128913
4                                      [65457]   59863
...                                        ...     ...
1118  [23949, 7649, 23949, 23949, 7649, 23949]   66174
1119  [23949, 23949, 7649, 23949, 7649, 23949]   52815
1120  [23949, 7649, 23949, 7649, 23949, 23949]   56967
1121   [23949, 7649, 23949, 7649, 23949, 7649]   79597
1122   [7649, 23949, 7649, 23949, 7649, 23949]   57362

[1123 rows x 2 columns]


To be able to interpret the discovered patterns, the app codes are translated to their corresponding package names.

In [9]:
# Build the lookup used to translate app codes into package names.
data = pd.read_csv('sorted_coded_data.csv')
app_dict = {
    code: package
    for code, package in zip(data["app_code"], data["package_name"])
}

In [10]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_1.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_1_translated.csv", index=False)

In [11]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 94
Max lengths of frequent patterns: 6


In [12]:
# VMSP run with minimum-support argument 0.01: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.01],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 515163 ms
 Frequent sequences count : 80
 Max memory (mb) : 14272.13662719726680
minsup 526429
Intersection count 403 


                 pattern      sup
0                [53996]  1074492
1                [45846]  1590487
2                [45284]   860977
3                [45246]   783659
4                [36529]   787773
..                   ...      ...
75  [23949, 7649, 23949]  1194735
76  [19968, 23949, 7649]   637146
77  [19968, 7649, 19968]   810573
78   [7649, 23949, 7649]   627769
79   [23949, 7649, 7649]   560004

[80 rows x 2 columns]


In [13]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_1_translated.csv", index=False)

In [14]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 12
Max lengths of frequent patterns: 3


In [15]:
# VMSP run with minimum-support argument 0.0095: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_95.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0095],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_95.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 543618 ms
 Frequent sequences count : 84
 Max memory (mb) : 14233.91006469726684
minsup 500108
Intersection count 415 


                 pattern      sup
0                [53996]  1074492
1                [49042]   523172
2                [45846]  1590487
3                [45284]   860977
4                [45246]   783659
..                   ...      ...
79  [19968, 23949, 7649]   637146
80  [23949, 7649, 19968]   519576
81  [19968, 7649, 19968]   810573
82   [23949, 7649, 7649]   560004
83   [7649, 23949, 7649]   627769

[84 rows x 2 columns]


In [16]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_95.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_95_translated.csv", index=False)

In [17]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 14
Max lengths of frequent patterns: 3


In [18]:
# VMSP run with minimum-support argument 0.009: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_9.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.009],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_9.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 511408 ms
 Frequent sequences count : 90
 Max memory (mb) : 14192.91181182861390
minsup 473786
Intersection count 434 


                 pattern      sup
0                [53996]  1074492
1                [49042]   523172
2                [45846]  1590487
3                [45284]   860977
4                [45246]   783659
..                   ...      ...
85  [19968, 7649, 19968]   810573
86   [23949, 7649, 7649]   560004
87   [7649, 23949, 7649]   627769
88   [7649, 19968, 7649]   478989
89    [7649, 7649, 7649]   480170

[90 rows x 2 columns]


In [19]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_9.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_9_translated.csv", index=False)

In [20]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 15
Max lengths of frequent patterns: 3


In [21]:
# VMSP run with minimum-support argument 0.0085: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_85.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0085],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_85.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 553603 ms
 Frequent sequences count : 95
 Max memory (mb) : 14185.15042877197395
minsup 447465
Intersection count 488 


                 pattern      sup
0                [53996]  1074492
1                [49042]   523172
2                [45846]  1590487
3                [45327]   460532
4                [45284]   860977
..                   ...      ...
90  [19968, 7649, 19968]   810573
91   [7649, 23949, 7649]   627769
92   [23949, 7649, 7649]   560004
93   [7649, 19968, 7649]   478989
94    [7649, 7649, 7649]   480170

[95 rows x 2 columns]


In [22]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_85.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_85_translated.csv", index=False)

In [23]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 15
Max lengths of frequent patterns: 3


In [24]:
# VMSP run with minimum-support argument 0.008: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_8.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.008],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_8.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 598427 ms
 Frequent sequences count : 108
 Max memory (mb) : 14273.31216430664108
minsup 421143
Intersection count 588 


                 pattern      sup
0                [65472]   427039
1                [53996]  1074492
2                [49042]   523172
3                [45846]  1590487
4                [45327]   460532
..                   ...      ...
103  [23949, 7649, 7649]   560004
104  [7649, 23949, 7649]   627769
105  [19968, 7649, 7649]   427342
106  [7649, 19968, 7649]   478989
107   [7649, 7649, 7649]   480170

[108 rows x 2 columns]


In [25]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_8.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_8_translated.csv", index=False)

In [26]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 17
Max lengths of frequent patterns: 3


In [27]:
# VMSP run with minimum-support argument 0.0075: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_75.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0075],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_75.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 671259 ms
 Frequent sequences count : 115
 Max memory (mb) : 16200.174240112305115
minsup 394822
Intersection count 663 


                 pattern      sup
0                [65472]   427039
1                [53996]  1074492
2                [49042]   523172
3                [45846]  1590487
4                [45327]   460532
..                   ...      ...
110  [23949, 7649, 7649]   560004
111  [7649, 23949, 7649]   627769
112  [19968, 7649, 7649]   427342
113  [7649, 19968, 7649]   478989
114   [7649, 7649, 7649]   480170

[115 rows x 2 columns]


In [28]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_75.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_75_translated.csv", index=False)

In [29]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 18
Max lengths of frequent patterns: 3


In [30]:
# VMSP run with minimum-support argument 0.007: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_7.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.007],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_7.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 708364 ms
 Frequent sequences count : 130
 Max memory (mb) : 16545.53606414795130
minsup 368500
Intersection count 778 


                        pattern      sup
0                       [65472]   427039
1                       [53996]  1074492
2                       [49042]   523172
3                       [45846]  1590487
4                       [45327]   460532
..                          ...      ...
125         [7649, 7649, 19968]   371044
126         [19968, 7649, 7649]   427342
127         [7649, 19968, 7649]   478989
128          [7649, 7649, 7649]   480170
129  [23949, 7649, 23949, 7649]   392972

[130 rows x 2 columns]


In [31]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_7.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_7_translated.csv", index=False)

In [32]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 20
Max lengths of frequent patterns: 4


In [33]:
# VMSP run with minimum-support argument 0.0065: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_65.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0065],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_65.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 679218 ms
 Frequent sequences count : 136
 Max memory (mb) : 14354.177284240723136
minsup 342179
Intersection count 866 


                          pattern      sup
0                         [65472]   427039
1                         [59154]   357369
2                         [53996]  1074492
3                         [49042]   523172
4                         [45846]  1590487
..                            ...      ...
131           [7649, 7649, 19968]   371044
132           [7649, 19968, 7649]   478989
133            [7649, 7649, 7649]   480170
134  [53003, 19968, 53003, 19968]   344782
135    [23949, 7649, 23949, 7649]   392972

[136 rows x 2 columns]


In [34]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_65.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_65_translated.csv", index=False)

In [35]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 20
Max lengths of frequent patterns: 4


In [36]:
# VMSP run with minimum-support argument 0.006: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_6.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.006],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_6.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 809714 ms
 Frequent sequences count : 146
 Max memory (mb) : 16835.67275238037146
minsup 315858
Intersection count 941 


                          pattern      sup
0                         [65472]   427039
1                         [59154]   357369
2                         [53996]  1074492
3                         [51657]   323309
4                         [49042]   523172
..                            ...      ...
141           [7649, 19968, 7649]   478989
142           [19968, 7649, 7649]   427342
143            [7649, 7649, 7649]   480170
144  [53003, 19968, 53003, 19968]   344782
145    [23949, 7649, 23949, 7649]   392972

[146 rows x 2 columns]


In [37]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_6.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_6_translated.csv", index=False)

In [38]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 21
Max lengths of frequent patterns: 4


In [39]:
# VMSP run with minimum-support argument 0.0055: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_55.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0055],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_55.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 707803 ms
 Frequent sequences count : 155
 Max memory (mb) : 14338.39582824707155
minsup 289536
Intersection count 1031 


                          pattern      sup
0                         [65472]   427039
1                         [59154]   357369
2                         [53996]  1074492
3                         [51657]   323309
4                         [49042]   523172
..                            ...      ...
150           [19968, 7649, 7649]   427342
151            [7649, 7649, 7649]   480170
152  [53003, 19968, 53003, 19968]   344782
153  [19968, 53003, 19968, 53003]   299234
154    [23949, 7649, 23949, 7649]   392972

[155 rows x 2 columns]


In [40]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_55.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_55_translated.csv", index=False)

In [41]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 23
Max lengths of frequent patterns: 4


In [42]:
# VMSP run with minimum-support argument 0.005: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_5.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.005],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_5.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 766407 ms
 Frequent sequences count : 174
 Max memory (mb) : 14240.029014587402174
minsup 263215
Intersection count 1190 


                          pattern      sup
0                         [65472]   427039
1                         [59154]   357369
2                         [53996]  1074492
3                         [51657]   323309
4                         [49042]   523172
..                            ...      ...
169           [7649, 7649, 19968]   371044
170            [7649, 7649, 7649]   480170
171  [19968, 53003, 19968, 53003]   299234
172  [53003, 19968, 53003, 19968]   344782
173    [23949, 7649, 23949, 7649]   392972

[174 rows x 2 columns]


In [43]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_5.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_5_translated.csv", index=False)

In [44]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 27
Max lengths of frequent patterns: 4


In [45]:
# VMSP run with minimum-support argument 0.0045: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_45.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0045],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_45.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 897454 ms
 Frequent sequences count : 192
 Max memory (mb) : 15903.732650756836192
minsup 236893
Intersection count 1369 


                          pattern     sup
0                         [65472]  427039
1                         [59154]  357369
2                         [51657]  323309
3                         [50264]  244585
4                         [49042]  523172
..                            ...     ...
187  [19968, 53003, 19968, 53003]  299234
188  [53003, 19968, 53003, 19968]  344782
189   [23949, 7649, 23949, 23949]  256486
190    [23949, 7649, 23949, 7649]  392972
191    [7649, 23949, 7649, 23949]  259296

[192 rows x 2 columns]


In [46]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_45.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_45_translated.csv", index=False)

In [47]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 28
Max lengths of frequent patterns: 4


In [48]:
# VMSP run with minimum-support argument 0.004: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_4.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.004],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_4.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 1008158 ms
 Frequent sequences count : 232
 Max memory (mb) : 16150.376747131348232
minsup 210572
Intersection count 1721 


                         pattern     sup
0                        [65472]  427039
1                        [59154]  357369
2                        [51657]  323309
3                        [50264]  244585
4                        [49042]  523172
..                           ...     ...
227  [23949, 23949, 7649, 23949]  234694
228  [19968, 23949, 7649, 23949]  224163
229   [23949, 7649, 7649, 23949]  225476
230   [7649, 23949, 7649, 23949]  259296
231   [23949, 7649, 23949, 7649]  392972

[232 rows x 2 columns]


In [49]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_4.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_4_translated.csv", index=False)

In [50]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 36
Max lengths of frequent patterns: 4


In [51]:
# VMSP run with minimum-support argument 0.0035: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_35.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0035],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_35.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 969329 ms
 Frequent sequences count : 250
 Max memory (mb) : 14406.721794128418250
minsup 184250
Intersection count 1929 


                               pattern     sup
0                              [65472]  427039
1                              [59154]  357369
2                              [51657]  323309
3                              [50264]  244585
4                              [49042]  523172
..                                 ...     ...
245        [19968, 23949, 7649, 23949]  224163
246        [19968, 23949, 7649, 19968]  206746
247        [19968, 19968, 7649, 19968]  186937
248         [19968, 7649, 19968, 7649]  194263
249  [23949, 7649, 23949, 7649, 23949]  185403

[250 rows x 2 columns]


In [52]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_35.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_35_translated.csv", index=False)

In [53]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 40
Max lengths of frequent patterns: 5


In [54]:
# VMSP run with minimum-support argument 0.003: execute SPMF, print the parsed
# result frame and export it to CSV.
# NOTE(review): pickle=True presumably also stores the frame as a pickle file - confirm in spmf-py.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_3.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.003],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_3.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 1070593 ms
 Frequent sequences count : 298
 Max memory (mb) : 14345.460800170898298
minsup 157929
Intersection count 2426 


                                 pattern     sup
0                                [65472]  427039
1                                [59154]  357369
2                                [58938]  176272
3                                [51657]  323309
4                                [50264]  244585
..                                   ...     ...
293           [19968, 7649, 7649, 19968]  160270
294           [19968, 7649, 19968, 7649]  194263
295           [7649, 19968, 7649, 19968]  170307
296  [53003, 19968, 53003, 19968, 53003]  166593
297    [23949, 7649, 23949, 7649, 23949]  185403

[298 rows x 2 columns]


In [55]:
# Reload the ';'-separated VMSP export, replace each app code with its package
# name via app_dict, sort by support (highest first), record every pattern's
# length, and save the translated table.
output = (
    pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_3.csv", delimiter=';')
    .assign(pattern=lambda frame: frame.pattern.apply(
        lambda raw: [app_dict[int(code)] for code in ast.literal_eval(raw)]))
    .sort_values(['sup'], ascending=False)
    .assign(pattern_length=lambda frame: frame.pattern.apply(len))
)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_3_translated.csv", index=False)

In [56]:
# Report how many patterns consist of a single item and how long the longest pattern is.
print("Number of frequent patterns of length 1:", len(output.loc[output["pattern_length"] == 1]))
print("Max lengths of frequent patterns:", output["pattern_length"].max())

Number of frequent patterns of length 1: 51
Max lengths of frequent patterns: 5


In [57]:
# Run VMSP with a relative minimum support of 0.0025 (0.25 %), print the
# resulting pattern/support table and export it to CSV.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_25.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0025],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_25.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 1187532 ms
 Frequent sequences count : 362
 Max memory (mb) : 14494.559265136719362
minsup 131608
Intersection count 3069 


                                 pattern     sup
0                                [66136]  135309
1                                [65472]  427039
2                                [59154]  357369
3                                [58938]  176272
4                                [54974]  135085
..                                   ...     ...
357            [7649, 7649, 23949, 7649]  148495
358             [7649, 7649, 7649, 7649]  150735
359  [53003, 19968, 53003, 19968, 53003]  166593
360  [19968, 53003, 19968, 53003, 19968]  134526
361    [23949, 7649, 23949, 7649, 23949]  185403

[362 rows x 2 columns]


In [58]:
# Load the patterns mined at min. sup. 0.25 %, look every app id up in
# app_dict (defined elsewhere in the notebook), order by support
# (highest first), record each pattern's length and save the result.
output = pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_25.csv", sep=';')
output['pattern'] = output['pattern'].apply(
    lambda raw: [app_dict[int(app_id)] for app_id in ast.literal_eval(raw)]
)
output = output.sort_values(by='sup', ascending=False)
output['pattern_length'] = output['pattern'].apply(len)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_25_translated.csv", index=False)

In [59]:
# Summary for this min. sup.: number of single-app patterns and longest pattern.
print("Number of frequent patterns of length 1:", int((output['pattern_length'] == 1).sum()))
print("Max lengths of frequent patterns:", output['pattern_length'].max())

Number of frequent patterns of length 1: 54
Max lengths of frequent patterns: 5


In [60]:
# Run VMSP with a relative minimum support of 0.002 (0.2 %), print the
# resulting pattern/support table and export it to CSV.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_2.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.002],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_2.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 1404153 ms
 Frequent sequences count : 489
 Max memory (mb) : 16643.205459594727489
minsup 105286
Intersection count 4297 


                                 pattern     sup
0                                [66136]  135309
1                                [65472]  427039
2                                [65471]  128913
3                                [59154]  357369
4                                [58938]  176272
..                                   ...     ...
484  [53003, 19968, 53003, 19968, 19968]  105464
485  [19968, 53003, 19968, 53003, 19968]  134526
486    [23949, 7649, 23949, 7649, 23949]  185403
487    [23949, 7649, 23949, 23949, 7649]  125256
488     [7649, 23949, 7649, 23949, 7649]  108790

[489 rows x 2 columns]


In [61]:
# Load the patterns mined at min. sup. 0.2 %, look every app id up in
# app_dict (defined elsewhere in the notebook), order by support
# (highest first), record each pattern's length and save the result.
output = pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_2.csv", sep=';')
output['pattern'] = output['pattern'].apply(
    lambda raw: [app_dict[int(app_id)] for app_id in ast.literal_eval(raw)]
)
output = output.sort_values(by='sup', ascending=False)
output['pattern_length'] = output['pattern'].apply(len)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_2_translated.csv", index=False)

In [62]:
# Summary for this min. sup.: number of single-app patterns and longest pattern.
print("Number of frequent patterns of length 1:", int((output['pattern_length'] == 1).sum()))
print("Max lengths of frequent patterns:", output['pattern_length'].max())

Number of frequent patterns of length 1: 65
Max lengths of frequent patterns: 5


In [63]:
# Run VMSP with a relative minimum support of 0.0015 (0.15 %), print the
# resulting pattern/support table and export it to CSV.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_15.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0015],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_15.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 1679305 ms
 Frequent sequences count : 669
 Max memory (mb) : 15147.725204467773669
minsup 78965
Intersection count 6378 


                                        pattern     sup
0                                       [66375]   82713
1                                       [66136]  135309
2                                       [65957]  100274
3                                       [65471]  128913
4                                       [62624]   99542
..                                          ...     ...
664           [7649, 23949, 23949, 7649, 23949]   85827
665           [19968, 23949, 7649, 23949, 7649]   80700
666           [19968, 7649, 19968, 7649, 19968]   88479
667  [53003, 19968, 53003, 19968, 53003, 19968]   79195
668     [23949, 7649, 23949, 7649, 23949, 7649]   79597

[669 rows x 2 columns]


In [64]:
# Load the patterns mined at min. sup. 0.15 %, look every app id up in
# app_dict (defined elsewhere in the notebook), order by support
# (highest first), record each pattern's length and save the result.
output = pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_15.csv", sep=';')
output['pattern'] = output['pattern'].apply(
    lambda raw: [app_dict[int(app_id)] for app_id in ast.literal_eval(raw)]
)
output = output.sort_values(by='sup', ascending=False)
output['pattern_length'] = output['pattern'].apply(len)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_15_translated.csv", index=False)

In [65]:
# Summary for this min. sup.: number of single-app patterns and longest pattern.
print("Number of frequent patterns of length 1:", int((output['pattern_length'] == 1).sum()))
print("Max lengths of frequent patterns:", output['pattern_length'].max())

Number of frequent patterns of length 1: 73
Max lengths of frequent patterns: 6


In [66]:
# Run VMSP with a relative minimum support of 0.0005 (0.05 %), print the
# resulting pattern/support table and export it to CSV.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_05.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0005],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_05.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 6460759 ms
 Frequent sequences count : 2751
 Max memory (mb) : 18544.3089904785162751
minsup 26322
Intersection count 29234 


                                                pattern     sup
0                                               [66375]   82713
1                                               [66279]   28064
2                                               [65996]   51049
3                                               [65957]  100274
4                                               [65645]   33120
...                                                 ...     ...
2746  [19968, 53003, 19968, 53003, 19968, 19968, 53003]   27600
2747     [23949, 7649, 23949, 7649, 23949, 23949, 7649]   31011
2748     [23949, 7649, 23949, 7649, 23949, 7649, 23949]   43879
2749     [23949, 7649, 23949, 23949, 7649, 23949, 7649]   30979
2750      [7649, 23949, 7649, 23949, 7649, 23949, 7649]   27945

[2751 rows x 2 columns]


In [67]:
# Load the patterns mined at min. sup. 0.05 %, look every app id up in
# app_dict (defined elsewhere in the notebook), order by support
# (highest first), record each pattern's length and save the result.
output = pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_05.csv", sep=';')
output['pattern'] = output['pattern'].apply(
    lambda raw: [app_dict[int(app_id)] for app_id in ast.literal_eval(raw)]
)
output = output.sort_values(by='sup', ascending=False)
output['pattern_length'] = output['pattern'].apply(len)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_05_translated.csv", index=False)

In [68]:
# Summary for this min. sup.: number of single-app patterns and longest pattern.
print("Number of frequent patterns of length 1:", int((output['pattern_length'] == 1).sum()))
print("Max lengths of frequent patterns:", output['pattern_length'].max())

Number of frequent patterns of length 1: 137
Max lengths of frequent patterns: 7


In [69]:
# Run VMSP with a relative minimum support of 0.0001 (0.01 %), print the
# resulting pattern/support table and export it to CSV.
spmf = Spmf(
    "VMSP",
    input_filename="/home/s164574/motifs_and_competition_in_app_usage/20sec_sessions_0_01_removed_without_split.txt",
    output_filename="/home/s164574/motifs_and_competition_in_app_usage/output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_01.txt",
    spmf_bin_location_dir="/home/s164574/motifs_and_competition_in_app_usage/",
    arguments=[0.0001],
)
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_01.csv")

>/home/s164574/motifs_and_competition_in_app_usage/spmf.jar
 Total time ~ 40112624 ms
 Frequent sequences count : 24599
 Max memory (mb) : 25871.5230484008824599
minsup 5265
Intersection count 297671 


                                                                             pattern  \
0                                                                            [66887]   
1                                                                            [66280]   
2                                                                            [66279]   
3                                                                            [66241]   
4                                                                            [65630]   
...                                                                              ...   
24594             [23949, 7649, 23949, 7649, 23949, 23949, 7649, 23949, 7649, 23949]   
24595              [23949, 7649, 23949, 7649, 23949, 7649, 23949, 7649, 23949, 7649]   
24596

In [70]:
# Load the patterns mined at min. sup. 0.01 %, look every app id up in
# app_dict (defined elsewhere in the notebook), order by support
# (highest first), record each pattern's length and save the result.
output = pd.read_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_01.csv", sep=';')
output['pattern'] = output['pattern'].apply(
    lambda raw: [app_dict[int(app_id)] for app_id in ast.literal_eval(raw)]
)
output = output.sort_values(by='sup', ascending=False)
output['pattern_length'] = output['pattern'].apply(len)
output.to_csv("output_VMSP_20sec_sessions_0_01_removed_without_split_min_sup_0_01_translated.csv", index=False)

In [71]:
# Summary for this min. sup.: number of single-app patterns and longest pattern.
print("Number of frequent patterns of length 1:", int((output['pattern_length'] == 1).sum()))
print("Max lengths of frequent patterns:", output['pattern_length'].max())

Number of frequent patterns of length 1: 90
Max lengths of frequent patterns: 11


## Creating tuning graphs

In [2]:
# Table of tuning results, one row per tested minimum support.
tunning_results = pd.read_csv(filepath_or_buffer="Tunning_min_sup_results.csv")

In [10]:
# Drop the first row of the tuning table (index label 0).
# Label-based slicing is used instead of the positional `[1:]` so that
# re-running this cell is a no-op rather than silently dropping one more row
# each time. This relies on the default integer index produced by read_csv.
tunning_results = tunning_results.loc[1:]

In [11]:
# Display the tuning table that the plots below are built from.
tunning_results

Unnamed: 0,min. sup.(%),num of frequent patterns founded,max length of founded pattern,num of length 1 founded pattern,number sessions corresponds to min. sup.
1,0.01,19216.0,10.0,98.0,5283.8635
2,0.05,2320.0,7.0,137.0,26419.3175
3,0.1,1003.0,6.0,91.0,52838.635
4,0.15,618.0,5.0,72.0,79257.9525
5,0.2,441.0,5.0,65.0,105677.27
6,0.25,342.0,5.0,54.0,132096.5875
7,0.3,282.0,5.0,50.0,158515.905
8,0.35,245.0,4.0,39.0,184935.2225
9,0.4,210.0,4.0,34.0,211354.54
10,0.45,182.0,4.0,28.0,237773.8575


In [14]:
# One scatter plot per tuning metric, all plotted against the minimum support
# and sharing the same data frame and figure size.
scatter_opts = dict(data_frame=tunning_results, x='min. sup.(%)', width=625, height=400)
fig1 = px.scatter(y='number sessions corresponds to min. sup.', **scatter_opts)
fig2 = px.scatter(y='num of frequent patterns founded', **scatter_opts)
fig3 = px.scatter(y='max length of founded pattern', **scatter_opts)
# NOTE(review): this column name ends with a space — confirm it matches the CSV header exactly.
fig4 = px.scatter(y='num of length 1 founded pattern ', **scatter_opts)

In [18]:
# Sessions corresponding to each min. sup. value.
fig1

In [15]:
# Number of frequent patterns found per min. sup. value.
fig2

In [16]:
# Length of the longest pattern found per min. sup. value.
fig3

In [17]:
# Number of length-1 patterns found per min. sup. value.
fig4

In [16]:
# Build the three tuning scatter plots with plotly express, then combine them
# into a single 2x2 subplot grid (the bottom-right cell is left empty).
figure1 = px.scatter(tunning_results, x='min. sup.(%)', y='num of frequent patterns founded', title='Number of frequent patterns')
figure2 = px.scatter(tunning_results, x='min. sup.(%)', y='max length of founded pattern', title='length of longest pattern')
# NOTE(review): this column name ends with a space — confirm it matches the CSV header exactly.
figure3 = px.scatter(tunning_results, x='min. sup.(%)', y='num of length 1 founded pattern ', title='Number of patters of length 1')

# An express figure is a container of traces; collect them per figure.
figure1_traces = list(figure1["data"])
figure2_traces = list(figure2["data"])
figure3_traces = list(figure3["data"])

# Create the 2x2 grid. Only traces are copied below, so the express figure
# titles would otherwise be lost; reuse them as subplot titles.
this_figure = sp.make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[
        figure1.layout.title.text,
        figure2.layout.title.text,
        figure3.layout.title.text,
        "",
    ],
)

# Place each figure's traces in its grid cell. `add_trace` replaces the
# deprecated `append_trace`.
for trace in figure1_traces:
    this_figure.add_trace(trace, row=1, col=1)
for trace in figure2_traces:
    this_figure.add_trace(trace, row=1, col=2)
for trace in figure3_traces:
    this_figure.add_trace(trace, row=2, col=1)

In [17]:
# Display the combined subplot figure.
this_figure

The tuning does not give a clear indication of which min. sup. to use. However, 0.1 gives a relatively stable result that still ensures long maximal patterns and a reasonable number of patterns, so this is the min. sup. we choose to work with.

# Analysing result
To better interpret the results we take a look at which apps occur in the maximal patterns and how they compare to the ones we found using the observation coverage definition of infrequency.

In [2]:
# Translated maximal patterns for min. sup. 0.1 %.
# NOTE(review): this file name has no "without_split" part, unlike the files written by the tuning runs above — confirm this is the intended input.
output = pd.read_csv(filepath_or_buffer="output_VMSP_20sec_sessions_0_01_removed_min_sup_0_1_translated.csv")

In [3]:
# Size of the result set and length of its longest pattern.
print("Number of maximal patterns:", len(output))
print("Longest pattern:", output['pattern_length'].max())

Number of maximal patterns: 1003
Longest pattern: 6


In [4]:
# Count how often each app occurs across all maximal patterns.
# The pattern column holds stringified lists, so brackets, quotes and
# whitespace are stripped first and the rest is split on commas.
# regex=True is passed explicitly: from pandas 2.0 the default is a literal
# match, which would leave the strings untouched. A raw string avoids
# invalid-escape warnings.
output_app_count = (
    pd.DataFrame(
        output["pattern"]
        .str.replace(r"[\[\]'\s]", "", regex=True)
        .str.split(',', expand=True)
        .values
    )
    .apply(lambda position: position.value_counts())  # per-position app counts
    .sum(axis=1)                                      # total over all positions
    .sort_values()
)

  output_app_count=pd.DataFrame(output["pattern"].str.replace("[\[\]\'\s]", "").str.split(',', expand=True).values).apply(pd.value_counts).sum(axis=1).sort_values()


In [5]:
# One entry per distinct app in the count series.
print("Number of unique apps in frequent patterns:", len(output_app_count))

Number of unique apps in frequent patterns: 142


In [6]:
# Translated maximal patterns from the run that used the observation-coverage
# definition of infrequency.
output_coverage = pd.read_csv(filepath_or_buffer="output_VMSP_20s_sessions_keep_96_min_sup_0_1_translated.csv")

In [7]:
# Same summary as for the percentage-based run, now for the coverage-based run.
print("Infrequency as observation coverage:")
print("Number of maximal patterns:", len(output_coverage))
print("Longest pattern:", output_coverage['pattern_length'].max())

Infrequency as observation coverage:
Number of maximal patterns: 1014
Longest pattern: 6


In [8]:
# Count how often each app occurs across all maximal patterns of the
# coverage-based run.
# The pattern column holds stringified lists, so brackets, quotes and
# whitespace are stripped first and the rest is split on commas.
# regex=True is passed explicitly: from pandas 2.0 the default is a literal
# match, which would leave the strings untouched. A raw string avoids
# invalid-escape warnings.
output_coverage_app_count = (
    pd.DataFrame(
        output_coverage["pattern"]
        .str.replace(r"[\[\]'\s]", "", regex=True)
        .str.split(',', expand=True)
        .values
    )
    .apply(lambda position: position.value_counts())  # per-position app counts
    .sum(axis=1)                                      # total over all positions
    .sort_values()
)

  output_coverage_app_count=pd.DataFrame(output_coverage["pattern"].str.replace("[\[\]\'\s]", "").str.split(',', expand=True).values).apply(pd.value_counts).sum(axis=1).sort_values()


In [9]:
# Number of distinct apps in the coverage-based patterns.
len(output_coverage_app_count)

142

In [10]:
# Apps that occur in the patterns of only one of the two runs.
# The original subtracted the coverage set from itself, which is always empty
# and therefore proved nothing. The symmetric difference compares the two
# runs in both directions; an empty set means both contain the same apps.
set(output_app_count.index) ^ set(output_coverage_app_count.index)

set()

We see that they both contain the same apps. The next step is to see if they contain the same patterns.

In [11]:
# Compare the two sets of maximal patterns: unique to each run, and shared.
patterns_percentage = set(output.pattern)
patterns_coverage = set(output_coverage.pattern)
print("Number of patterns which are is not not found with the coverage infrequency:", len(patterns_percentage - patterns_coverage))
print("Number of patterns which are is not not found with the percent infrequency:", len(patterns_coverage - patterns_percentage))
print("Number of patterns which occur in both:", len(patterns_coverage & patterns_percentage))

Number of patterns which are is not not found with the coverage infrequency: 10
Number of patterns which are is not not found with the percent infrequency: 21
Number of patterns which occur in both: 993


We see that the two only differ by a few patterns while the majority of the patterns found with the two approaches are the same.\
We here take a look at the patterns which are only found to be frequent using one of the methods.

In [13]:
# Rows of the percentage-based result whose pattern is absent from the
# coverage-based result.
print("Patterns which are only found using the percentage dedintion of infrequency:")
output[output['pattern'].isin(set(output['pattern']).difference(output_coverage['pattern']))]

Patterns which are only found using the percentage dedintion of infrequency:


Unnamed: 0,pattern,sup,pattern_length
83,"['com.facebook.orca', 'com.facebook.katana', 'com.instagram.android']",142916,3
94,"['com.whatsapp', 'com.facebook.katana', 'com.snapchat.android']",138458,3
152,"['com.facebook.katana', 'com.whatsapp', 'com.snapchat.android']",119715,3
645,"['com.whatsapp', 'com.whatsapp', 'com.facebook.katana', 'com.facebook.katana', 'com.whatsapp']",65805,5
992,"['com.sonymobile.email', 'com.google.android.apps.docs']",53111,2
995,"['com.google.android.googlequicksearchbox', 'com.sonyericsson.android.socialphonebook', 'com.android.incallui']",53072,3
998,"['com.sonyericsson.music', 'com.google.android.youtube']",53005,2
999,"['com.instagram.android', 'com.android.incallui']",52990,2
1001,"['com.google.android.apps.maps', 'com.google.android.youtube']",52881,2
1002,"['com.google.android.googlequicksearchbox', 'com.google.android.apps.messaging']",52845,2


In [14]:
# Rows of the coverage-based result whose pattern is absent from the
# percentage-based result.
print("Patterns which are only found using the coverage definition of infrequency:")
output_coverage[output_coverage['pattern'].isin(set(output_coverage['pattern']).difference(output['pattern']))]

Patterns which are only found using the coverage definition of infrequency:


Unnamed: 0,pattern,sup,pattern_length
264,"['com.google.android.googlequicksearchbox', 'com.android.incallui']",174048,2
318,"['com.google.android.googlequicksearchbox', 'com.sonyericsson.android.socialphonebook']",112945,2
475,"['com.instagram.android', 'com.facebook.katana', 'com.twitter.android']",54232,3
477,"['com.google.android.gm', 'com.google.android.googlequicksearchbox', 'com.facebook.katana']",54317,3
478,"['com.android.chrome', 'com.whatsapp', 'com.google.android.youtube']",54464,3
482,"['com.whatsapp', 'com.whatsapp', 'com.google.android.apps.maps']",54557,3
483,"['com.google.android.googlequicksearchbox', 'com.instagram.android', 'com.google.android.googlequicksearchbox']",54566,3
484,"['com.android.chrome', 'com.sonymobile.email', 'com.sonymobile.email']",54575,3
485,"['com.google.android.googlequicksearchbox', 'com.facebook.orca', 'com.facebook.orca']",54840,3
490,"['com.google.android.googlequicksearchbox', 'com.sonyericsson.conversations', 'com.sonyericsson.conversations']",54943,3


Since the results only show maximal patterns, it is interesting to check whether any of these non-overlapping maximal patterns occur as non-maximal frequent patterns in the other result set.

In [20]:
# Preview the percentage-only patterns with the list brackets and quotes
# removed. regex=True is passed explicitly: from pandas 2.0 the default is a
# literal match, which would leave the strings unchanged.
output[output.pattern.isin(set(output.pattern)-set(output_coverage.pattern))].pattern.str.replace(r'[\[\]\']', '', regex=True)

  output[output.pattern.isin(set(output.pattern)-set(output_coverage.pattern))].pattern.str.replace(r'[\[\]\']','')


83                                                com.facebook.orca, com.facebook.katana, com.instagram.android
94                                                      com.whatsapp, com.facebook.katana, com.snapchat.android
152                                                     com.facebook.katana, com.whatsapp, com.snapchat.android
645                          com.whatsapp, com.whatsapp, com.facebook.katana, com.facebook.katana, com.whatsapp
992                                                          com.sonymobile.email, com.google.android.apps.docs
995     com.google.android.googlequicksearchbox, com.sonyericsson.android.socialphonebook, com.android.incallui
998                                                          com.sonyericsson.music, com.google.android.youtube
999                                                                 com.instagram.android, com.android.incallui
1001                                                   com.google.android.apps.maps, com.google.android.

In [50]:
# Rows of the percentage-based result whose pattern is absent from the
# coverage-based result.
output[output['pattern'].isin(set(output['pattern']).difference(output_coverage['pattern']))]

Unnamed: 0,pattern,sup,pattern_length
83,"['com.facebook.orca', 'com.facebook.katana', 'com.instagram.android']",142916,3
94,"['com.whatsapp', 'com.facebook.katana', 'com.snapchat.android']",138458,3
152,"['com.facebook.katana', 'com.whatsapp', 'com.snapchat.android']",119715,3
645,"['com.whatsapp', 'com.whatsapp', 'com.facebook.katana', 'com.facebook.katana', 'com.whatsapp']",65805,5
992,"['com.sonymobile.email', 'com.google.android.apps.docs']",53111,2
995,"['com.google.android.googlequicksearchbox', 'com.sonyericsson.android.socialphonebook', 'com.android.incallui']",53072,3
998,"['com.sonyericsson.music', 'com.google.android.youtube']",53005,2
999,"['com.instagram.android', 'com.android.incallui']",52990,2
1001,"['com.google.android.apps.maps', 'com.google.android.youtube']",52881,2
1002,"['com.google.android.googlequicksearchbox', 'com.google.android.apps.messaging']",52845,2


In [76]:
# Turn the stringified lists into plain "app, app, ..." strings in both
# frames by removing brackets and quotes.
# regex=True is passed explicitly: from pandas 2.0 the default is a literal
# match, which would leave the strings unchanged and break the substring
# checks that follow.
output_coverage['pattern'] = output_coverage.pattern.str.replace(r'[\[\]\']', '', regex=True)
output['pattern'] = output.pattern.str.replace(r'[\[\]\']', '', regex=True)

  output_coverage['pattern']=output_coverage.pattern.str.replace(r'[\[\]\']','')
  output['pattern']=output.pattern.str.replace(r'[\[\]\']','')


In [83]:
# Patterns that are maximal only in the percentage-based result, with a flag
# ('in maximal') telling whether the pattern text occurs inside any maximal
# pattern of the coverage-based result.
# regex=False makes the lookup a literal substring match; otherwise the dots
# in the app names would be interpreted as regex wildcards.
# NOTE(review): a substring match only detects contiguous occurrences and can
# also match a prefix of a longer app name — confirm this is precise enough.
only_in_percentage = output[output.pattern.isin(set(output.pattern) - set(output_coverage.pattern))]
maximal_patterns_not_with_coverage = only_in_percentage.assign(**{
    'in maximal': only_in_percentage.pattern.apply(
        lambda l: output_coverage.pattern.str.contains(l, regex=False).any()
    )
})
maximal_patterns_not_with_coverage

Unnamed: 0,pattern,sup,pattern_length,in maximal
83,"com.facebook.orca, com.facebook.katana, com.instagram.android",142916,3,True
94,"com.whatsapp, com.facebook.katana, com.snapchat.android",138458,3,False
152,"com.facebook.katana, com.whatsapp, com.snapchat.android",119715,3,True
645,"com.whatsapp, com.whatsapp, com.facebook.katana, com.facebook.katana, com.whatsapp",65805,5,False
992,"com.sonymobile.email, com.google.android.apps.docs",53111,2,False
995,"com.google.android.googlequicksearchbox, com.sonyericsson.android.socialphonebook, com.android.incallui",53072,3,False
998,"com.sonyericsson.music, com.google.android.youtube",53005,2,False
999,"com.instagram.android, com.android.incallui",52990,2,False
1001,"com.google.android.apps.maps, com.google.android.youtube",52881,2,False
1002,"com.google.android.googlequicksearchbox, com.google.android.apps.messaging",52845,2,False


In [84]:
# Patterns that are maximal only in the coverage-based result, with a flag
# ('in maximal') telling whether the pattern text occurs inside any maximal
# pattern of the percentage-based result.
# regex=False makes the lookup a literal substring match; otherwise the dots
# in the app names would be interpreted as regex wildcards.
# NOTE(review): a substring match only detects contiguous occurrences and can
# also match a prefix of a longer app name — confirm this is precise enough.
only_in_coverage = output_coverage[output_coverage.pattern.isin(set(output_coverage.pattern) - set(output.pattern))]
maximal_patterns_not_with_percentage = only_in_coverage.assign(**{
    'in maximal': only_in_coverage.pattern.apply(
        lambda l: output.pattern.str.contains(l, regex=False).any()
    )
})
maximal_patterns_not_with_percentage

Unnamed: 0,pattern,sup,pattern_length,in maximal
264,"com.google.android.googlequicksearchbox, com.android.incallui",174048,2,False
318,"com.google.android.googlequicksearchbox, com.sonyericsson.android.socialphonebook",112945,2,True
475,"com.instagram.android, com.facebook.katana, com.twitter.android",54232,3,False
477,"com.google.android.gm, com.google.android.googlequicksearchbox, com.facebook.katana",54317,3,False
478,"com.android.chrome, com.whatsapp, com.google.android.youtube",54464,3,False
482,"com.whatsapp, com.whatsapp, com.google.android.apps.maps",54557,3,False
483,"com.google.android.googlequicksearchbox, com.instagram.android, com.google.android.googlequicksearchbox",54566,3,False
484,"com.android.chrome, com.sonymobile.email, com.sonymobile.email",54575,3,False
485,"com.google.android.googlequicksearchbox, com.facebook.orca, com.facebook.orca",54840,3,False
490,"com.google.android.googlequicksearchbox, com.sonyericsson.conversations, com.sonyericsson.conversations",54943,3,False


In [107]:
# Export the percentage-only patterns that are not contained in any
# coverage-based maximal pattern, one app per column.
# The original assigned the return value of to_csv (None) to a chained
# indexing expression, which only modified a temporary copy and raised a
# SettingWithCopyWarning. The CSV export was its only real effect and is
# kept unchanged here.
(
    maximal_patterns_not_with_coverage
    .loc[maximal_patterns_not_with_coverage['in maximal'] != True, 'pattern']
    .str.split(',', expand=True)
    .to_csv("unique_maximal_patterns_not_with_coverage.csv", index=False)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value


In [108]:
# Export the coverage-only patterns that are not contained in any
# percentage-based maximal pattern, one app per column.
# The original assigned the return value of to_csv (None) to a chained
# indexing expression, which only modified a temporary copy. The CSV export
# was its only real effect and is kept unchanged here.
(
    maximal_patterns_not_with_percentage
    .loc[maximal_patterns_not_with_percentage['in maximal'] != True, 'pattern']
    .str.split(',', expand=True)
    .to_csv("unique_maximal_patterns_not_with_percentage.csv", index=False)
)