# Find thresholds for association rules

Instructions: 

* Replace the values of `proj_name` and `proj_data_folder` in the Configuration section
* [optional] Replace the cloud drive folder (`GDRIVE_FOLDER`) in the Configuration section

## Configuration

In [1]:
# Folder below "My Drive" on Google Drive that holds the project; the drive
# mount cell changes the working directory to it.
GDRIVE_FOLDER = "callgraphCA/codeChangeInCG"

# Name of the analysed project (replace 'PROJ_NAME'); it is the prefix of the
# analytics database file name.
proj_name = "glucosio-android"
# Folder with the project's data files. It is concatenated directly with the
# file name, so keep the trailing slash.
proj_data_folder = "./ex_ubuntu/"

In [2]:
import os

from google.colab import drive

# Make Google Drive available inside the Colab runtime.
drive.mount('/gdrive')

# Work from the project's folder on the mounted drive so that the relative
# paths used in this notebook resolve against it.
drive_folder = f'/gdrive/My Drive/{GDRIVE_FOLDER}'
os.chdir(drive_folder)
print(os.getcwd())

Mounted at /gdrive
/gdrive/My Drive/callgraphCA/codeChangeInCG


## Imports

In [3]:
!pip install mlxtend

#!pip install python-stopwatch
# https://github.com/rasbt/mlxtend
# http://rasbt.github.io/mlxtend/



In [4]:
import pandas as pd
import sqlite3

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

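# NOTE(review): the remarks and the parameter list below do not match the
# signature of mlxtend's `apriori` imported above; presumably they describe a
# different apriori implementation -- confirm or remove.
# That API works with lists, not pandas, no nan values, apostrophe between values of transaction

# Parameters: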
# transactions: typing.Iterable[typing.Union[set, tuple, list]],
# min_support: float = 0.5,
# min_confidence: float = 0.5,
# max_length: int = 8,
# verbosity: int = 0,
# output_transaction_ids: bool = False,

In [5]:
# callgraphCA libraries
from analytics.association_rules_thresholds import *

In [25]:
# Pick up edits made to the callgraphCA helper module on disk without
# restarting the notebook runtime.
import importlib
import analytics.association_rules_thresholds

# reload() re-executes the already imported module ...
importlib.reload(analytics.association_rules_thresholds)
# ... and the star import rebinds the notebook's names to the reloaded definitions.
from analytics.association_rules_thresholds import *

## Database connections

In [9]:
import os

# Path of the project's analytics SQLite database:
# <proj_data_folder><proj_name>_analytics.db
ANALYTICS_DB_PATH = f'{proj_data_folder}{proj_name}_analytics.db'
print(ANALYTICS_DB_PATH)

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, so stop here with a clear message instead of failing later on
# queries against missing tables.
if not os.path.exists(ANALYTICS_DB_PATH):
    raise FileNotFoundError(
        f'Analytics database not found: {ANALYTICS_DB_PATH} '
        f'(current folder: {os.getcwd()})')
con_analytics_db = sqlite3.connect(ANALYTICS_DB_PATH)

./ex_ubuntu/glucosio-android_analytics.db


In [None]:
#cur = con_analytics_db.cursor()

## On commit and file level

In [26]:
# Count the rules per threshold on commit/file level, relying on the
# function's default sweep settings; print one result row per line.
rt = get_rules_by_threshold_on_commit_and_file(con_analytics_db)
for row in rt:
    print(row)

[0.5, 0]
[0.46, 0]
[0.42, 0]
[0.38, 0]
[0.33999999999999997, 0]
[0.3, 0]
[0.26, 1]
[0.21999999999999997, 1]
[0.18, 1]
[0.14, 3]
[0.1, 5]


In [27]:
# Count the rules per threshold on commit/file level, this time with
# explicitly chosen (non-default) sweep settings; print one row per line.
rt = get_rules_by_threshold_on_commit_and_file(
    con_analytics_db,
    min_t=0.04,
    max_t=0.5,
    n=11,
    max_nr_rules=250,
)
for row in rt:
    print(row)

[0.5, 0]
[0.454, 0]
[0.40800000000000003, 0]
[0.362, 0]
[0.316, 0]
[0.27, 1]
[0.22399999999999998, 1]
[0.178, 1]
[0.132, 3]
[0.08600000000000002, 6]
[0.04, 24]


## Extended: On month and file level

In [22]:
# Build the transactions for mlxtend's apriori: one transaction per calendar
# month, holding the names of the files changed in that month.
# The query returns one row per (file, year, month) with the change count.
sql_statement = """select
        file_name,
        strftime('%Y', date(commit_commiter_datetime)) as iso_yr,
        strftime('%m', date(commit_commiter_datetime)) as iso_month,
        count(*) as changes_in_month
        from file_commit
        group by 
        file_name,
        strftime('%Y', date(commit_commiter_datetime)),
        strftime('%m', date(commit_commiter_datetime))"""
dfsql = pd.read_sql_query(sql_statement, con_analytics_db)
# Build the '<year>-<month>' key column-wise instead of with a per-row apply:
# same str() conversion per value, but faster and also well-defined when the
# query returns no rows.
dfsql['yr_m'] = dfsql['iso_yr'].map(str) + '-' + dfsql['iso_month'].map(str)
# Series indexed by month key; each value is the list of file names of that month.
df_hash = dfsql.groupby('yr_m')['file_name'].apply(list)
print(df_hash.head(3))

yr_m
2015-08    [ApplicationTest.java, DatabaseHandler.java, D...
2015-09    [DatabaseHandler.java, FormatDateTime.java, Gi...
2015-10    [ActionTip.java, AssistantAdapter.java, Assist...
Name: file_name, dtype: object


In [23]:
# Generate the sparse one-hot matrix of the monthly transactions.
te = TransactionEncoder()
te.fit(df_hash)
oht_ary = te.transform(df_hash, sparse=True)
# Wrap the sparse matrix in a pandas sparse DataFrame, labelling the columns
# with the items (file names) the encoder learned.
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
#sparse_df

In [28]:
# Count the rules per threshold for the month/file transactions encoded in
# sparse_df; print one result row per line.
rt = get_rules_by_threshold(
    sparse_df,
    min_t=0.04,
    max_t=0.5,
    n=11,
    max_nr_rules=250,
)
for row in rt:
    print(row)

[0.5, 18]
[0.454, 31]
[0.40800000000000003, 51]
[0.362, 87]
[0.316, 439]
