# Find thresholds for association rules

Instructions: 

Depending on whether you run the notebook locally or from a cloud drive:
* Replace `proj_name` and `proj_data_folder` in the Configuration section
* Replace the cloud drive folder (`GDRIVE_FOLDER`) in the Configuration section

## Configuration

In [14]:
# Project to analyse; other values used so far: 'PX4-Autopilot', 'PROJ_NAME'.
proj_name = 'glucosio-android'
# Folder with this project's generated results, relative to the working directory.
proj_data_folder = f'../project_results/{proj_name}/'

### If you run this notebook in Google Colaboratory, configure this block.
You will have to copy the generated database and the "notebooks" and "analytics" folders to the drive folder configured below.

In [None]:
from google.colab import drive
import os

# Folder below "My Drive" that contains the project's files.
GDRIVE_FOLDER = 'CCSD/codeChangeInCG'

# Mount Google Drive, then make the project's folder the working directory
# and print it as a visual check.
drive.mount('/gdrive')
drive_folder = f'/gdrive/My Drive/{GDRIVE_FOLDER}'
os.chdir(drive_folder)
print(os.getcwd())

## Imports

In [2]:
!pip install mlxtend

#!pip install python-stopwatch
# https://github.com/rasbt/mlxtend
# http://rasbt.github.io/mlxtend/



In [6]:
import sys
import os
import pandas as pd
import sqlite3
from pathlib import Path

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

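# NOTE(review): the notes and parameter list below do not match mlxtend's apriori
# (imported above); they look like the signature of efficient_apriori's apriori.
# Confirm which implementation they are meant to describe.
# Works with lists, not pandas; no NaN values; apostrophe between the values of a transaction.

# Parameters:
# transactions: typing.Iterable[typing.Union[set, tuple, list]],
# min_support: float = 0.5,
# min_confidence: float = 0.5,
# max_length: int = 8,
# verbosity: int = 0,
# output_transaction_ids: bool = False,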

In [4]:
# CCSD libraries
# The helper modules are loaded from the "analytics" folder that sits beside
# the current working directory.
analytics_folder_path = str(Path.cwd().parents[0] / "analytics")
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if analytics_folder_path not in sys.path:
    sys.path.append(analytics_folder_path)

from association_rules_thresholds import *

In [None]:
# Reloads
# Run this cell to re-import association_rules_thresholds within the running kernel.
import importlib
import association_rules_thresholds

# Reload the module object first ...
importlib.reload(association_rules_thresholds)
# ... then repeat the star import so the notebook's names are rebound from the
# reloaded module.
from association_rules_thresholds import *

## Database connections

In [15]:
# Path of the project's analytics database: <proj_data_folder><proj_name>_analytics.db
ANALYTICS_DB_PATH = proj_data_folder + proj_name + '_analytics.db'
print(ANALYTICS_DB_PATH)
print(os.path.isfile(ANALYTICS_DB_PATH))
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, so stop here instead of failing later on a missing table.
if not os.path.isfile(ANALYTICS_DB_PATH):
    raise FileNotFoundError(f'Analytics database not found: {ANALYTICS_DB_PATH}')
con_analytics_db = sqlite3.connect(ANALYTICS_DB_PATH)

../project_results/glucosio-android/glucosio-android_analytics.db
True


## On commit and file level

In [16]:
# Number of rules per threshold on commit/file level, using the helper's
# default settings. Each entry is printed on its own line.
rt = get_rules_by_threshold_on_commit_and_file(con_analytics_db)
for threshold_row in rt:
    print(threshold_row)

[0.5, 1]
[0.46, 1]
[0.42, 1]
[0.38, 1]
[0.33999999999999997, 1]
[0.3, 1]
[0.26, 34]
[0.21999999999999997, 34]
[0.18, 34]
[0.14, 669]


In [17]:
# Number of rules per threshold on commit/file level, this time with explicit
# (non-default) settings. Each entry is printed on its own line.
rt = get_rules_by_threshold_on_commit_and_file(
    con_analytics_db,
    min_t=0.04,
    max_t=0.5,
    n=11,
    max_nr_rules=250,
)
for threshold_row in rt:
    print(threshold_row)

[0.5, 1]
[0.454, 1]
[0.40800000000000003, 1]
[0.362, 1]
[0.316, 1]
[0.27, 34]
[0.22399999999999998, 34]
[0.178, 34]
[0.132, 669]


## Extended: On month and file level

In [19]:
# Input for mlxtend's apriori: one "transaction" per calendar month, holding
# the names of the files changed in that month.
# The query returns one row per (file, year, month) with the number of changes.
sql_statement = """select
        file_name,
        strftime('%Y', date(commit_commiter_datetime)) as iso_yr,
        strftime('%m', date(commit_commiter_datetime)) as iso_month,
        count(*) as changes_in_month
        from file_commit
        group by 
        file_name,
        strftime('%Y', date(commit_commiter_datetime)),
        strftime('%m', date(commit_commiter_datetime))"""
dfsql = pd.read_sql_query(sql_statement, con_analytics_db)
# Build the 'YYYY-MM' key with element-wise string concatenation instead of a
# row-wise apply(axis=1); this also works when the query returns no rows.
dfsql['yr_m'] = dfsql['iso_yr'].map(str) + '-' + dfsql['iso_month'].map(str)
# Series indexed by 'YYYY-MM' whose values are the lists of changed file names.
df_hash = dfsql.groupby('yr_m')['file_name'].apply(list)
print(df_hash.head(5))

yr_m
2018-06    [AddA1CActivity.java, AddCholesterolActivity.j...
2018-07    [A1cCalculatorActivity.java, AddGlucoseActivit...
Name: file_name, dtype: object


In [20]:
# Generate the sparse one-hot matrix: fit the encoder on the monthly
# transactions, then encode them in sparse form.
te = TransactionEncoder()
te.fit(df_hash)
oht_ary = te.transform(df_hash, sparse=True)
# Wrap the sparse matrix in a DataFrame with one column per encoder column.
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)

In [21]:
# Number of rules per threshold on month/file level, computed from the sparse
# one-hot frame. Each entry is printed on its own line.
rt = get_rules_by_threshold(
    sparse_df,
    min_t=0.2,
    max_t=0.3,
    n=11,
    max_nr_rules=250,
)
for threshold_row in rt:
    print(threshold_row)

[0.3, 4215]
