In [None]:
%run dataFormating.ipynb

# What subsets of scientific questions tend to be answered correctly by the same subjects?

## Mining

In [None]:
import os

from orangecontrib.associate.fpgrowth import *
import pandas as pd
from numpy import *

In [None]:
questions = correctedScientific.columns
# Build one transaction per row of correctedScientific (presumably one row per
# subject -- confirm against dataFormating.ipynb): the labels of the columns
# whose cell is truthy, kept in column order.
# Rows are walked positionally, so this does not require the index to be exactly
# 0..n-1 and avoids one scalar `.loc` lookup per cell.
correctedScientificText = [
    [question for question, isCorrect in zip(questions, answers) if isCorrect]
    for answers in correctedScientific.itertuples(index=False, name=None)
]

In [None]:
# Get frequent itemsets supported by at least 25 % of the transactions
# run time < 1 min
# int() truncates exactly like floor for this non-negative product and returns a
# plain Python int (an absolute transaction count), without relying on a `math`
# name leaking out of `from numpy import *` (that alias was removed in NumPy 2.0).
minSupportCount = int(len(correctedScientificText) * 0.25)
# frequent_itemsets yields its results lazily; keep them in a dict so that
# re-running a later cell does not find an already exhausted generator.
itemsets = dict(frequent_itemsets(correctedScientificText, minSupportCount))

In [None]:
# Generate rules from the frequent itemsets, using 0.85 (85 %) as confidence threshold
# run time < 5 min
# association_rules yields its results lazily; materialise them so that `rules`
# can be read more than once (e.g. when the DataFrame cell is re-run).
rules = list(association_rules(dict(itemsets), 0.85))

In [None]:
# Transform the mined rules into a DataFrame, one row per
# (antecedent, consequent, support, confidence) tuple.
# Column names are given at construction so they exist even when no rule was
# mined; renaming positional columns 0..3 afterwards is a no-op on an empty
# frame and makes every later column lookup fail.
# NOTE: the "antecedants" spelling is kept on purpose, later cells use that label.
rulesDataframe = pd.DataFrame(
    list(rules),
    columns=["antecedants", "consequents", "support", "confidence"],
)
rulesDataframe.head()

In [None]:
# Save the mined rules to file
# Create the output folder first so that a missing "results" directory does not
# make the export fail after the (slow) mining cells have already run.
os.makedirs("results", exist_ok=True)
rulesDataframe.to_csv("results/associationRulesMiningSupport25percentsConfidence85percents.csv")

## Search for interesting rules
Interesting rules are more likely to be the ones with the highest confidence, the highest lift, or a larger consequent set. Pairs (rules with a single antecedent and a single consequent) can also be especially interesting.

In [None]:
# Rank the rules: highest confidence first, ties broken by highest support
confidenceSortedRules = rulesDataframe.sort_values(["confidence", "support"], ascending=False)
confidenceSortedRules.head(50)

In [None]:
# Record how many items each rule's consequent holds, then rank the rules by
# that size (largest first), breaking ties by confidence and then by support,
# both descending
rulesDataframe["consequentSize"] = rulesDataframe["consequents"].apply(len)
consequentSortedRules = rulesDataframe.sort_values(
    ["consequentSize", "confidence", "support"],
    ascending=False,
)
consequentSortedRules.head(50)

In [None]:
# Select only pairs (rules with antecedent and consequent of size one)
# Sort pairs according to confidence, then support (both descending)
# "fusedRule" is the union of both sides of a rule; it is built with a plain
# zip rather than a row-wise DataFrame.apply, which is slower and does not
# return a column-shaped result when the frame has no rows.
rulesDataframe["fusedRule"] = pd.Series(
    [
        frozenset().union(antecedent, consequent)
        for antecedent, consequent in zip(rulesDataframe["antecedants"], rulesDataframe["consequents"])
    ],
    index=rulesDataframe.index,
    dtype=object,
)
rulesDataframe["ruleSize"] = rulesDataframe["fusedRule"].apply(len)
# Keep only the rules whose fused item set has exactly two items. Merely sorting
# by ruleSize (as before) left every larger rule in pairRules, so head(30) showed
# non-pair rules whenever fewer than 30 pairs had been mined.
pairRules = (
    rulesDataframe[rulesDataframe["ruleSize"] == 2]
    .sort_values(by=["confidence", "support"], ascending=False)
)
pairRules.head(30)

In [None]:
# Sort questions by number of occurrences in consequents
questions = list(correctedScientific.columns)
# One 0/1 flag column per question, named "<question>c": 1 when the question is
# part of the rule's consequent
consequentFlagColumns = [q + "c" for q in questions]
for q, flagColumn in zip(questions, consequentFlagColumns):
    rulesDataframe[flagColumn] = rulesDataframe["consequents"].apply(
        lambda items, q=q: 1 if q in items else 0
    )
# Sum exactly the flag columns created above instead of slicing "Q1c":"Q27c",
# which silently assumes the questions are named Q1..Q27 and that their flag
# columns sit contiguously, in that order, in the frame
occurenceInConsequents = rulesDataframe[consequentFlagColumns].sum(axis=0).sort_values(ascending=False)
occurenceInConsequents

In [None]:
# Sort questions by number of occurrences in antecedents
# One 0/1 flag column per question, named "<question>a": 1 when the question is
# part of the rule's antecedent (`questions` comes from an earlier cell)
antecedentFlagColumns = [q + "a" for q in questions]
for q, flagColumn in zip(questions, antecedentFlagColumns):
    rulesDataframe[flagColumn] = rulesDataframe["antecedants"].apply(
        lambda items, q=q: 1 if q in items else 0
    )
# Sum exactly the flag columns created above instead of slicing "Q1a":"Q27a",
# which silently assumes the questions are named Q1..Q27 and that their flag
# columns sit contiguously, in that order, in the frame
occurenceInAntecedants = rulesDataframe[antecedentFlagColumns].sum(axis=0).sort_values(ascending=False)
occurenceInAntecedants