## Problem Statement

List and describe some other interesting features of this data set.

In [109]:
import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

Read the word difficulty classification data and break down words into characters.

In [110]:
# Load columns A-C of the workbook; column A becomes the row index.
df = pd.read_excel("./dataset/difficulty_classification.xlsx", index_col=0, usecols="A,B,C")

# Split every word into its letters, one letter per column. Exactly five column
# names are supplied, so construction fails unless every word has five characters.
char_columns = ["ch1", "ch2", "ch3", "ch4", "ch5"]
letters_per_word = df["word"].apply(list).tolist()
df_chars = pd.DataFrame(letters_per_word, index=df.index, columns=char_columns)

# Attach the letter columns (aligned on the shared index), then drop the raw word.
df[char_columns] = df_chars
df = df.drop(columns="word")

Treat each row as a transaction and one-hot encode it into a boolean table with one column per distinct item.

In [111]:
# Build the transaction table: each row of `df` becomes one transaction whose
# items are the row's cell values.
# Take an explicit object-dtype copy rather than `df.values`: `.values` may hand
# back a view of the frame's data, in which case the in-place write below would
# either leak into `df` or fail on a read-only array.
df_vals = df.to_numpy(dtype=object, copy=True)

# Cast the first column to text so that every item in a transaction is a string
# (later cells call `.isdigit()` on the items).
df_vals[:, 0] = df_vals[:, 0].astype("str")

# One-hot encode the transactions; `encoder.columns_` holds the distinct items
# and supplies the column labels of the encoded frame.
encoder = TransactionEncoder()
encoder_arr = encoder.fit(df_vals).transform(df_vals)
df_arr = pd.DataFrame(encoder_arr, columns=encoder.columns_)

Use the Apriori algorithm (minimum support 0.04) to discover frequent itemsets of size 3 that contain a numeric label, i.e. the label together with two letters:

In [114]:
# Mine every itemset that occurs in at least 4% of the transactions.
itemsets = apriori(df_arr, min_support=0.04, use_colnames=True)
itemsets["length"] = itemsets["itemsets"].map(len)

# Keep the itemsets of size 3 and print those containing at least one
# digit-only item (i.e. a numeric label alongside the letters).
size3_itemsets = itemsets.loc[itemsets["length"] == 3, "itemsets"]
for itemset in size3_itemsets:
    if any(item.isdigit() for item in itemset):
        print(list(itemset))

['0', 'e', 'a']
['0', 'r', 'a']
['0', 'r', 'e']
['1', 'e', 't']
['4', 'e', 'a']
['s', '4', 'e']
['4', 'e', 't']


Use the Apriori algorithm (minimum support 0.02) to discover frequent itemsets of size 4 that contain a numeric label, i.e. the label together with three letters:

In [115]:
# Mine every itemset that occurs in at least 2% of the transactions; the lower
# threshold (compared with the size-3 search) lets larger itemsets qualify.
itemsets = apriori(df_arr, min_support=0.02, use_colnames=True)
itemsets["length"] = itemsets["itemsets"].map(len)

# Keep the itemsets of size 4 and print those containing at least one
# digit-only item (i.e. a numeric label alongside the letters).
size4_itemsets = itemsets.loc[itemsets["length"] == 4, "itemsets"]
for itemset in size4_itemsets:
    if any(item.isdigit() for item in itemset):
        print(list(itemset))

['0', 'r', 'e', 'a']
