Our first step is to download a piece of text from Wikipedia and to parse paragraphs.

In [2]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia article. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops here on
# an HTTP error instead of silently parsing an error page.
respond = requests.get("https://en.wikipedia.org/wiki/Poznań", timeout=30)
respond.raise_for_status()

# Parse the HTML and collect every <p> element.
soup = BeautifulSoup(respond.text, "lxml")
page = soup.find_all('p')

# Keep only the text content of each paragraph.
raw_text = [paragraph.text for paragraph in page]

print(raw_text)

['\n', "Poznań (Polish:\xa0[ˈpɔznaɲ] (listen))[a] is a city on the River Warta in west-central Poland, within the Greater Poland region. The city is an important cultural and business centre, and one of Poland's most populous regions with many regional customs such as Saint John's Fair (Jarmark Świętojański), traditional Saint Martin's croissants and a local dialect. Among its most important heritage sites are the Renaissance Old Town, Town Hall and Gothic Cathedral.\n", "Poznań is the fifth-largest and one of the oldest cities in Poland. As of 2021, the city's population is 529,410, while the Poznań metropolitan area (Metropolia Poznań) comprising Poznań County and several other communities is inhabited by over 1.1\xa0million people.[2] It is one of four historical capitals of medieval Poland and the ancient capital of the Greater Poland region, currently the administrative capital of the province called Greater Poland Voivodeship.\n", "Poznań is a center of trade, sports, education, 

Next, we will split each paragraph into words and remove the paragraphs with fewer than 3 words.

In [3]:
# Tokenise each paragraph on whitespace and keep only paragraphs that have at
# least 3 words. len() is taken on the token list, not on the raw string, so
# it counts words rather than characters.
text = [
    tokens
    for tokens in (paragraph.split() for paragraph in raw_text)
    if len(tokens) >= 3
]

# Show the first ten tokenised paragraphs.
for line in text[:10]:
    print(line)

['Poznań', '(Polish:', '[ˈpɔznaɲ]', '(listen))[a]', 'is', 'a', 'city', 'on', 'the', 'River', 'Warta', 'in', 'west-central', 'Poland,', 'within', 'the', 'Greater', 'Poland', 'region.', 'The', 'city', 'is', 'an', 'important', 'cultural', 'and', 'business', 'centre,', 'and', 'one', 'of', "Poland's", 'most', 'populous', 'regions', 'with', 'many', 'regional', 'customs', 'such', 'as', 'Saint', "John's", 'Fair', '(Jarmark', 'Świętojański),', 'traditional', 'Saint', "Martin's", 'croissants', 'and', 'a', 'local', 'dialect.', 'Among', 'its', 'most', 'important', 'heritage', 'sites', 'are', 'the', 'Renaissance', 'Old', 'Town,', 'Town', 'Hall', 'and', 'Gothic', 'Cathedral.']
['Poznań', 'is', 'the', 'fifth-largest', 'and', 'one', 'of', 'the', 'oldest', 'cities', 'in', 'Poland.', 'As', 'of', '2021,', 'the', "city's", 'population', 'is', '529,410,', 'while', 'the', 'Poznań', 'metropolitan', 'area', '(Metropolia', 'Poznań)', 'comprising', 'Poznań', 'County', 'and', 'several', 'other', 'communities', '

Our text still contains a lot of stop-words and some additional tokens such as 1.2, [2], etc. We will use the `nltk` library to remove the stop-words, and we will keep only purely alphabetic tokens, converted to lower case.

In [5]:
import nltk

# Download the NLTK 'stopwords' corpus used for stop-word removal below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mateusz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
from nltk.corpus import stopwords

# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# Build the stop-word set once: the call is loop-invariant, so there is no
# need to re-evaluate it for every token, and a set gives constant-time
# membership tests.
english_stopwords = set(stopwords.words('english'))

# Lower-case every token, keeping only purely alphabetic tokens that are not
# stop-words. Tokens containing any non-letter character (e.g. 'region.',
# '529,410,') fail str.isalpha() and are dropped entirely.
clean_text = [
    [
        word.lower()
        for word in line
        if word.isalpha() and word.lower() not in english_stopwords
    ]
    for line in text
]

# Show the first ten cleaned paragraphs.
for line in clean_text[:10]:
    print(line)

['poznań', 'city', 'river', 'warta', 'within', 'greater', 'poland', 'city', 'important', 'cultural', 'business', 'one', 'populous', 'regions', 'many', 'regional', 'customs', 'saint', 'fair', 'traditional', 'saint', 'croissants', 'local', 'among', 'important', 'heritage', 'sites', 'renaissance', 'old', 'town', 'hall', 'gothic']
['poznań', 'one', 'oldest', 'cities', 'population', 'poznań', 'metropolitan', 'area', 'comprising', 'poznań', 'county', 'several', 'communities', 'inhabited', 'million', 'one', 'four', 'historical', 'capitals', 'medieval', 'poland', 'ancient', 'capital', 'greater', 'poland', 'currently', 'administrative', 'capital', 'province', 'called', 'greater', 'poland']
['poznań', 'center', 'technology', 'important', 'academic', 'students', 'adam', 'mickiewicz', 'third', 'largest', 'polish', 'city', 'serves', 'seat', 'oldest', 'polish', 'one', 'populous', 'catholic', 'archdioceses', 'city', 'also', 'hosts', 'poznań', 'international', 'fair', 'biggest', 'industrial', 'fair', 

Now we are ready to transform the list of lists into the format suitable for association rule mining, i.e., to transform the input lists into boolean flags.

In [8]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Fit the encoder on the token lists and encode them in a single step.
te = TransactionEncoder()
te_array = te.fit_transform(clean_text)

In [9]:
# te_array is the boolean encoding of clean_text produced by the TransactionEncoder

te_array

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [10]:
# Dimensions of the encoded array as (rows, columns).
te_array.shape

(92, 1343)

In [11]:
# The fitted encoder keeps the original tokens in its columns_ attribute;
# show the first ten of them.

te.columns_[:10]

['ab',
 'academic',
 'academy',
 'access',
 'according',
 'accounted',
 'achieving',
 'acoustics',
 'acquire',
 'acquired']

The `mlxtend` package expects the input data to be stored as a `pandas.DataFrame`.

In [12]:
# Wrap the boolean array in a DataFrame whose column labels are the tokens
# stored on the encoder.
df = pd.DataFrame(data=te_array, columns=te.columns_)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ab,academic,academy,access,according,accounted,achieving,acoustics,acquire,acquired,...,zones,zoo,zoological,łacina,ławica,łazarz,łódż,śródka,święty,żabikowo
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


Now we are ready to find frequent collections of words.

In [13]:
# Mine frequent itemsets with apriori. min_support=0.05 is the minimum support
# threshold; use_colnames=True reports itemsets by column (token) name.
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.054348,(adam)
1,0.271739,(also)
2,0.065217,(although)
3,0.086957,(among)
4,0.119565,(area)
...,...,...
234,0.076087,"(polish, city, also, poznań)"
235,0.054348,"(polish, many, poznań, also)"
236,0.054348,"(poland, city, poznań, capital)"
237,0.065217,"(capital, poznań, greater, poland)"


We can also mine association rules, which come with additional measures of quality and interestingness.

In [14]:
from mlxtend.frequent_patterns import association_rules

# IPython-only help syntax: displays the signature and docstring of association_rules.
?association_rules

Signature:
association_rules(
    df,
    metric='confidence',
    min_threshold=0.8,
    support_only=False,
)
Docstring:
Generates a DataFrame of association rules including the
metrics 'score', 'confidence', and 'lift'

Parameters
-----------
df : pandas DataFrame
  pandas DataFrame of frequent itemsets
  with columns ['support', 'itemsets']

metric : string (default: 'confidence')
  Metric to evaluate if a rule is of interest.
  **Automatically set to 'support' if `support_only=True`.**
  Otherwise, supported metrics are 'support', 'confidence', 'lift',
  'leverage', 'conviction' and 'zhangs_metric'
  These metrics are computed as follows:

  - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]

  - con

In [15]:
from mlxtend.frequent_patterns import association_rules

# Generate rules from the frequent itemsets, using confidence as the metric
# with a minimum threshold of 0.7.
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(mickiewicz),(adam),0.054348,0.054348,0.054348,1.000000,18.400000,0.051394,inf,1.000000
1,(adam),(mickiewicz),0.054348,0.054348,0.054348,1.000000,18.400000,0.051394,inf,1.000000
2,(adam),(poznań),0.054348,0.641304,0.054348,1.000000,1.559322,0.019494,inf,0.379310
3,(also),(poznań),0.271739,0.641304,0.206522,0.760000,1.185085,0.032254,1.494565,0.214454
4,(although),(poznań),0.065217,0.641304,0.054348,0.833333,1.299435,0.012524,2.152174,0.246512
...,...,...,...,...,...,...,...,...,...,...
150,"(capital, greater)","(poland, poznań)",0.065217,0.173913,0.065217,1.000000,5.750000,0.053875,inf,0.883721
151,"(capital, poland)","(poznań, greater)",0.086957,0.086957,0.065217,0.750000,8.625000,0.057656,3.652174,0.968254
152,"(poznań, greater)","(capital, poland)",0.086957,0.086957,0.065217,0.750000,8.625000,0.057656,3.652174,0.968254
153,"(poland, greater)","(capital, poznań)",0.086957,0.119565,0.065217,0.750000,6.272727,0.054820,3.521739,0.920635


In [16]:
# Same itemsets, but rules are now selected by lift with a minimum threshold of 5.0.
association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=5.0,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(mickiewicz),(adam),0.054348,0.054348,0.054348,1.000000,18.400000,0.051394,inf,1.000000
1,(adam),(mickiewicz),0.054348,0.054348,0.054348,1.000000,18.400000,0.051394,inf,1.000000
2,(capital),(greater),0.119565,0.097826,0.065217,0.545455,5.575758,0.053521,1.984783,0.932099
3,(greater),(capital),0.097826,0.119565,0.065217,0.666667,5.575758,0.053521,2.641304,0.909639
4,(population),(german),0.108696,0.086957,0.054348,0.500000,5.750000,0.044896,1.826087,0.926829
...,...,...,...,...,...,...,...,...,...,...
57,"(poland, poznań)","(capital, greater)",0.173913,0.065217,0.065217,0.375000,5.750000,0.053875,1.495652,1.000000
58,"(poland, greater)","(capital, poznań)",0.086957,0.119565,0.065217,0.750000,6.272727,0.054820,3.521739,0.920635
59,(capital),"(poland, poznań, greater)",0.119565,0.086957,0.065217,0.545455,6.272727,0.054820,2.008696,0.954733
60,(greater),"(capital, poznań, poland)",0.097826,0.086957,0.065217,0.666667,7.666667,0.056711,2.739130,0.963855


Both frequent itemsets and association rules (antecedents and consequents) are returned as `frozenset`s, so we can use [standard API calls](https://docs.python.org/3/library/stdtypes.html#set-types-set-frozenset) to find subsets, supersets, etc.

In [22]:
# Generate rules using confidence as the metric with a minimum threshold of 0.5.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Boolean mask of the rules whose antecedent contains the token 'warta'.
# Antecedents are frozensets, so a plain membership test is enough.
warta_idx = rules['antecedents'].apply(lambda antecedent: 'warta' in antecedent)
# The previous name is kept as an alias in case other cells refer to it.
capital_idx = warta_idx
rules[warta_idx]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
52,(warta),(city),0.097826,0.445652,0.054348,0.555556,1.246612,0.010751,1.247283,0.219277
105,(warta),(poznań),0.097826,0.641304,0.076087,0.777778,1.212806,0.013351,1.61413,0.194492
109,(warta),(river),0.097826,0.065217,0.054348,0.555556,8.518519,0.047968,2.103261,0.978313
196,"(city, warta)",(river),0.054348,0.065217,0.054348,1.0,15.333333,0.050803,inf,0.988506
197,"(river, warta)",(city),0.054348,0.445652,0.054348,1.0,2.243902,0.030128,inf,0.586207
199,(warta),"(city, river)",0.097826,0.065217,0.054348,0.555556,8.518519,0.047968,2.103261,0.978313
