# Libraries

In [1]:
import numpy as np
import Levenshtein
from sklearn.cluster import AgglomerativeClustering
import templater3
import re
from nltk.corpus import stopwords
import pandas as pd

# Dataset

In [2]:
line = open('simplewiki-20211120-lists-1k.tsv').readlines()[9]
lines = open('simplewiki-20211120-lists-1k.tsv').readlines()
    
page, code = line.split('\t')
rows = code.replace('\\n','\n').splitlines()
line

'Art\t* [[Modern art]]\\n* [[Abstract art]]\\n* [[Painting]]\\n* [[Sculpture]]\\n* [[Street art]]\n'

In [3]:
# Recursive function
def split_set(cells, func):
    c1 = [c for c in cells if func(c)]
    c2 = [c for c in cells if not func(c)]
    return c1, c2

In [4]:
def parse_template(template, row):
    try:
        return template.parse(row)
    except ValueError:
        pass

In [5]:
def get_pattern(rows, func, depth=0):
    # Learn template
    template = templater3.Templater(min_block_size=2)
    for row in rows:
        template.learn(row)
        
    # Parse rows using template to make columns = transposing
    parsed = parse_template(template, row)
    
    columns = list(zip(*parsed))
    
    for idx, column in enumerate(columns):
        if not any(column):
            template._template[idx*2] = ''
            continue
        
        splits = split_set(column, func)
        
        if any(not c for c in splits):
            template._template[idx*2] = ''
        try:
            template._template[idx*2] = tuple([
                get_pattern(c, func, depth=depth+1)
                for c in splits
            ])
        except IndexError:
            pass
            
    return template._template

# Automate info extract

In [6]:
# heuristic NO.1: Split every capital letters
func1 = lambda c: re.search('[A-Z]', c) is not None

# heuristic NO.2: Split every special character
func2 = lambda c: re.search(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', c) is not None

# heuristic NO.3: Split every stopword
stopwords_set = set(stopwords.words('english'))
func3 = lambda c: any(word in stopwords_set for word in c.split())

In [7]:
def split_set(cells, func):
    c1 = [c for c in cells if func(c)]
    c2 = [c for c in cells if not func(c)]
    return c1, c2

In [8]:
def parse_template(template, row):
    try:
        return template.parse(row)
    except ValueError:
        pass

In [9]:
def get_pattern(rows, func, depth=0, max_depth=5):
    if depth > max_depth:
        return []

    template = templater3.Templater(min_block_size=2)
    for row in rows:
        template.learn(row)

    parsed = [template.parse(row) for row in rows]
    columns = list(zip(*parsed))

    for idx, column in enumerate(columns):
        if not any(column):
            template._template[idx*2] = ''
            continue

        splits = split_set(column, func)

        if any(not c for c in splits):
            template._template[idx*2] = ''
        try:
            template._template[idx*2] = tuple([
                get_pattern(c, func, depth=depth+1, max_depth=max_depth)
                for c in splits
            ])
        except IndexError:
            pass

    return template._template


In [10]:
def automate(line):
    page, code = line.split('\t')
    rows = code.replace('\\n','\n').splitlines()
    
    distances = [
    [
        Levenshtein.distance(rows[i], rows[j])
        for j in range(len(rows))
    ]
    for i in range(len(rows))]
    
    total_distances = sum(sum(row) for row in distances)
    total_elements = sum(len(row) for row in distances)
    average_distance = total_distances / total_elements
    
    agglomerative = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=average_distance)
    clusters = agglomerative.fit_predict(distances)
    
    clustered_rows = {}
    for cluster_id, row in zip(clusters, rows):
        clustered_rows.setdefault(cluster_id, []).append( row )
        
    clean = []
    for k, v in clustered_rows.items():
        if len(v) > 1:
            clean += v
    
    
    results = [get_pattern(rows, func1, max_depth=5), 
          get_pattern(rows, func2, max_depth=5), 
          get_pattern(rows, func3, max_depth=5), 
          get_pattern(clean, func1, max_depth=5), 
          get_pattern(clean, func2, max_depth=5), get_pattern(clean, func3, max_depth=5)]
    result_series = pd.Series(results, index=['N_cap', 'N_special', 'N_stop', 'C_cap', 'C_special', 'C_stop'])

    return result_series

# Issue with executing automation

In [11]:
# Create an empty dataframe with the desired column names
results_df = pd.DataFrame(columns=['N_cap', 'N_special', 'N_stop', 'C_cap', 'C_special', 'C_stop'])


In [12]:
for line in lines:
    result_series = automate(line)
    results_df = results_df.append(result_series, ignore_index=True)

AttributeError: 'DataFrame' object has no attribute 'append'

In [13]:
import pandas as pd
print(pd.__version__)

2.0.2


In [None]:
for line in lines:
    automate(line)