# Libraries

In [1]:
import numpy as np
import Levenshtein
from sklearn.cluster import AgglomerativeClustering
import templater3
import re
from nltk.corpus import stopwords
import pandas as pd

# Dataset

In [2]:
line = open('simplewiki-20211120-lists-1k.tsv').readlines()[0]
lines = open('simplewiki-20211120-lists-1k.tsv').readlines()
    
page, code = line.split('\t')
rows = code.replace('\\n','\n').splitlines()


## Attempt to delete things in dataset to fix error

# Data Preperation
## Clustering

In [3]:
# Levenshtein distance between cross-product combination of items in row
distances = [
    [
        Levenshtein.distance(rows[i], rows[j])
        for j in range(len(rows))
    ]
    for i in range(len(rows))
]

In [4]:
# average distance of distances and use that as threshold
total_distances = sum(sum(row) for row in distances)
total_elements = sum(len(row) for row in distances)
average_distance = total_distances / total_elements

In [5]:
agg = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=average_distance)
clusters = agg.fit_predict(distances)

In [6]:
cluster_rows = {}
for cluster_id, row in zip(clusters, rows):
    cluster_rows.setdefault(cluster_id, []).append( row )

In [7]:
cls = []
for k, v in cluster_rows.items():
    if len(v) > 1:
        cls += v

## Two datasets

In [8]:
# clustered dataset
clean = [item.replace('\\n', '\n') for item in cls]

In [9]:
# without clustering
norm = code.replace('\\n', '\n').splitlines()

# Info extracting

In [10]:
# Recursive function
def split_set(cells, func):
    c1 = [c for c in cells if func(c)]
    c2 = [c for c in cells if not func(c)]
    return c1, c2

In [11]:
def parse_template(template, row):
    try:
        return template.parse(row)
    except ValueError:
        pass

In [12]:
def get_pattern(rows, func, depth=0):
    # Learn template
    template = templater3.Templater(min_block_size=2)
    for row in rows:
        template.learn(row)
        
    # Parse rows using template to make columns = transposing
    parsed = parse_template(template, row)
    
    columns = list(zip(*parsed))
    
    for idx, column in enumerate(columns):
        if not any(column):
            template._template[idx*2] = ''
            continue
        
        splits = split_set(column, func)
        
        if any(not c for c in splits):
            template._template[idx*2] = ''
        try:
            template._template[idx*2] = tuple([
                get_pattern(c, func, depth=depth+1)
                for c in splits
            ])
        except IndexError:
            pass
            
    return template._template

## Heuristics

In [13]:
# heuristic NO.1: Split every capital letters
func1 = lambda c: re.search('[A-Z]', c) is not None

# heuristic NO.2: Split every special character
func2 = lambda c: re.search(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', c) is not None

# heuristic NO.3: Split every stopword
stopwords_set = set(stopwords.words('english'))
func3 = lambda c: any(word in stopwords_set for word in c.split())

## Check outputs

In [14]:
get_pattern(norm, func1)

[None, '* [[April ', None, ']] - ', None]

In [15]:
get_pattern(norm, func2)

[None, '* [[April ', None, ']] - ', None]

In [16]:
get_pattern(norm, func3)

[None, '* [[April ', None, ']] - ', None]

In [17]:
get_pattern(clean, func1)

[None, '* [[April ', None, ']] - ', None]

In [18]:
get_pattern(clean, func2)

[None, '* [[April ', None, ']] - ', None]

In [19]:
get_pattern(clean, func3)

[None, '* [[April ', None, ']] - ', None]

# Automate info extract

In [20]:
def automate(line):
    page, code = line.split('\t')
    rows = code.replace('\\n','\n').splitlines()
    
    distances = [
    [
        Levenshtein.distance(rows[i], rows[j])
        for j in range(len(rows))
    ]
    for i in range(len(rows))]
    
    total_distances = sum(sum(row) for row in distances)
    total_elements = sum(len(row) for row in distances)
    average_distance = total_distances / total_elements
    
    agglomerative = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=average_distance)
    clusters = agglomerative.fit_predict(distances)
    
    clustered_rows = {}
    for cluster_id, row in zip(clusters, rows):
        clustered_rows.setdefault(cluster_id, []).append( row )
        
    cls = []
    for k, v in clustered_rows.items():
        if len(v) > 1:
            cls += v
    
    norm = code.replace('\\n', '\n').splitlines()
    clean = [item.replace('\\n', '\n') for item in cls]    
    
    get_pattern(norm, func1)
    get_pattern(norm, func2)
    get_pattern(norm, func3)
    get_pattern(clean, func1)
    get_pattern(clean, func2)
    get_pattern(clean, func3)
    print()

# Issue with executing automation

In [None]:
import sys

sys.setrecursionlimit(10000)  # Set a higher recursion limit


for line in lines:
    automate(line)


