In [1]:
from db.database import Database
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from db.models import Post, Topic, Blueprint
from lang_identification import identify_language
import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adrian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
db = Database()
topics = {topic.topic_id: topic for topic in db.get_topics()}
posts = {post.post_id: post for post in db.get_posts()}
blueprints = {bp.id: bp for bp in db.get_all_blueprints()}
topics.__len__(), posts.__len__(), blueprints.__len__()

Loading blueprints: 100%|██████████| 2232/2232 [00:01<00:00, 1418.96it/s]


(1650, 46060, 2232)

In [3]:

non_english = {} 
for bp in tqdm.tqdm(blueprints.values(), desc="Detecting languages"):
    lang = identify_language(bp)
    if lang != 'en':
        non_english[bp] = lang

Detecting languages: 100%|██████████| 2232/2232 [00:38<00:00, 57.46it/s]


In [4]:
non_english.keys().__len__()

36

In [5]:
for bp in non_english:
    blueprints.pop(bp.id)
blueprints.__len__()

2196

In [6]:
groups = db.get_blueprints_per_topic()
groups.keys().__len__()

1211

In [7]:
_totallength = [group.__len__() for group in groups.values()]
sum(_totallength)

2232

In [8]:
english_ids = set(blueprints.keys())   
filtered_groups = {
    topic_id: [bp for bp in bps if bp.id in english_ids]
    for topic_id, bps in groups.items()
}
filtered_groups.keys().__len__()

1211

In [9]:
_totallength = [group.__len__() for group in filtered_groups.values()]
sum(_totallength)

2196

In [10]:
real_groups = {k: v for k, v in filtered_groups.items() if len(v) > 1}
real_groups.values()

dict_values([[<db.models.Blueprint object at 0x0000021C4D384910>, <db.models.Blueprint object at 0x0000021C5CA11FD0>], [<db.models.Blueprint object at 0x0000021C4D367B10>, <db.models.Blueprint object at 0x0000021C4D364210>], [<db.models.Blueprint object at 0x0000021C54A00810>, <db.models.Blueprint object at 0x0000021C54A03610>, <db.models.Blueprint object at 0x0000021C549B95D0>], [<db.models.Blueprint object at 0x0000021C549C5E10>, <db.models.Blueprint object at 0x0000021C549DC590>], [<db.models.Blueprint object at 0x0000021C5CA27590>, <db.models.Blueprint object at 0x0000021C5CA26C10>], [<db.models.Blueprint object at 0x0000021C4D368D50>, <db.models.Blueprint object at 0x0000021C549E1B50>], [<db.models.Blueprint object at 0x0000021C5C9E4250>, <db.models.Blueprint object at 0x0000021C5C9E4E10>], [<db.models.Blueprint object at 0x0000021C55871FD0>, <db.models.Blueprint object at 0x0000021C55A31850>], [<db.models.Blueprint object at 0x0000021C55A52590>, <db.models.Blueprint object at 0x0

In [11]:
len(real_groups.keys())

456

In [12]:
posts_dict = {post.post_id: post for post in posts.values()}

In [13]:
scores_dict = {}
for topic_id, bp_group in real_groups.items():
    _scores = []
    for bp in bp_group:
        _scores.append({bp.id: posts_dict[bp.post_id].score})

    scores_dict[topic_id] = _scores
scores_dict

{'256469': [{9: 1518.6}, {10: 1518.6}],
 '257765': [{14: 7730.6}, {15: 7730.6}],
 '259767': [{25: 12215}, {26: 12215}, {27: 12215}],
 '260727': [{28: 2177}, {29: 2177}],
 '253977': [{35: 9714.6}, {36: 195.2}],
 '265505': [{37: 2550.6}, {38: 2550.6}],
 '255908': [{39: 2687.6}, {40: 39.8}],
 '269117': [{43: 6069.6}, {44: 6069.6}],
 '256311': [{46: 5507.6}, {47: 125}, {48: 29.4}, {49: 56}],
 '257141': [{56: 2083.2}, {57: 52}],
 '286465': [{78: 482.2}, {79: 12.8}],
 '305624': [{96: 1822.4}, {97: 1822.4}],
 '294217': [{102: 6383}, {103: 6383}],
 '280125': [{106: 14887.8}, {107: 464}, {108: 171.4}],
 '286565': [{112: 1896.6}, {113: 50}],
 '260725': [{115: 2588.4}, {116: 2588.4}],
 '291907': [{118: 188.2}, {119: 188.2}],
 '255773': [{121: 936}, {122: 936}],
 '313571': [{130: 904.2}, {131: 904.2}],
 '340021': [{135: 2494.8}, {136: 2494.8}],
 '287064': [{139: 2744.4}, {140: 2744.4}],
 '258591': [{145: 5569.8}, {146: 5569.8}],
 '255456': [{160: 215.6}, {161: 98}],
 '356870': [{167: 2793.2}, {168

In [14]:
scores_dict.__len__()

456

In [15]:
from util.text_manipulation import normalize_text
from util.text_manipulation import parse_yaml

def normalize(obj):
        if isinstance(obj, dict):
            return {k: normalize(v) for k, v in sorted(obj.items())}
        elif isinstance(obj, list):
            return [normalize(v) for v in obj]
        else:
            return normalize_text(str(obj))

def load_and_normalize_from_topic_id(topic_id=None, bps=None):
    if(bps):
        return [normalize(parse_yaml(bp.blueprint_code)) for bp in bps]
    
    topic_posts = db.get_posts_by_topic_id(topic_id)
    topic_bps = [db.get_blueprints_by_post_id(post.post_id) for post in topic_posts]
    topic_bps = [bp for sublist in topic_bps for bp in sublist]
    normalized_codes = [normalize(parse_yaml(bp.blueprint_code)) for bp in topic_bps]
    return normalized_codes

In [16]:
from deepdiff import DeepDiff
def structural_diff(code1, code2):
    diff = DeepDiff(code1, code2, ignore_order=True)
    diff_size = len(str(diff))
    total_size = len(str(code1)) + len(str(code2))
    return diff, 1 - diff_size / total_size

In [17]:
# 294217 - no diff, 256469 - slight diff, 941731 - 4 different with larger diffs
normalized_codes = load_and_normalize_from_topic_id(294217)
structural_diff(normalized_codes[0], normalized_codes[1])

({}, 0.9996041171813144)

In [18]:
def compare_multiple_bps(bps : list[Blueprint]) -> list[list]:
    normalized_codes = load_and_normalize_from_topic_id(bps=bps)
    comparison = []
    for i in range(len(normalized_codes)):
        for j in range(i + 1, len(normalized_codes)):
            _, similarity = structural_diff(normalized_codes[i], normalized_codes[j])
            comparison.append((bps[i], bps[j], similarity))
    return comparison

In [19]:
bps = real_groups['941731']
res = compare_multiple_bps(bps)
res

[(<db.models.Blueprint at 0x21c55d70150>,
  <db.models.Blueprint at 0x21c55d70450>,
  0.13215573846296125),
 (<db.models.Blueprint at 0x21c55d70150>,
  <db.models.Blueprint at 0x21c55d70710>,
  0.08458279609504515),
 (<db.models.Blueprint at 0x21c55d70150>,
  <db.models.Blueprint at 0x21c55d708d0>,
  -0.12797577451665498),
 (<db.models.Blueprint at 0x21c55d70450>,
  <db.models.Blueprint at 0x21c55d70710>,
  0.09563563352002535),
 (<db.models.Blueprint at 0x21c55d70450>,
  <db.models.Blueprint at 0x21c55d708d0>,
  -0.136950107886741),
 (<db.models.Blueprint at 0x21c55d70710>,
  <db.models.Blueprint at 0x21c55d708d0>,
  -0.11736573614850498)]

In [20]:
bps = real_groups['272106']
res = compare_multiple_bps(bps)
[(bp1.id, bp2.id, score) for bp1, bp2, score in res if score > 0.8]

[(386, 387, 0.99989910200787),
 (389, 390, 0.9677715621608082),
 (389, 391, 0.9595488466757124),
 (390, 391, 0.9897251477010018)]

In [21]:
tbr = []
limit = 0.8
for topic_id, bps in tqdm.tqdm(filtered_groups.items(), desc="Comparing Blueprints"):
    if bps.__len__() == 2:
        _, score = structural_diff(normalized_codes[0], normalized_codes[1])
        if score > limit:
            tbr.append(bps)
    elif bps.__len__() > 2:
        scores = compare_multiple_bps(bps)
        sim_groups = []
        for bp1, bp2, score in scores:
            if score > limit:
                if sim_groups is None:
                    sim_groups = [(bp1, bp2)]
                elif not any(bp1 in group or bp2 in group for group in sim_groups):
                    sim_groups.append((bp1, bp2))
                elif any(bp1 in group for group in sim_groups):
                    for i, group in enumerate(sim_groups):
                        if bp1 in group:
                            sim_groups[i] = (group[0], bp2)
                elif any(bp2 in group for group in sim_groups):
                    for i, group in enumerate(sim_groups):
                        if bp2 in group:
                            sim_groups[i] = (group[0], bp1)
        for bp1, bp2 in sim_groups:
            tbr.append((bp1, bp2))

tbr.__len__()

Comparing Blueprints: 100%|██████████| 1211/1211 [01:37<00:00, 12.36it/s]


441

In [22]:
for bps in tbr:
    blueprints.pop(bps[0].id)
blueprints.__len__()

1755