In [1]:
from db.database import Database
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from db.models import Post, Topic, Blueprint
from lang_identification import identify_language
import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
db = Database()
topics = {topic.topic_id: topic for topic in db.get_topics()}
posts = {post.post_id: post for post in db.get_posts()}
blueprints = {bp.id: bp for bp in db.get_all_blueprints()}
topics.__len__(), posts.__len__(), blueprints.__len__()

Loading blueprints: 100%|██████████| 2228/2228 [00:01<00:00, 1473.22it/s]


(1657, 38517, 2228)

In [3]:

non_english = {} 
for bp in tqdm.tqdm(blueprints.values(), desc="Processing Blueprints"):
    lang = identify_language(bp)
    if lang != 'en':
        non_english[bp] = lang

Processing Blueprints: 100%|██████████| 2228/2228 [00:37<00:00, 59.94it/s]


In [4]:
non_english.keys().__len__()

37

In [5]:
for bp in non_english:
    blueprints.pop(bp.id)
blueprints.__len__()

2191

In [6]:
groups = db.get_blueprints_per_topic()
groups.keys().__len__()

1211

In [7]:
_totallength = [group.__len__() for group in groups.values()]
sum(_totallength)

2228

In [8]:
english_ids = set(blueprints.keys())   
filtered_groups = {
    topic_id: [bp for bp in bps if bp.id in english_ids]
    for topic_id, bps in groups.items()
}
groups.keys().__len__()

1211

In [9]:
_totallength = [group.__len__() for group in filtered_groups.values()]
sum(_totallength)

2191

In [10]:
real_groups = {k: v for k, v in filtered_groups.items() if len(v) > 1}
real_groups.values()

dict_values([[<db.models.Blueprint object at 0x000001CD07F87050>, <db.models.Blueprint object at 0x000001CCF6C05DC0>], [<db.models.Blueprint object at 0x000001CD07F871D0>, <db.models.Blueprint object at 0x000001CD07F87200>], [<db.models.Blueprint object at 0x000001CD07F875C0>, <db.models.Blueprint object at 0x000001CD07F875F0>, <db.models.Blueprint object at 0x000001CD07F87620>], [<db.models.Blueprint object at 0x000001CD07F87680>, <db.models.Blueprint object at 0x000001CD07F876B0>], [<db.models.Blueprint object at 0x000001CD07F878F0>, <db.models.Blueprint object at 0x000001CD07F87920>], [<db.models.Blueprint object at 0x000001CD07F879B0>, <db.models.Blueprint object at 0x000001CD07F87A10>], [<db.models.Blueprint object at 0x000001CD07F87AD0>, <db.models.Blueprint object at 0x000001CD07F87B30>], [<db.models.Blueprint object at 0x000001CD07F87D70>, <db.models.Blueprint object at 0x000001CD07F87DD0>], [<db.models.Blueprint object at 0x000001CD07F87F50>, <db.models.Blueprint object at 0x0

In [11]:
len(real_groups.keys())

460

In [13]:
posts_dict = {post.post_id: post for post in posts.values()}

In [14]:
scores_dict = {}
for topic_id, bp_group in real_groups.items():
    _scores = []
    for bp in bp_group:
        _scores.append({bp.id: posts_dict[bp.post_id].score})

    scores_dict[topic_id] = _scores
scores_dict

{'256469': [{9: 1518.8}, {10: 1518.8}],
 '257765': [{14: 7765.8}, {15: 7765.8}],
 '259767': [{25: 12325.4}, {26: 12325.4}, {27: 12325.4}],
 '260727': [{28: 2332}, {29: 2332}],
 '253977': [{35: 9875.6}, {36: 195.4}],
 '265505': [{37: 2570.8}, {38: 2570.8}],
 '255908': [{39: 2692.6}, {40: 39.8}],
 '269117': [{43: 6099.8}, {44: 6099.8}],
 '256311': [{46: 5532.6}, {47: 125}, {48: 29.4}, {49: 56}],
 '257141': [{56: 2103.4}, {57: 52.2}],
 '286465': [{78: 487.4}, {79: 13}],
 '305624': [{96: 1882.4}, {97: 1882.4}],
 '294217': [{102: 6428}, {103: 6428}],
 '280125': [{106: 15128.4}, {107: 544.2}, {108: 171.4}],
 '286565': [{112: 1936.8}, {113: 55.2}],
 '260725': [{115: 2739.4}, {116: 2739.4}],
 '291907': [{118: 193.4}, {119: 193.4}],
 '255773': [{121: 946}, {122: 946}],
 '313571': [{130: 914.4}, {131: 914.4}],
 '340021': [{135: 2495}, {136: 2495}],
 '287064': [{139: 2744.4}, {140: 2744.4}],
 '258591': [{145: 5644.8}, {146: 5644.8}],
 '255456': [{160: 215.6}, {161: 98}],
 '356870': [{167: 2878.6}

In [15]:
scores_dict.__len__()

460

In [16]:
from util.text_manipulation import normalize_text
from util.text_manipulation import parse_yaml

def normalize(obj):
        if isinstance(obj, dict):
            return {k: normalize(v) for k, v in sorted(obj.items())}
        elif isinstance(obj, list):
            return [normalize(v) for v in obj]
        else:
            return normalize_text(str(obj))

def load_and_normalize_from_topic_id(topic_id, bps=None):
    if(bps):
        return [normalize(parse_yaml(bp.blueprint_code)) for bp in bps]
    
    topic_posts = db.get_posts_by_topic_id(topic_id)
    topic_bps = [db.get_blueprints_by_post_id(post.post_id) for post in topic_posts]
    topic_bps = [bp for sublist in topic_bps for bp in sublist]
    normalized_codes = [normalize(parse_yaml(bp.blueprint_code)) for bp in topic_bps]
    return normalized_codes

In [None]:
from deepdiff import DeepDiff
def structural_diff(code1, code2):
    diff = DeepDiff(code1, code2, ignore_order=True)
    diff_size = len(str(diff))
    total_size = len(str(code1)) + len(str(code2))
    return diff, 1 - diff_size / total_size

In [18]:
# 294217 - no diff, 256469 - slight diff, 941731 - 4 different with larger diffs
normalized_codes = load_and_normalize_from_topic_id(294217)
structural_diff(normalized_codes[0], normalized_codes[1])

({}, 0.9996041171813144)

In [None]:
def compare_multiple_bps(normalized_codes) -> list[list]:
    comparison = []
    for i in range(len(normalized_codes)):
        for j in range(i + 1, len(normalized_codes)):
            _, similarity = structural_diff(normalized_codes[i], normalized_codes[j])
            comparison.append(similarity)
    return comparison

In [20]:
normalized_codes = load_and_normalize_from_topic_id(941731)
res = compare_multiple_bps(normalized_codes)
[item[1] for item in res]

[0.13215573846296125,
 0.08458279609504515,
 -0.12797577451665498,
 0.09563563352002535,
 -0.136950107886741,
 -0.11736573614850498]

In [23]:
tbr = []
limit = 0.8
for topic_id, bps in tqdm.tqdm(real_groups.items(), desc="Comparing Blueprints"):
    normalized_codes = load_and_normalize_from_topic_id(topic_id, bps)
    if bps.__len__() == 2:
        _, _score = structural_diff(normalized_codes[0], normalized_codes[1])
        if _score < limit:
            tbr.append(bps)
    """ else:
        compare_multiple_bps(normalized_codes) """
    
tbr.__len__()

Comparing Blueprints: 100%|██████████| 460/460 [00:27<00:00, 16.48it/s]


117

In [24]:
for bps in tbr:
    blueprints.pop(bps[0].id)
blueprints.__len__()

2074