In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from db.database import Database
from db.models import Blueprint
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
from nltk.corpus import stopwords
from util.lang_identification import identify_language_yaml
from bs4 import BeautifulSoup
import re
from util.text_manipulation import remove_html

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adrian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# SQLAlchemy-style URL of the SQLite file that holds the topics, posts and blueprints tables.
database_url = "sqlite:///home_assistant_blueprints.sqlite"


def _load_table(table_name, wanted_columns):
    """Read the given columns of one table from ``database_url`` into a DataFrame."""
    return pd.read_sql(table_name, database_url, columns=wanted_columns)


topics_df = _load_table("topics", ["id", "topic_id", "title", "topic_url", "tags"])
posts_df = _load_table("posts", ["id", "post_id", "topic_id", "score", "cooked"])
blueprints_df = _load_table(
    "blueprints",
    ["id", "blueprint_code", "post_id", "extracted_keywords", "topic_keywords"],
)
blueprints_df.head()

Unnamed: 0,id,blueprint_code,post_id,extracted_keywords,topic_keywords
0,1,blueprint:\n name: Nag prompt blueprint\n de...,1220860,{'input__input_boolean': 2},
1,2,blueprint:\n name: Inovelli \n description: ...,1220678,{},
2,3,blueprint:\n name: deCONZ - IKEA five button ...,1223455,{'input__deconz': 1},
3,4,blueprint:\n name: Heat for certain time\n d...,1223628,"{'input__input_datetime': 1, 'input__climate': 2}",
4,5,blueprint:\n name: Light Allowance\n descrip...,1224871,"{'input__light': 1, 'output__light': 1}",


In [3]:
# Tag every blueprint with the language code reported by identify_language_yaml
# for its YAML source (e.g. "en"), stored in a new "lang_code" column.
lang_codes = [
    identify_language_yaml(blueprint_yaml)
    for blueprint_yaml in blueprints_df["blueprint_code"]
]

blueprints_df["lang_code"] = lang_codes
blueprints_df.head()

Unnamed: 0,id,blueprint_code,post_id,extracted_keywords,topic_keywords,lang_code
0,1,blueprint:\n name: Nag prompt blueprint\n de...,1220860,{'input__input_boolean': 2},,en
1,2,blueprint:\n name: Inovelli \n description: ...,1220678,{},,en
2,3,blueprint:\n name: deCONZ - IKEA five button ...,1223455,{'input__deconz': 1},,en
3,4,blueprint:\n name: Heat for certain time\n d...,1223628,"{'input__input_datetime': 1, 'input__climate': 2}",,en
4,5,blueprint:\n name: Light Allowance\n descrip...,1224871,"{'input__light': 1, 'output__light': 1}",,en


In [4]:
# Keep only the blueprints tagged as English and show the row count before/after.
before_count = len(blueprints_df)
english_mask = blueprints_df["lang_code"].eq("en")
blueprints_df = blueprints_df[english_mask]
after_count = len(blueprints_df)
(before_count, after_count)

(2232, 2196)

In [5]:
def preprocessing(text, ignorable_words=None):
    """Turn a post body into a cleaned, lemmatized, space-separated token string.

    Pipeline: pass the text through ``remove_html``, lowercase it, normalize
    typographic apostrophes, strip punctuation (word characters, apostrophes
    and whitespace are kept), drop English stop words, lemmatize the remaining
    tokens with WordNet, and finally delete domain filler words.

    Args:
        text: Raw post content, e.g. the HTML in the ``cooked`` column.
        ignorable_words: Optional iterable of extra words to delete, in
            addition to the built-in "blueprint", "automation", "entity" and
            "work". Words are matched literally, case-insensitively and only
            as whole words, so "work" no longer mutilates "network".

    Returns:
        The cleaned text with tokens separated by single spaces.

    Note:
        Needs the NLTK ``stopwords`` and ``wordnet`` corpora to be available
        locally (``nltk.download``), otherwise NLTK raises ``LookupError``.
    """
    # Accept any iterable, never mutate the caller's list, and drop empty
    # entries (an empty alternative in the regex would match everywhere).
    ignorable = [word for word in (ignorable_words or []) if word]
    ignorable += ["blueprint", "automation", "entity", "work"]

    # Load the stop-word list ONCE per call. Calling stopwords.words() inside
    # the token loop re-reads the corpus for every single token.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    text = remove_html(text)
    text = text.lower()
    text = text.replace("’", "'")  # typographic -> ASCII apostrophe
    text = re.sub(r"[^\w'\s]", "", text)

    tokens = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    ]
    text = " ".join(tokens)

    # Whole-word, literal removal: the lookarounds keep "work" from matching
    # inside "network"/"working", and re.escape keeps user-supplied words from
    # being interpreted as regex syntax.
    ignorable_pattern = (
        r"(?<!\w)(?:" + "|".join(re.escape(word) for word in ignorable) + r")(?!\w)"
    )
    text = re.sub(ignorable_pattern, "", text, flags=re.IGNORECASE)

    # Collapse the gaps left behind by the removed words.
    return " ".join(text.split())

In [6]:
# Inner-join topics -> posts on the shared topic id, then attach the blueprint
# found in each post (posts without a blueprint drop out of the second join).
_topic_post_df = pd.merge(topics_df, posts_df, on="topic_id")
_topic_post_bps_df = pd.merge(_topic_post_df, blueprints_df, on="post_id")
_topic_post_bps_df.head()

Unnamed: 0,id_x,topic_id,title,topic_url,tags,id_y,post_id,score,cooked,id,blueprint_code,extracted_keywords,topic_keywords,lang_code
0,2,255041,Nag prompt Blueprint (Android Notification),https://community.home-assistant.io//t/nag-pro...,[],3,1220860,697,<p>This blueprints creates configurable nag no...,1,blueprint:\n name: Nag prompt blueprint\n de...,{'input__input_boolean': 2},,en
1,4,254999,Inovelli LZW36 Fan/Dimmer Scenes,https://community.home-assistant.io//t/inovell...,"[""blueprint""]",5,1220678,994,<p>This blueprint allows you to easily create ...,2,blueprint:\n name: Inovelli \n description: ...,{},,en
2,5,255699,deCONZ - IKEA five button remote,https://community.home-assistant.io//t/deconz-...,"[""switch"", ""blueprint"", ""deconz""]",11,1223455,976,"<p>This is a mix of <a href=""https://community...",3,blueprint:\n name: deCONZ - IKEA five button ...,{'input__deconz': 1},,en
3,6,255742,Set heating temperature to a configurable valu...,https://community.home-assistant.io//t/set-hea...,[],12,1223628,482,"<p>This is a blueprint, that allows to set the...",4,blueprint:\n name: Heat for certain time\n d...,"{'input__input_datetime': 1, 'input__climate': 2}",,en
4,7,256045,Light Allowance,https://community.home-assistant.io//t/light-a...,[],13,1224871,136,<p>This blueprint will turn a light off after ...,5,blueprint:\n name: Light Allowance\n descrip...,"{'input__light': 1, 'output__light': 1}",,en


In [None]:
from util.structural_diff import compare_multiple_bps
import tqdm


def _redundant_blueprint_ids(scored_pairs, min_score):
    """Return the ids of blueprints that duplicate another blueprint.

    Args:
        scored_pairs: Iterable of ``(bp1, bp2, score)`` triples as produced by
            ``compare_multiple_bps``; each blueprint object must expose ``.id``.
        min_score: Pairs with ``score > min_score`` are treated as duplicates.

    Returns:
        A list of ids to drop. Blueprints are grouped transitively (connected
        components of the "similar" relation) and exactly one blueprint per
        group survives: the one with the smallest id.
        NOTE(review): assumes ids are mutually orderable (e.g. integer primary
        keys) -- confirm against the Blueprint model.
    """
    parent = {}

    def find(bp_id):
        # Union-find lookup with path halving.
        parent.setdefault(bp_id, bp_id)
        while parent[bp_id] != bp_id:
            parent[bp_id] = parent[parent[bp_id]]
            bp_id = parent[bp_id]
        return bp_id

    for bp1, bp2, score in scored_pairs:
        if score > min_score:
            root1, root2 = find(bp1.id), find(bp2.id)
            if root1 != root2:
                # Always hang the larger root under the smaller one, so the
                # root of every group is its smallest id (the survivor).
                parent[max(root1, root2)] = min(root1, root2)

    return [bp_id for bp_id in list(parent) if find(bp_id) != bp_id]


db = Database()
unique_topic_ids = _topic_post_bps_df["topic_id"].unique()
# Structural similarity above which two blueprints of one topic count as duplicates.
threshold = 0.8
# Blueprint ids "to be removed".
tbr = []
for topic_id in tqdm.tqdm(unique_topic_ids, desc="Analyzing topics for similar blueprints"):
    topic_subset = _topic_post_bps_df[_topic_post_bps_df["topic_id"] == topic_id]
    if len(topic_subset) < 2:
        # A topic with a single blueprint cannot contain duplicates.
        continue
    bps = db.get_blueprints_by_ids(topic_subset["id"].tolist())
    scores = compare_multiple_bps(bps)
    # Previously every similar pair of the topic was flattened into ONE set and
    # a single arbitrary element was kept, so two unrelated groups (A~B, C~D)
    # lost three of their four blueprints. Now one blueprint per group is kept.
    tbr.extend(_redundant_blueprint_ids(scores, threshold))

_topic_post_bps_df = _topic_post_bps_df[~_topic_post_bps_df["id"].isin(tbr)]
_topic_post_bps_df.head()

Analyzing topics for similar blueprints:   0%|          | 0/1184 [00:00<?, ?it/s]
Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 3/3 [00:00<?, ?it/s]
Analyzing topics for similar blueprints:   2%|▏         | 23/1184 [00:00<00:10, 109.82it/s]
Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 2/2 [00:00<00:00, 270.91it/s]
Analyzing topics for similar blueprints:   3%|▎         | 34/1184 [00:00<00:10, 109.01it/s]
Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]

Loading blueprints: 100%|██████████| 4/4 [00:00<00:00, 252.24it/s]

Loading blueprints: 100%|██████████| 2/2 [00:00<?, ?it/s]
Analyzing topics for similar blueprints:   4%|▍         | 45/1184 [00:00<00:17, 66.90it/s] 
Loading blueprints: 100%|██████████| 

Unnamed: 0,id_x,topic_id,title,topic_url,tags,id_y,post_id,score,id,blueprint_code,extracted_keywords,topic_keywords,lang_code
0,2,255041,Nag prompt Blueprint (Android Notification),https://community.home-assistant.io//t/nag-pro...,[],3,1220860,697,1,blueprint:\n name: Nag prompt blueprint\n de...,{'input__input_boolean': 2},,en
1,4,254999,Inovelli LZW36 Fan/Dimmer Scenes,https://community.home-assistant.io//t/inovell...,"[""blueprint""]",5,1220678,994,2,blueprint:\n name: Inovelli \n description: ...,{},,en
2,5,255699,deCONZ - IKEA five button remote,https://community.home-assistant.io//t/deconz-...,"[""switch"", ""blueprint"", ""deconz""]",11,1223455,976,3,blueprint:\n name: deCONZ - IKEA five button ...,{'input__deconz': 1},,en
3,6,255742,Set heating temperature to a configurable valu...,https://community.home-assistant.io//t/set-hea...,[],12,1223628,482,4,blueprint:\n name: Heat for certain time\n d...,"{'input__input_datetime': 1, 'input__climate': 2}",,en
4,7,256045,Light Allowance,https://community.home-assistant.io//t/light-a...,[],13,1224871,136,5,blueprint:\n name: Light Allowance\n descrip...,"{'input__light': 1, 'output__light': 1}",,en


In [7]:
# Number of topic/post/blueprint rows currently in the merged frame.
len(_topic_post_bps_df)

2196

In [29]:
# Map each topic id that has more than one blueprint row to the list of its
# blueprint YAML sources. One groupby pass replaces building the same boolean
# mask over the whole frame twice per topic; sort=False keeps the topics in
# order of first appearance, matching iteration over .unique().
groups = {
    topic: topic_rows["blueprint_code"].tolist()
    for topic, topic_rows in _topic_post_bps_df.groupby("topic_id", sort=False)
    if len(topic_rows) > 1
}
groups.keys()

dict_keys(['259767', '256311', '257141', '286465', '280125', '255456', '365041', '255878', '385439', '283973', '264183', '274711', '257617', '278447', '325923', '266178', '384292', '255120', '293469', '333950', '360129', '260772', '280639', '359802', '294721', '301019', '387748', '377149', '255880', '303161', '264599', '365496', '270813', '329408', '278457', '272106', '269701', '395834', '341053', '298080', '505809', '441117', '261280', '261885', '256138', '259363', '524435', '280883', '259010', '476095', '350490', '391475', '256630', '255999', '326917', '314233', '255733', '261814', '311091', '381800', '430415', '410062', '491657', '390055', '289816', '281459', '355314', '447543', '449117', '265010', '341354', '366348', '402565', '280891', '261936', '393164', '296528', '277977', '367102', '314943', '256409', '388683', '455427', '257953', '297287', '333870', '263006', '256542', '421215', '284720', '273031', '258904', '275751', '283190', '290561', '256212', '680380', '613989', '337274',

In [32]:
from util.text_manipulation import parse_yaml

# Spot check: print the parsed form of every blueprint stored for one
# multi-blueprint topic.
sample_topic_blueprints = groups.get("259767")
for blueprint_yaml in sample_topic_blueprints:
    print(parse_yaml(blueprint_yaml))

{'blueprint': {'name': 'Periodic lights', 'description': "Dim and adjust color temperature of lights progressively throughout the evening (RGB bulbs probably won't change temperature). This is indexed to the midpoint between sunset and sunrise and fit to a sin function, so the midpoint will be minimum brightness and midpoint +12hr will be maximum brightness. The midpoint can be offset by a fixed amount if lights are too dim or bright at the desired time.", 'domain': 'automation', 'source_url': 'https://gist.github.com/haberda/f17694969a6de15d75267667b7c955ac', 'input': {'light': {'name': 'Light(s)', 'description': 'The light(s) to control', 'selector': {'entity': {'multiple': True, 'domain': 'light'}}}, 'min_brightness': {'name': 'Minimum Brightness', 'description': 'Minimum brightness of the light(s)', 'default': 1, 'selector': {'number': {'min': 0.0, 'max': 100.0, 'mode': 'slider', 'step': 1.0, 'unit_of_measurement': '%'}}}, 'max_brightness': {'name': 'Maximum Brightness', 'descripti

In [8]:
def _preprocess_post_row(post_row):
    """Run ``preprocessing`` on the ``cooked`` body of one merged topic/post row."""
    return preprocessing(post_row["cooked"])


# Row-wise application: one cleaned text per post.
_topic_post_df.apply(_preprocess_post_row, axis=1)

KeyboardInterrupt: 

In [None]:
# Accumulator for per-topic texts; nothing is appended to it in this cell yet.
texts = []
for topic in _topic_post_df["topic_id"].unique():
    # All merged topic/post rows that belong to the current topic.
    # TODO: unfinished cell -- topic_subset is computed but never used.
    topic_subset = _topic_post_df[_topic_post_df["topic_id"] == topic]
    

Unnamed: 0,id_x,topic_id,title,topic_url,tags,id_y,post_id,score
0,1,253788,About Blueprints,https://community.home-assistant.io//t/about-b...,[],1,1216159,25251
1,1,253788,About Blueprints,https://community.home-assistant.io//t/about-b...,[],2,1216869,1859
2,2,255041,Nag prompt Blueprint (Android Notification),https://community.home-assistant.io//t/nag-pro...,[],3,1220860,697
3,3,252306,[Blueprint] Public gist for blueprint import,https://community.home-assistant.io//t/bluepri...,"[""blueprint""]",4,1210255,1319
4,4,254999,Inovelli LZW36 Fan/Dimmer Scenes,https://community.home-assistant.io//t/inovell...,"[""blueprint""]",5,1220678,994


In [None]:
_topic_post_bps_df.