In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from db.database import Database
from db.models import Blueprint
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
from nltk.corpus import stopwords
from util.lang_identification import identify_language_yaml
from bs4 import BeautifulSoup
import re
from util.text_manipulation import remove_html

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# SQLAlchemy-style connection URL for the scraped forum data; the SQLite file
# name carries no directory component.
database_url = "sqlite:///home_assistant_blueprints.sqlite"

# Load only the columns used further down, one frame per table.
topics_df = pd.read_sql(
    "topics",
    database_url,
    columns=["id", "topic_id", "title", "topic_url", "tags"],
)
posts_df = pd.read_sql(
    "posts",
    database_url,
    columns=["id", "post_id", "topic_id", "score"],
)
blueprints_df = pd.read_sql(
    "blueprints",
    database_url,
    columns=["id", "blueprint_code", "post_id", "extracted_keywords", "topic_keywords"],
)
blueprints_df.head()

Unnamed: 0,id,blueprint_code,post_id,extracted_keywords,topic_keywords
0,1,blueprint:\n name: Nag prompt blueprint\n de...,1220860,{'input__input_boolean': 2},"{'android': 0.04282333988128355, 'user': 0.042..."
1,2,blueprint:\n name: Inovelli \n description: ...,1220678,{},"{'inovelli': 0.027025502281732554, 'create': 0..."
2,3,blueprint:\n name: deCONZ - IKEA five button ...,1223455,{'input__deconz': 1},"{'short': 0.01649244354775234, 'long': 0.01649..."
3,4,blueprint:\n name: Heat for certain time\n d...,1223628,"{'input__input_datetime': 1, 'input__climate': 2}","{'set': 0.055741499059047184, 'temperature': 0..."
4,5,blueprint:\n name: Light Allowance\n descrip...,1224871,"{'input__light': 1, 'output__light': 1}","{'time': 0.11369206228658875, 'light': 0.13453..."


In [4]:
# Keep only blueprints whose code is identified as English, and report the row
# count before and after the filter.
_bf = len(blueprints_df)

# Classify every blueprint once and filter with a single boolean mask instead
# of dropping rows one at a time inside an iterrows() loop.
_is_english = blueprints_df["blueprint_code"].map(identify_language_yaml) == "en"
blueprints_df = blueprints_df[_is_english].reset_index(drop=True)

_af = len(blueprints_df)
_bf, _af

(2228, 2191)

In [5]:
def preprocessing(text, ignorable_words=None):
    """Normalise a piece of text into a lower-cased, lemmatised string.

    Steps, in order:
      1. pass the text through ``remove_html`` and lower-case it;
      2. replace the sequence ``â€™`` with a plain apostrophe and drop every
         character that is not a word character, apostrophe or whitespace;
      3. drop English stop words and lemmatise the remaining tokens with
         ``WordNetLemmatizer``;
      4. delete every occurrence of the ignorable words.

    Args:
        text: Raw text to clean.
        ignorable_words: Optional iterable of extra words to delete. The words
            "blueprint", "automation", "entity" and "work" are always deleted
            in addition to these.

    Returns:
        The cleaned text as a single space-joined string.

    Note:
        Ignorable words are removed case-insensitively as *substrings*, not as
        whole tokens (e.g. "network" becomes "net"), and the whitespace around
        a removed word is left in place.
    """
    # Copy into a new list so the caller's argument is never mutated and any
    # iterable (list, tuple, set) is accepted.
    ignorable_words = list(ignorable_words or []) + ["blueprint", "automation", "entity", "work"]

    text = remove_html(text).lower()
    text = re.sub(r"â€™", r"'", text)
    text = re.sub(r"[^\w'\s]", "", text)

    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once per call; evaluating stopwords.words() for
    # every token is loop-invariant work and a list makes each lookup linear.
    english_stopwords = set(stopwords.words("english"))
    text = " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in english_stopwords
    )

    # Escape each word so regex metacharacters in caller-supplied entries are
    # matched literally; empty entries are skipped because they match nothing.
    ignorable_pattern = "|".join(re.escape(word) for word in ignorable_words if word)
    return re.sub(ignorable_pattern, "", text, flags=re.IGNORECASE)

In [13]:
# Inner-join topics to their posts, then attach the blueprint found in each
# post. Rows without a match on either side are dropped by the inner joins.
_topic_post_df = pd.merge(topics_df, posts_df, on="topic_id")
_topic_post_bps_df = pd.merge(_topic_post_df, blueprints_df, on="post_id")
_topic_post_bps_df.head()

Unnamed: 0,id_x,topic_id,title,topic_url,tags,id_y,post_id,score,id,blueprint_code,extracted_keywords,topic_keywords
0,2,255041,Nag prompt Blueprint (Android Notification),https://community.home-assistant.io//t/nag-pro...,[],3,1220860,703,1,blueprint:\n name: Nag prompt blueprint\n de...,{'input__input_boolean': 2},"{'android': 0.04282333988128355, 'user': 0.042..."
1,4,254999,Inovelli LZW36 Fan/Dimmer Scenes,https://community.home-assistant.io//t/inovell...,"[""blueprint""]",5,1220678,1014,2,blueprint:\n name: Inovelli \n description: ...,{},"{'inovelli': 0.027025502281732554, 'create': 0..."
2,5,255699,deCONZ - IKEA five button remote,https://community.home-assistant.io//t/deconz-...,"[""switch"", ""blueprint"", ""deconz""]",11,1223455,976,3,blueprint:\n name: deCONZ - IKEA five button ...,{'input__deconz': 1},"{'short': 0.01649244354775234, 'long': 0.01649..."
3,6,255742,Set heating temperature to a configurable valu...,https://community.home-assistant.io//t/set-hea...,[],12,1223628,482,4,blueprint:\n name: Heat for certain time\n d...,"{'input__input_datetime': 1, 'input__climate': 2}","{'set': 0.055741499059047184, 'temperature': 0..."
4,7,256045,Light Allowance,https://community.home-assistant.io//t/light-a...,[],13,1224871,136,5,blueprint:\n name: Light Allowance\n descrip...,"{'input__light': 1, 'output__light': 1}","{'time': 0.11369206228658875, 'light': 0.13453..."


In [None]:
from util.structural_diff import compare_multiple_bps, structural_diff, load_and_normalize_from_topic_id

# Topic ids in order of first appearance.
unique_topic_ids = _topic_post_bps_df["topic_id"].unique()

# Walk the blueprints topic by topic. A single groupby pass replaces
# re-filtering the whole frame for every topic id; sort=False keeps the groups
# in order of first appearance, matching the order of unique_topic_ids.
for topic_id, topic_subset in _topic_post_bps_df.groupby("topic_id", sort=False):
    # All blueprint sources posted in this topic, in frame order.
    topic_subset_bps = topic_subset["blueprint_code"].tolist()
    

Topic ID: 255041, Number of Blueprints: 1
Topic ID: 254999, Number of Blueprints: 1
Topic ID: 255699, Number of Blueprints: 1
Topic ID: 255742, Number of Blueprints: 1
Topic ID: 256045, Number of Blueprints: 1
Topic ID: 256779, Number of Blueprints: 1
Topic ID: 255778, Number of Blueprints: 1
Topic ID: 255382, Number of Blueprints: 1
Topic ID: 256469, Number of Blueprints: 2
Topic ID: 257215, Number of Blueprints: 1
Topic ID: 257319, Number of Blueprints: 1
Topic ID: 256413, Number of Blueprints: 1
Topic ID: 257765, Number of Blueprints: 2
Topic ID: 256240, Number of Blueprints: 1
Topic ID: 258706, Number of Blueprints: 1
Topic ID: 258328, Number of Blueprints: 1
Topic ID: 255730, Number of Blueprints: 1
Topic ID: 256430, Number of Blueprints: 1
Topic ID: 259448, Number of Blueprints: 1
Topic ID: 260724, Number of Blueprints: 1
Topic ID: 258309, Number of Blueprints: 1
Topic ID: 255324, Number of Blueprints: 1
Topic ID: 259767, Number of Blueprints: 3
Topic ID: 260727, Number of Bluepr