In [1]:
from db.database import Database
from sklearn.cluster import KMeans
import numpy as np

In [2]:
db = Database()
topics = {topic.topic_id: topic for topic in db.get_topics()}
posts = {post.post_id: post for post in db.get_posts()}
blueprints = {bp.id: bp for bp in db.get_all_blueprints()}

Loading blueprints: 100%|██████████| 2232/2232 [00:02<00:00, 802.39it/s]


In [3]:
import tqdm
from util.lang_identification import identify_language_yaml

non_english = {} 
for bp in tqdm.tqdm(blueprints.values(), desc="Detecting languages"):
    lang = identify_language_yaml(bp.blueprint_code)
    if lang != "en":
        non_english[bp] = lang
        
for bp in non_english:
    blueprints.pop(bp.id)
blueprints.__len__()

groups = db.get_blueprints_per_topic()

english_ids = set(blueprints.keys())   
filtered_groups = {
    topic_id: [bp for bp in bps if bp.id in english_ids]
    for topic_id, bps in groups.items()
}
filtered_groups.keys().__len__()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adrian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Detecting languages: 100%|██████████| 2232/2232 [00:44<00:00, 49.87it/s]


1211

In [4]:
from util.structural_diff import compare_multiple_bps, structural_diff
tbr = []
limit = 0.8
for topic_id, bps in tqdm.tqdm(filtered_groups.items(), desc="Comparing Blueprints"):
    if bps.__len__() == 2:
        _, score = structural_diff(bps[0].blueprint_code, bps[1].blueprint_code)
        if score > limit:
            tbr.append(bps[0])
    if bps.__len__() >= 2:
        scores = compare_multiple_bps(bps)
        sim_groups = []
        
        for bp1, bp2, score in scores:
            if score > limit:
                sim_groups.append((bp1, bp2))
                
        if sim_groups.__len__() > 0:
            flat_sim_groups = set(bp for pair in sim_groups for bp in pair)
            for bp in flat_sim_groups:
                if bp != list(flat_sim_groups)[-1]:
                    tbr.append(bp)

tbr.__len__()

Comparing Blueprints: 100%|██████████| 1211/1211 [01:50<00:00, 11.00it/s]


417

In [5]:
for bp in tbr:
    blueprints.pop(bp.id)
blueprints.__len__()

1779

In [6]:
import pandas as pd
bp_df = pd.DataFrame([{_attr: getattr(bp, _attr) for _attr in bp.__dict__.keys()
                        } for bp in blueprints.values()])
bp_df.head()

Unnamed: 0,_sa_instance_state,blueprint_code,blueprint_url,post_id,description,topic_keywords,keywords_tfidf,blueprint_hash,id,name,extracted_keywords,keywords_yake,post,topic_title,topic_id,tags,created_at,post_content
0,<sqlalchemy.orm.state.InstanceState object at ...,blueprint:\n name: Nag prompt blueprint\n de...,https://community.home-assistant.io//t/nag-pro...,1220860,Nag a mobile device to do something,"{'nag': 0.66021433671546, 'prompt': 0.23634566...","{'nag': 0.8500419399246807, 'prompt': 0.303868...",2f523787adf856fbddd08b2f7b32aa2490eb48a236bc68...,1,Nag prompt blueprint,{'input__input_boolean': 2},"[mobile device, android notification]",<db.models.Post object at 0x000001BE8F222590>,Nag prompt Blueprint (Android Notification),255041,[],2020-12-14 01:40:32.358,<p>This blueprints creates configurable nag no...
1,<sqlalchemy.orm.state.InstanceState object at ...,blueprint:\n name: Inovelli \n description: ...,https://community.home-assistant.io//t/inovell...,1220678,Use this blueprint to create automations based...,"{'inovelli': 0.49695533075954657, 'lzw36': 0.4...","{'lzw36': 0.44023949481589464, 'inovelli': 0.4...",ca6a74b04fac9cee0925adfe7cb03c8beb0354a816fa66...,2,Inovelli,{},"[integration, create]",<db.models.Post object at 0x000001BE8F2EEB90>,Inovelli LZW36 Fan/Dimmer Scenes,254999,"[""blueprint""]",2020-12-13 22:32:38.158,<p>This blueprint allows you to easily create ...
2,<sqlalchemy.orm.state.InstanceState object at ...,blueprint:\n name: deCONZ - IKEA five button ...,https://community.home-assistant.io//t/deconz-...,1223455,Control anything using IKEA five button remote\n,"{'button': 0.47028047688169866, 'five': 0.3549...","{'five': 0.5604759426620674, 'ikea': 0.4331904...",c5f67655e23dff66b1042d3b38e967d74dad01a4a3d589...,3,deCONZ - IKEA five button remote,{'input__deconz': 1},"[button remote, button]",<db.models.Post object at 0x000001BE8ED569D0>,deCONZ - IKEA five button remote,255699,"[""switch"", ""blueprint"", ""deconz""]",2020-12-15 12:18:15.898,"<p>This is a mix of <a href=""https://community..."
3,<sqlalchemy.orm.state.InstanceState object at ...,blueprint:\n name: Heat for certain time\n d...,https://community.home-assistant.io//t/set-hea...,1223628,Turn on heating for a given amount of time.,"{'heat': 0.3484675281776271, 'heating': 0.3360...","{'heat': 0.44290156715634094, 'heating': 0.430...",00b1c4d2e847ed9451c8e8d1acfbbcd5e1740f9991226d...,4,Heat for certain time,"{'input__input_datetime': 1, 'input__climate': 2}","[amount, heating]",<db.models.Post object at 0x000001BE8ED66550>,Set heating temperature to a configurable valu...,255742,[],2020-12-15 14:23:53.414,"<p>This is a blueprint, that allows to set the..."
4,<sqlalchemy.orm.state.InstanceState object at ...,blueprint:\n name: Light Allowance\n descrip...,https://community.home-assistant.io//t/light-a...,1224871,Turns a light off after an allotted time,"{'allotted': 0.6330710810698282, 'allowance': ...","{'allotted': 0.6465282359948884, 'allowance': ...",0dd0ec70d42f2e8a428ae7a2934852dbec0beb34cd03cc...,5,Light Allowance,"{'input__light': 1, 'output__light': 1}","[allotted time, time]",<db.models.Post object at 0x000001BE8EDBA7D0>,Light Allowance,256045,[],2020-12-16 04:23:36.519,<p>This blueprint will turn a light off after ...


In [7]:
bp_df = bp_df.drop(columns=["_sa_instance_state"])
bp_df.head()

Unnamed: 0,blueprint_code,blueprint_url,post_id,description,topic_keywords,keywords_tfidf,blueprint_hash,id,name,extracted_keywords,keywords_yake,post,topic_title,topic_id,tags,created_at,post_content
0,blueprint:\n name: Nag prompt blueprint\n de...,https://community.home-assistant.io//t/nag-pro...,1220860,Nag a mobile device to do something,"{'nag': 0.66021433671546, 'prompt': 0.23634566...","{'nag': 0.8500419399246807, 'prompt': 0.303868...",2f523787adf856fbddd08b2f7b32aa2490eb48a236bc68...,1,Nag prompt blueprint,{'input__input_boolean': 2},"[mobile device, android notification]",<db.models.Post object at 0x000001BE8F222590>,Nag prompt Blueprint (Android Notification),255041,[],2020-12-14 01:40:32.358,<p>This blueprints creates configurable nag no...
1,blueprint:\n name: Inovelli \n description: ...,https://community.home-assistant.io//t/inovell...,1220678,Use this blueprint to create automations based...,"{'inovelli': 0.49695533075954657, 'lzw36': 0.4...","{'lzw36': 0.44023949481589464, 'inovelli': 0.4...",ca6a74b04fac9cee0925adfe7cb03c8beb0354a816fa66...,2,Inovelli,{},"[integration, create]",<db.models.Post object at 0x000001BE8F2EEB90>,Inovelli LZW36 Fan/Dimmer Scenes,254999,"[""blueprint""]",2020-12-13 22:32:38.158,<p>This blueprint allows you to easily create ...
2,blueprint:\n name: deCONZ - IKEA five button ...,https://community.home-assistant.io//t/deconz-...,1223455,Control anything using IKEA five button remote\n,"{'button': 0.47028047688169866, 'five': 0.3549...","{'five': 0.5604759426620674, 'ikea': 0.4331904...",c5f67655e23dff66b1042d3b38e967d74dad01a4a3d589...,3,deCONZ - IKEA five button remote,{'input__deconz': 1},"[button remote, button]",<db.models.Post object at 0x000001BE8ED569D0>,deCONZ - IKEA five button remote,255699,"[""switch"", ""blueprint"", ""deconz""]",2020-12-15 12:18:15.898,"<p>This is a mix of <a href=""https://community..."
3,blueprint:\n name: Heat for certain time\n d...,https://community.home-assistant.io//t/set-hea...,1223628,Turn on heating for a given amount of time.,"{'heat': 0.3484675281776271, 'heating': 0.3360...","{'heat': 0.44290156715634094, 'heating': 0.430...",00b1c4d2e847ed9451c8e8d1acfbbcd5e1740f9991226d...,4,Heat for certain time,"{'input__input_datetime': 1, 'input__climate': 2}","[amount, heating]",<db.models.Post object at 0x000001BE8ED66550>,Set heating temperature to a configurable valu...,255742,[],2020-12-15 14:23:53.414,"<p>This is a blueprint, that allows to set the..."
4,blueprint:\n name: Light Allowance\n descrip...,https://community.home-assistant.io//t/light-a...,1224871,Turns a light off after an allotted time,"{'allotted': 0.6330710810698282, 'allowance': ...","{'allotted': 0.6465282359948884, 'allowance': ...",0dd0ec70d42f2e8a428ae7a2934852dbec0beb34cd03cc...,5,Light Allowance,"{'input__light': 1, 'output__light': 1}","[allotted time, time]",<db.models.Post object at 0x000001BE8EDBA7D0>,Light Allowance,256045,[],2020-12-16 04:23:36.519,<p>This blueprint will turn a light off after ...


In [8]:
bp_df = bp_df.drop(columns=["post"])
bp_df.head()

Unnamed: 0,blueprint_code,blueprint_url,post_id,description,topic_keywords,keywords_tfidf,blueprint_hash,id,name,extracted_keywords,keywords_yake,topic_title,topic_id,tags,created_at,post_content
0,blueprint:\n name: Nag prompt blueprint\n de...,https://community.home-assistant.io//t/nag-pro...,1220860,Nag a mobile device to do something,"{'nag': 0.66021433671546, 'prompt': 0.23634566...","{'nag': 0.8500419399246807, 'prompt': 0.303868...",2f523787adf856fbddd08b2f7b32aa2490eb48a236bc68...,1,Nag prompt blueprint,{'input__input_boolean': 2},"[mobile device, android notification]",Nag prompt Blueprint (Android Notification),255041,[],2020-12-14 01:40:32.358,<p>This blueprints creates configurable nag no...
1,blueprint:\n name: Inovelli \n description: ...,https://community.home-assistant.io//t/inovell...,1220678,Use this blueprint to create automations based...,"{'inovelli': 0.49695533075954657, 'lzw36': 0.4...","{'lzw36': 0.44023949481589464, 'inovelli': 0.4...",ca6a74b04fac9cee0925adfe7cb03c8beb0354a816fa66...,2,Inovelli,{},"[integration, create]",Inovelli LZW36 Fan/Dimmer Scenes,254999,"[""blueprint""]",2020-12-13 22:32:38.158,<p>This blueprint allows you to easily create ...
2,blueprint:\n name: deCONZ - IKEA five button ...,https://community.home-assistant.io//t/deconz-...,1223455,Control anything using IKEA five button remote\n,"{'button': 0.47028047688169866, 'five': 0.3549...","{'five': 0.5604759426620674, 'ikea': 0.4331904...",c5f67655e23dff66b1042d3b38e967d74dad01a4a3d589...,3,deCONZ - IKEA five button remote,{'input__deconz': 1},"[button remote, button]",deCONZ - IKEA five button remote,255699,"[""switch"", ""blueprint"", ""deconz""]",2020-12-15 12:18:15.898,"<p>This is a mix of <a href=""https://community..."
3,blueprint:\n name: Heat for certain time\n d...,https://community.home-assistant.io//t/set-hea...,1223628,Turn on heating for a given amount of time.,"{'heat': 0.3484675281776271, 'heating': 0.3360...","{'heat': 0.44290156715634094, 'heating': 0.430...",00b1c4d2e847ed9451c8e8d1acfbbcd5e1740f9991226d...,4,Heat for certain time,"{'input__input_datetime': 1, 'input__climate': 2}","[amount, heating]",Set heating temperature to a configurable valu...,255742,[],2020-12-15 14:23:53.414,"<p>This is a blueprint, that allows to set the..."
4,blueprint:\n name: Light Allowance\n descrip...,https://community.home-assistant.io//t/light-a...,1224871,Turns a light off after an allotted time,"{'allotted': 0.6330710810698282, 'allowance': ...","{'allotted': 0.6465282359948884, 'allowance': ...",0dd0ec70d42f2e8a428ae7a2934852dbec0beb34cd03cc...,5,Light Allowance,"{'input__light': 1, 'output__light': 1}","[allotted time, time]",Light Allowance,256045,[],2020-12-16 04:23:36.519,<p>This blueprint will turn a light off after ...


In [9]:
import json
bp_df["keywords_tfidf"] = bp_df["keywords_tfidf"].apply(lambda x: json.dumps(x) if x is not None else None)
bp_df["keywords_yake"] = bp_df["keywords_yake"].apply(lambda x: json.dumps(x) if x is not None else None)
bp_df["extracted_keywords"] = bp_df["extracted_keywords"].apply(lambda x: json.dumps(x) if x is not None else None)
bp_df["topic_keywords"] = bp_df["topic_keywords"].apply(lambda x: json.dumps(x) if x is not None else None)
bp_df.head()

Unnamed: 0,blueprint_code,blueprint_url,post_id,description,topic_keywords,keywords_tfidf,blueprint_hash,id,name,extracted_keywords,keywords_yake,topic_title,topic_id,tags,created_at,post_content
0,blueprint:\n name: Nag prompt blueprint\n de...,https://community.home-assistant.io//t/nag-pro...,1220860,Nag a mobile device to do something,"{""nag"": 0.66021433671546, ""prompt"": 0.23634566...","{""nag"": 0.8500419399246807, ""prompt"": 0.303868...",2f523787adf856fbddd08b2f7b32aa2490eb48a236bc68...,1,Nag prompt blueprint,"{""input__input_boolean"": 2}","[""mobile device"", ""android notification""]",Nag prompt Blueprint (Android Notification),255041,[],2020-12-14 01:40:32.358,<p>This blueprints creates configurable nag no...
1,blueprint:\n name: Inovelli \n description: ...,https://community.home-assistant.io//t/inovell...,1220678,Use this blueprint to create automations based...,"{""inovelli"": 0.49695533075954657, ""lzw36"": 0.4...","{""lzw36"": 0.44023949481589464, ""inovelli"": 0.4...",ca6a74b04fac9cee0925adfe7cb03c8beb0354a816fa66...,2,Inovelli,{},"[""integration"", ""create""]",Inovelli LZW36 Fan/Dimmer Scenes,254999,"[""blueprint""]",2020-12-13 22:32:38.158,<p>This blueprint allows you to easily create ...
2,blueprint:\n name: deCONZ - IKEA five button ...,https://community.home-assistant.io//t/deconz-...,1223455,Control anything using IKEA five button remote\n,"{""button"": 0.47028047688169866, ""five"": 0.3549...","{""five"": 0.5604759426620674, ""ikea"": 0.4331904...",c5f67655e23dff66b1042d3b38e967d74dad01a4a3d589...,3,deCONZ - IKEA five button remote,"{""input__deconz"": 1}","[""button remote"", ""button""]",deCONZ - IKEA five button remote,255699,"[""switch"", ""blueprint"", ""deconz""]",2020-12-15 12:18:15.898,"<p>This is a mix of <a href=""https://community..."
3,blueprint:\n name: Heat for certain time\n d...,https://community.home-assistant.io//t/set-hea...,1223628,Turn on heating for a given amount of time.,"{""heat"": 0.3484675281776271, ""heating"": 0.3360...","{""heat"": 0.44290156715634094, ""heating"": 0.430...",00b1c4d2e847ed9451c8e8d1acfbbcd5e1740f9991226d...,4,Heat for certain time,"{""input__input_datetime"": 1, ""input__climate"": 2}","[""amount"", ""heating""]",Set heating temperature to a configurable valu...,255742,[],2020-12-15 14:23:53.414,"<p>This is a blueprint, that allows to set the..."
4,blueprint:\n name: Light Allowance\n descrip...,https://community.home-assistant.io//t/light-a...,1224871,Turns a light off after an allotted time,"{""allotted"": 0.6330710810698282, ""allowance"": ...","{""allotted"": 0.6465282359948884, ""allowance"": ...",0dd0ec70d42f2e8a428ae7a2934852dbec0beb34cd03cc...,5,Light Allowance,"{""input__light"": 1, ""output__light"": 1}","[""allotted time"", ""time""]",Light Allowance,256045,[],2020-12-16 04:23:36.519,<p>This blueprint will turn a light off after ...


In [10]:
bp_df.to_sql("blueprints_filtered", db.engine.connect(), if_exists="replace", index=False)
db.engine.dispose()

### kmeans test

In [None]:
from sklearn.preprocessing import StandardScaler

# Ensure consistent feature lengths
def extract_features(row, tfidf_size=2, yake_size=2):
    # Extract TF-IDF features and pad/truncate to fixed size
    tfidf_features = list(row["keywords_tfidf"].values())
    tfidf_features = tfidf_features[:tfidf_size] + [0] * (tfidf_size - len(tfidf_features))
    
    # Extract YAKE features and pad/truncate to fixed size
    yake_features = list(row["keywords_yake"].values())
    yake_features = yake_features[:yake_size] + [0] * (yake_size - len(yake_features))
    
    # Combine TF-IDF and YAKE features
    combined_features = tfidf_features + yake_features
    return combined_features

# Apply the updated function
bp_df["features"] = bp_df.apply(extract_features, axis=1)

# Create a feature matrix
X = np.vstack(bp_df["features"].values)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply KMeans clustering
kmeans = KMeans(random_state=42)
bp_df["cluster"] = kmeans.fit_predict(X_scaled)

# Analyze the clusters
print(bp_df[["id", "cluster"]].head())

In [None]:
def visualize_clusters(X, labels):
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)

    plt.figure(figsize=(10, 7))
    scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=labels, cmap='viridis', alpha=0.7)
    plt.title("KMeans Clustering of Blueprints")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.colorbar(scatter, label="Cluster Label")
    plt.show()

visualize_clusters(X_scaled, bp_df["cluster"])  

In [None]:
cluster_1 = bp_df[bp_df["cluster"] == 3]
cluster_1.head()

### embeddings

In [None]:
import re
def process_bp_keywords(kwd_dict: dict[str, int]) -> str | None:
    kwd_list = list(kwd_dict.keys())
    if kwd_list.__len__() < 1:
        return ""
    
    concat_kwds = ""
    
    for kwd in kwd_list:
        in_out = re.search(r"(input__|output__)", kwd)
        kwd = kwd.removeprefix(in_out.group()) if in_out else kwd
        concat_kwds += f"{in_out.group().replace('__', '')}: {kwd}; "
    return concat_kwds

In [None]:
sents = []
for row in bp_df.itertuples():
    kwds = process_bp_keywords(row.extracted_keywords)
        
    sent = f"{kwds}tfidf: {list(row.keywords_tfidf.keys())}; yake: {list(row.keywords_yake.keys())}"
    
    sents.append(sent)
bp_df["sents"] = sents
bp_df.head()

In [None]:
import sentence_transformers
model = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")
embeddings = []
for row in tqdm.tqdm(bp_df.itertuples(), total=bp_df.shape[0], desc="Embedding sentences"):
    embedding = model.encode(row.sents)
    embeddings.append(embedding)
bp_df["embeddings"] = embeddings

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
embedding_matrix = np.vstack(bp_df["embeddings"].values)
sil_score = silhouette_score(embedding_matrix, bp_df["cluster"])