In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from db.database import Database
from db.models import Blueprint
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import pandas as pd
import ast
import numpy as np
from collections import Counter

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adrian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
db = Database()
topics = {topic.topic_id: topic for topic in db.get_topics()}
posts = {post.post_id: post for post in db.get_posts()}
blueprints = {bp.id: bp for bp in db.get_all_blueprints()}

Loading blueprints: 100%|██████████| 2232/2232 [00:01<00:00, 1442.87it/s]


In [3]:
print("work" in stopwords.words('english'))  # Check if "it" is in the stopwords list

False


In [4]:
import tqdm
from util.lang_identification import identify_language

non_english = {} 
for bp in tqdm.tqdm(blueprints.values(), desc="Detecting languages"):
    lang = identify_language(bp)
    if lang != 'en':
        non_english[bp] = lang
        
for bp in non_english:
    blueprints.pop(bp.id)
blueprints.__len__()

groups = db.get_blueprints_per_topic()

english_ids = set(blueprints.keys())   
filtered_groups = {
    topic_id: [bp for bp in bps if bp.id in english_ids]
    for topic_id, bps in groups.items()
}
filtered_groups.keys().__len__()

Detecting languages: 100%|██████████| 2232/2232 [00:37<00:00, 59.10it/s]


1211

In [6]:
def remove_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    for code_tag in soup.find_all('code', {'class':['lang-auto','lang-yaml']}):
        code_tag.decompose()
    
    for a_tag in soup.find_all('a'):
        a_tag.decompose()
    
    return soup.get_text().replace('\n', ' ').strip()

In [None]:
def preprocessing(text, ignorable_words=None):
    if ignorable_words is None:
        ignorable_words = []
    ignorable_words = ignorable_words + ["blueprint", "automation", "entity", "work"]
    text = remove_html(text)
    lemmatizer = WordNetLemmatizer()
    text = text.lower()
    text = re.sub(r'’', r"'", text)
    text = re.sub(r"[^\w'\s]", '', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    text = re.sub('|'.join(ignorable_words), '', text, flags=re.IGNORECASE)
    return text

### sklearn TfIdf

In [8]:
def fit_trans(str: list[str]):
    tfidf = TfidfVectorizer()
    res = tfidf.fit_transform(str)
    return res, tfidf

In [24]:
def get_corpus_and_url(topic_id):
    topic = topics.get(topic_id)
    posts = db.get_posts_by_topic_id(topic_id)
    tags = topic.tags
    if isinstance(tags, str):
        try:
            tags = ast.literal_eval(tags)
            lemmatizer = WordNetLemmatizer()
            tags = [lemmatizer.lemmatize(tag) for tag in tags]
        except (ValueError, SyntaxError):
            tags = [tags]
            lemmatizer = WordNetLemmatizer()
            tags = [lemmatizer.lemmatize(tag) for tag in tags]
    post_contents = [post.cooked for post in posts]
    corpus = [preprocessing(text, ignorable_words=tags) for text in post_contents]
    corpus = " ".join(corpus)
    return corpus, topic.topic_url

corpus, url = get_corpus_and_url(list(topics.keys())[120])
corpus, url

("hi namron 8button switch want use different thing send action example  ing try use 4 different action 4 different thing dim turn turn  se action changed back  processed option lightzigbee_23 wallwitch lightzigbee_24 lamp home can't test look right",
 'https://community.home-assistant.io//t/one-automation-to-manage-all-buttons-on-a-switch/336607')

In [25]:
res, tfidf = fit_trans([corpus])
terms = tfidf.get_feature_names_out()
df = pd.DataFrame(res.toarray(), columns=terms)
print(df)

    8button    action      back       can   changed  different       dim  \
0  0.136083  0.408248  0.136083  0.136083  0.136083   0.408248  0.136083   

    example        hi      home  ...        se      send    switch      test  \
0  0.136083  0.136083  0.136083  ...  0.136083  0.136083  0.136083  0.136083   

      thing       try      turn       use  wallwitch      want  
0  0.272166  0.136083  0.272166  0.272166   0.136083  0.136083  

[1 rows x 29 columns]


In [28]:
def get_top_keywords(tfidf_matrix, feature_names, top_n=5):
    """
    Extract top keywords for each document based on TF-IDF scores.
    
    Args:
        tfidf_matrix: The sparse matrix of TF-IDF scores.
        feature_names: List of feature names (terms).
        top_n: Number of top keywords to extract per document.
    
    Returns:
        List of lists containing top keywords with score for each document.
    """
    top_keywords = []
    for row in tfidf_matrix:
        # Convert sparse row to dense array
        row_array = row.toarray().flatten()
        # Get indices of top N scores
        top_indices = np.argsort(row_array)[-top_n:][::-1]
        # Map indices to feature names
        top_terms = [(feature_names[i], row_array[i]) for i in top_indices]
        top_keywords.append(top_terms)
    return top_keywords

# Get top keywords for each document
top_keywords_per_doc = get_top_keywords(res, terms, top_n=3)

# Print top keywords for the first few documents
for i, keywords in enumerate(top_keywords_per_doc[:3]):
    print(f"Document {i+1}: {keywords}")

Document 1: [('action', 0.408248290463863), ('different', 0.408248290463863), ('use', 0.2721655269759087)]


In [None]:
keywords = Counter()
for topic_id in tqdm.tqdm(list(filtered_groups.keys()), desc="Extracting keywords"):
    corpus, _ = get_corpus_and_url(topic_id)
    if not corpus.strip():
        continue
    res, tfidf = fit_trans([corpus])
    terms = tfidf.get_feature_names_out()
    topic_keywords = get_top_keywords(res, terms, top_n=10)
    flat_keywords = [entry[0] for sublist in topic_keywords for entry in sublist]
    keywords.update(flat_keywords)

print(keywords.most_common(50), "\n")

Extracting keywords: 100%|██████████| 1211/1211 [10:33<00:00,  1.91it/s]

[('input', 374), ('button', 350), ('name', 324), ('action', 322), ('device', 229), ('condition', 223), ('description', 220), ('press', 213), ('light', 201), ('sensor', 182), ('time', 153), ('switch', 148), ('use', 147), ('selector', 144), ('event', 126), ('one', 116), ('remote', 116), ('turn', 111), ('state', 111), ('thanks', 104), ('home', 98), ('trigger', 97), ('set', 91), ('temperature', 88), ('would', 87), ('default', 82), ('brightness', 73), ('sequence', 72), ('control', 70), ('command', 70), ('scene', 69), ('get', 68), ('change', 67), ('like', 66), ('_id', 62), ('using', 61), ('notification', 59), ('value', 58), ('long', 55), ('mode', 55), ('service', 54), ('assistant', 52), ('hi', 49), ('double', 49), ('dimmer', 49), ('motion', 48), ('single', 47), ('ha', 47), ('error', 46), ('ing', 46)] 






In [32]:
for keyword in keywords.most_common(50):
    print(keyword)

('input', 374)
('button', 350)
('name', 324)
('action', 322)
('device', 229)
('condition', 223)
('description', 220)
('press', 213)
('light', 201)
('sensor', 182)
('time', 153)
('switch', 148)
('use', 147)
('selector', 144)
('event', 126)
('one', 116)
('remote', 116)
('turn', 111)
('state', 111)
('thanks', 104)
('home', 98)
('trigger', 97)
('set', 91)
('temperature', 88)
('would', 87)
('default', 82)
('brightness', 73)
('sequence', 72)
('control', 70)
('command', 70)
('scene', 69)
('get', 68)
('change', 67)
('like', 66)
('_id', 62)
('using', 61)
('notification', 59)
('value', 58)
('long', 55)
('mode', 55)
('service', 54)
('assistant', 52)
('hi', 49)
('double', 49)
('dimmer', 49)
('motion', 48)
('single', 47)
('ha', 47)
('error', 46)
('ing', 46)


### Rake

In [71]:
from multi_rake import Rake
rake_topic_id = list(filtered_groups.keys())[120]
rake_posts = db.get_posts_by_topic_id(rake_topic_id)
cleaned_texts_topic = [remove_html(post.cooked) for post in rake_posts]
cleaned_texts_topic.insert(0, str(topics.get(rake_topic_id).title))
rake = Rake(min_chars=3, max_words=1, language_code='en')
rake.stopwords.add('blueprint')
full_text = " ".join(cleaned_texts_topic)
keywords = rake.apply(full_text)
keywords, full_text, rake_posts[0].post_url

([('zha', 1.0),
  ('tweaked', 1.0),
  ('blueprint', 1.0),
  ('don’t', 1.0),
  ('move', 1.0),
  ('control', 1.0),
  ('switch', 1.0),
  ('badge', 1.0),
  ('import', 1.0),
  ('higher', 1.0)],
 'ZHA - IKEA Tradfri on/off remote dual function Inspired by  I tweaked his blueprint for the IKEA Tradfri On/Off switch. Because I don’t use the dimmer function I use the move and move_with_on_off events to control a switch.  Blueprint Code Click the badge to import this Blueprint: (needs Home Assistant Core 2021.3 or higher)',
 'https://community.home-assistant.io//t/zha-ikea-tradfri-on-off-remote-dual-function/350877/1')

### spaCy

In [74]:
import spacy 
nlp = spacy.load("en_core_web_sm")
spacy_topic_id = list(filtered_groups.keys())[120]
spacy_posts = db.get_posts_by_topic_id(spacy_topic_id)
cleaned_texts_spacy = [remove_html(post.cooked) for post in spacy_posts]
cleaned_texts_spacy.insert(0, str(topics.get(spacy_topic_id).title))
docs = nlp(" ".join(cleaned_texts_spacy))
entities = [(ent.text, ent.label_) for ent in docs.ents]
entities

[('ZHA - IKEA Tradfri', 'ORG'),
 ('the IKEA Tradfri On/Off', 'ORG'),
 ('move_with_on_off', 'ORG'),
 ('Blueprint Code Click', 'WORK_OF_ART'),
 ('2021.3', 'CARDINAL')]

### YAKE

In [9]:
import yake
yake_topic_id = list(filtered_groups.keys())[120]
yake_posts = db.get_posts_by_topic_id(yake_topic_id)
cleaned_texts_yake = [remove_html(post.cooked) for post in yake_posts]
cleaned_texts_yake.insert(0, str(topics.get(yake_topic_id).title))
yake_kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=5)
yake_keywords = yake_kw_extractor.extract_keywords(" ".join(cleaned_texts_yake))
yake_keywords

[('Tradfri', 0.03103101640079281),
 ('IKEA', 0.03808461270350777),
 ('ZHA', 0.04764825324554439),
 ('Inspired', 0.0816591035917088),
 ('switch', 0.11185385012850924)]

In [13]:
def yake_preprocessing(text):
    text = remove_html(text)
    text = remove_html(text)
    text = text.lower()
    text = re.sub(r'’', r"'", text)
    #text = re.sub(r"[^\w'\s]", '', text)
    return text

In [16]:
yake_kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=1)
yake_kw_extractor.stopword_set.add('blueprint')
keywords = Counter()
for topic_id in tqdm.tqdm(list(filtered_groups.keys())[:100], desc="Extracting keywords"):
    posts = db.get_posts_by_topic_id(topic_id)
    cleaned_texts = [yake_preprocessing(post.cooked) for post in posts]
    cleaned_texts.insert(0, yake_preprocessing(topics.get(topic_id).title))
    full_text = " ".join(cleaned_texts)
    tags = topics.get(topic_id).tags
    if isinstance(tags, str):
        try:
            tags = ast.literal_eval(tags)
        except (ValueError, SyntaxError):
            tags = [tags]
    #print(f"Topic ID: {topic_id}, Tags: {tags}")
    for tag in tags:
        yake_kw_extractor.stopword_set.add(tag.lower())
    yake_keywords = yake_kw_extractor.extract_keywords(full_text)
    yake_keywords = [kw for kw, score in yake_keywords]
    keywords.update(yake_keywords)
keywords

Extracting keywords: 100%|██████████| 100/100 [00:10<00:00,  9.89it/s]


Counter({'press': 8,
         'light': 5,
         'motion': 3,
         'ikea': 3,
         'entity': 3,
         'scene': 3,
         'time': 2,
         'simple': 2,
         'sensor': 2,
         'media': 2,
         'ozw': 2,
         'turn': 2,
         'temperature': 2,
         'android': 1,
         'fan': 1,
         'short': 1,
         'set': 1,
         'run': 1,
         'make': 1,
         'telegram': 1,
         'volume': 1,
         'audio': 1,
         'condition': 1,
         'double': 1,
         'action': 1,
         'red': 1,
         'device': 1,
         'battery': 1,
         'enter': 1,
         'actionable': 1,
         'config': 1,
         'window': 1,
         'lumi.sensor': 1,
         'mute': 1,
         'microphone': 1,
         'face': 1,
         'long': 1,
         'generic': 1,
         'node': 1,
         'master': 1,
         'amplifier': 1,
         'button': 1,
         'gist': 1,
         'jung': 1,
         'change': 1,
         'vacuum': 1,
 