In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from db.database import Database
from db.models import Blueprint
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
import pandas as pd
import ast
import numpy as np

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adrian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
db = Database()
topics = {topic.topic_id: topic for topic in db.get_topics()}
posts = {post.post_id: post for post in db.get_posts()}
blueprints = {bp.id: bp for bp in db.get_all_blueprints()}

Loading blueprints: 100%|██████████| 2232/2232 [00:01<00:00, 1521.49it/s]


In [3]:
print("work" in stopwords.words('english'))  # Check if "it" is in the stopwords list

False


In [14]:
def remove_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    for code_tag in soup.find_all('code', class_='lang-auto'):
        code_tag.decompose()
    
    for a_tag in soup.find_all('a'):
        a_tag.decompose()
    
    return soup.get_text().replace('\n', ' ').strip()

In [None]:
def preprocessing(text, ignorable_words=None):
    if ignorable_words is None:
        ignorable_words = []
    ignorable_words = ignorable_words + ["blueprint", "automation", "entity", "work"]
    text = remove_html(text)
    lemmatizer = WordNetLemmatizer()
    text = text.lower()
    text = re.sub(r'’', r"'", text)
    text = re.sub(r"[^\w'\s]", '', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    text = re.sub('|'.join(ignorable_words), '', text, flags=re.IGNORECASE)
    return text

In [16]:
def fit_trans(str: list[str]):
    tfidf = TfidfVectorizer()
    res = tfidf.fit_transform(str)
    return res, tfidf

In [17]:
def get_corpus_and_url(topic_id):
    topic = topics.get(topic_id)
    posts = db.get_posts_by_topic_id(topic_id)
    tags = topic.tags
    if isinstance(tags, str):
        try:
            tags = ast.literal_eval(tags)
            lemmatizer = WordNetLemmatizer()
            tags = [lemmatizer.lemmatize(tag) for tag in tags]
        except (ValueError, SyntaxError):
            tags = [tags]
            lemmatizer = WordNetLemmatizer()
            tags = [lemmatizer.lemmatize(tag) for tag in tags]
    post_contents = [post.cooked for post in posts]
    corpus = [preprocessing(text, ignorable_words=tags) for text in post_contents]
    corpus = " ".join(corpus)
    return corpus, topic.topic_url

corpus, url = get_corpus_and_url(list(topics.keys())[120])
corpus, url

("hi namron 8button switch want use different thing send action example  ing try use 4 different action 4 different thing dim turn turn  se action changed back  processed option lightzigbee_23 wallwitch lightzigbee_24 lamp home can't test look right",
 'https://community.home-assistant.io//t/one-automation-to-manage-all-buttons-on-a-switch/336607')

In [8]:
res, tfidf = fit_trans([corpus])
terms = tfidf.get_feature_names_out()
df = pd.DataFrame(res.toarray(), columns=terms)
print(df)

    8button    action      back       can   changed  different       dim  \
0  0.136083  0.408248  0.136083  0.136083  0.136083   0.408248  0.136083   

    example        hi      home  ...        se      send    switch      test  \
0  0.136083  0.136083  0.136083  ...  0.136083  0.136083  0.136083  0.136083   

      thing       try      turn       use  wallwitch      want  
0  0.272166  0.136083  0.272166  0.272166   0.136083  0.136083  

[1 rows x 29 columns]


In [18]:
def get_top_keywords(tfidf_matrix, feature_names, top_n=5):
    """
    Extract top keywords for each document based on TF-IDF scores.
    
    Args:
        tfidf_matrix: The sparse matrix of TF-IDF scores.
        feature_names: List of feature names (terms).
        top_n: Number of top keywords to extract per document.
    
    Returns:
        List of lists containing top keywords with score for each document.
    """
    top_keywords = []
    for row in tfidf_matrix:
        # Convert sparse row to dense array
        row_array = row.toarray().flatten()
        # Get indices of top N scores
        top_indices = np.argsort(row_array)[-top_n:][::-1]
        # Map indices to feature names
        top_terms = [(feature_names[i], row_array[i]) for i in top_indices]
        top_keywords.append(top_terms)
    return top_keywords

# Get top keywords for each document
top_keywords_per_doc = get_top_keywords(res, terms, top_n=5)

# Print top keywords for the first few documents
for i, keywords in enumerate(top_keywords_per_doc[:5]):
    print(f"Document {i+1}: {keywords}")

Document 1: [('zone', 0.5773502691896258), ('leave', 0.5773502691896258), ('duplicate', 0.5773502691896258)]


In [19]:
import tqdm
from util.lang_identification import identify_language

non_english = {} 
for bp in tqdm.tqdm(blueprints.values(), desc="Detecting languages"):
    lang = identify_language(bp)
    if lang != 'en':
        non_english[bp] = lang
        
for bp in non_english:
    blueprints.pop(bp.id)
blueprints.__len__()

groups = db.get_blueprints_per_topic()

english_ids = set(blueprints.keys())   
filtered_groups = {
    topic_id: [bp for bp in bps if bp.id in english_ids]
    for topic_id, bps in groups.items()
}

Detecting languages: 100%|██████████| 2196/2196 [00:35<00:00, 61.06it/s]


In [None]:
from collections import Counter
keywords = Counter()
for topic_id in list(filtered_groups.keys()):
    corpus, _ = get_corpus_and_url(topic_id)
    if not corpus.strip():
        continue
    res, tfidf = fit_trans([corpus])
    terms = tfidf.get_feature_names_out()
    topic_keywords = get_top_keywords(res, terms, top_n=10)
    topic_keywords = [entry[0] for entry in topic_keywords]
    keywords.update(topic_keywords)

print(keywords.most_common(50))

### Rake testing

In [50]:
from multi_rake import Rake
rake = Rake()
full_text = " ".join(cleaned_texts_topic)
keywords = rake.apply(full_text)
keywords

[('stage trigger monitor', 9.0),
 ('state trigger monitors', 8.0),
 ('sensor’s action attribute', 6.833333333333334),
 ('action attribute', 4.5),
 ('sensor’s state', 4.333333333333334),
 ('short press', 4.0),
 ('long press', 4.0),
 ('works great', 4.0),
 ('z2m binding', 4.0),
 ('brightness level', 4.0),
 ('can’t answer', 4.0),
 ('ikea switch', 4.0),
 ('variable refers', 4.0),
 ('hobby programmer', 4.0),
 ('made product', 4.0),
 ('on/off switch', 3.666666666666667),
 ('realy don’t', 3.5),
 ('on/off function', 3.166666666666667),
 ('state', 2.0),
 ('on/off', 1.6666666666666667),
 ('2 function', 1.5),
 ('don’t', 1.5),
 ('based', 1.0),
 ('it’s', 1.0),
 ('code', 1.0),
 ('control', 1.0),
 ('lights', 1.0),
 ('dim', 1.0),
 ('bunch', 1.0),
 ('bad', 1.0),
 ('maybee', 1.0),
 ('rewrite', 1.0),
 ('dimming', 1.0),
 ('examples', 1.0),
 ('question', 1.0),
 ('zigbee2mqtt', 1.0),
 ('i’m', 1.0),
 ('rebuild', 1.0)]