In [3]:
import miniflux
import nltk
import os
from dotenv import load_dotenv
load_dotenv()

# Fetch the NLTK data packages used later in the notebook: 'stopwords' is read
# by extract_keywords(); 'vader_lexicon' is presumably what
# SentimentIntensityAnalyzer relies on — TODO confirm.
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Connection settings. The API key is read from the environment (load_dotenv()
# above runs first); os.getenv returns None when MINIFLUX_API_KEY is not set.
MINIFLUX_URL = "https://rss.homehub.tv"
API_KEY = os.getenv("MINIFLUX_API_KEY")

# Shared Miniflux client; functions in later cells use this module-level name.
client = miniflux.Client(MINIFLUX_URL, api_key=API_KEY)

def get_feed_ids():
    """Return the "id" of every feed reported by the shared Miniflux client."""
    return [feed["id"] for feed in client.get_feeds()]

[nltk_data] Downloading package stopwords to /home/codyt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/codyt/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [33]:
def remove_duplicates(feed_ids):
    """
    Mark repeated entries within each feed as removed.

    Within a single feed, an entry counts as a duplicate when its URL or its
    title was already seen on an earlier entry of that same feed. Duplicates
    from all feeds are collected and updated in one call at the end.

    Args:
        feed_ids (list): Feed IDs to scan.

    Returns:
        None
    """
    duplicates = []
    for feed_id in feed_ids:
        response = client.get_feed_entries(feed_id=feed_id, limit=10000, starred=False, status=["read","unread"])
        # Tracking is per feed: the same story in two different feeds is kept.
        known_urls, known_titles = set(), set()
        for item in response["entries"]:
            if item["url"] not in known_urls and item["title"] not in known_titles:
                # First sighting: remember it and move on.
                known_urls.add(item["url"])
                known_titles.add(item["title"])
                continue
            duplicates.append(item["id"])

    if not duplicates:
        return
    client.update_entries(duplicates, status="removed")

if __name__ == "__main__":
    # Get the feed_ids
    # NOTE(review): remove_duplicates() is defined in this cell but is not
    # called here; only the feed IDs are fetched — confirm this is intentional.
    feed_ids = get_feed_ids()


In [1]:
# Options

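# Current blocklist, written as a Miniflux block rule of the form "EntryTitle=(?i)(kw1|kw2|...)"; it is merged with the generated blocklist in a later cell.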
current_blocklist = "EntryTitle=(?i)(49ers|86|aaron\ rodgers|abandoned|abuse|abused|academic\ documents|accident|accidental|accidentally|accusation|accuse|accused|adhd|adversaries|adversity|afc|aggression|agitator|airport|alarm|alarming|alarmingly|alarms|alive|allegations|allegedly|allergic|alone|amazon|amazons|amendment|anger|angry|annoy|anti|antisemitism|anxiety|appeal|apple|arguments|armed|arrest|arrested|asbestos|assassination|assault|assaulted|at\&t|attack|attacked|attacking|attacks|averted|avoid|avoids|awkward|axe|baby|bacteria|bad|ban|banned|barrier|basketball|battle|battlefield|battles|battling|berated|bias|biased|bike|bing|bird\ flu|bird\-flu|bitter|bittersweet|black\ friday|blame|blamed|blames|block|blocked|blocking|blocks|blurry|bomb|border|bored|boredom|bores|boring|botox|boxing|bribe|broke|broken|brutal|bullshit|burglar|cadillac|california|california\ house\ race|cancel|cancer|canva|car\ crash|case|catastrophic|cautious|cbs|championship|chaos|chaotic|charged|cheat|cheating|chicago|child\ care|childcare|china|chokes|christmas|cnn|coach|collapse|collapsed|collapses|collision|coma|combat|complaining|complaint|concert|condemns|confuse|confused|confuses|confusing|confusion|conspiracy|contagious|contamination|contempt|controversial|convicted|conviction|coral\ death|costly|counterprotest|coup|court|covid|cowboys|crash|crazy|crime|criminal|crisis|critical|critics|crude|cry|cut|cuts|cutting|cynics|damage|damaging|damn|dancing\ with\ the\ stars|danger|dangerous|darkest|data\ breach|daydreaming|dead|deadly|deal|deals|death|debt|decay|defeat|defects|defendant|dehumanizing|dei|delay|delta|demand|demanding|democrat|democrats|dems|denied|denies|depressed|depression|desperate|desperate\ housewives|desperation|destroy|destruct|detained|devastating|diddy|die|dies|diet\ 
shifts|dilemma|dirt|dirty|disabilities|disagree|disappeared|disappointments|disaster|discarded|discord|discounts|discriminated|disliked|dislikes|disorder|disruption|disruptions|distractions|disturb|diverse|divisive|dnc|doj|dominating|doomsday|doubt|downing|downside|dragged|dread|drop|drought|drowned|drugs|drunk|dump|dumping|dwellers|e\.\ coli|e\.\ coli\ outbreak|earnings|election|emissions|empty|endangered|envy|error|errors|evacuates|eviction|evil|excel|execution|exhausting|expensive|exploit|exploits|expose|exposed|exposes|exposing|extinction|f1|faa|face\ the\ nation|faces|fail|failed|failing|fails|failure|failures|fake|faking|fall|falling|fans|farewell|fashion|fatal|fatigue|favorite|fbi|fear|fears|fight|fighting|fire|firefox|fires|firing|fitness|flood|flu|flunks|fool|fooled|fools|football|forced|forget|fought|frantic|fraud|freedom|frustrating|fu|fuck|fucked|fun\ facts|funeral|funneling|gambling|game|gameday|games|gaming\ monitor|gang|geek|gemini|gender|ghost|giants|gifts|gloomy|google\ maps|google\ play|gop|governor|guilty|gun|hacked|haitian|hard|hardships|harris|harvey\ weinstein|hate|haunting|havoc|hell|hezbollah|hides|hiding|hoax|holiday|horrifying|horror|horrors|hostage|hostile|house\ ethics|house\ oversight\ committee|house\ race|hurt|icloud|identity\ politics|ignore|ignored|ignoring|ill|illegal|illnesses|immigrants|immigration|inability|incident|indicted|infected|infringement|inhibition|injury|insane|inspire|ipad|iphone|iran|irate|irrational|isolation|israel|israeli|jail|jealous|jerk\ face|job\ interviews|judge|kill|killed|killer|killing|kills|l\.a\.|law|lawsuit|lawsuits|leak|leaked|leather\ cases|leave|leaves|liable|light\ saber|limited|lonely|looming|looms|loose|los\ angeles|lose|loses|losing|loss|losses|lost|low|lowdown|lower|lowering|lowers|lowest|lurking|lying|mad|majority|malaria|manipulating|march|marine\ algae|matt\ gaetz|mcdonald's|medicare|mess|michelle\ obama|microplastics|microsoft|migrant|minimum\ 
wage|minorities|miserable|misinformation|miss|missed|missing|mistakes|misunderstands|mock|mocking|motherhood|mourn|msnbc|murder|mutilated|naive|nasty|nazi|nazis|neck\ tattoo|negativity|neglected|neglecting|neo\-nazi|nerves|nervous|nfc|nfl|nhl|nintendo|north\ korea|notorious|nwsl|obese|obsessed|obsession|obsolete|obstacles|offenders|officials|offline|outrage|overweight|ovulation|packers|pain|pandemic|papers|paradox|parole|patients|pay|penalty|plane|playstation|pleads|poisoned|poisoning|pokémon|police|political|pollution|poor|poorer|postpones|postponing|poverty|pregnancy|pregnant|prejudice|pressure|preventing|price|prick|pride|prison|problem|problematic|problems|prosecution|protest|punished|putin|quarterback|questioning|races|radical|rainbow|rampant|rape|rebel|reckless|reject|rejection|rejects|relationship|renters|repetitive|repressive|republican|republicans|resentment|resigns|restless|revenge|review|ridiculously|rig|riot|risk|risks|risky|rnc|ruining|ruptured|russia|sabotage|sad|sadder|safari\ sync|safety|samsung|sanctuary|save|scam|scams|scare|school|screaming|screams|scrutiny|senate|sentenced|sentencing|serious|severely|sexual|sexually|shake|shame|shattered|shit|shock|shocking|shoot|shooter|shooting|shot|sick|sickened|sin|sisterhood|skeptical|skin|skull\ whistles|slammed|slams|slash|slashes|slave|sluggish|smile|smog|snafu|soccer|sony|sorry|spam|spill|spotify|spree|stadium|stalker|stalled|state|steal|stealing|stealth|stealthy|steam|stimulus|stolen|stop|stopped|stopping|stops|strange|stress|stressful|strike|strikes|struggle|struggles|struggling|stubborn|stuck|stupid|stupidly|suck|sucked|sucks|sue|sues|suffered|suffers|sunday|super\ bowl|suspect|suspected|suspicion|suspicious|t\-mobile|tantrums|tax\ credits|taylor\ swift|team\ meeting|teases|teasing|tension|terrible|terrifying|terror|terrorism|terrorist|terrorists|testicles|testify|thanksgiving|the\ 
view|threat|threatens|threats|tiktok|tired|touchdown|tough|toughest|trafficking|tragic|trans|transgender|trap|trauma|traumatic|travis\ kelce|trial|trick|tricks|tricky|trivial|trouble|troubles|troubling|tuituion|tumor|tumultuous|turmoil|twin\ sister|ugly|ukraine|unappreciated|uncertainty|uncomfortable|undermine|unfortunate|unfulfilled|unintelligent|united|unjust|unmatched|unstable|usb|useless|vaccine|vacuum|valentines|victims|victory|violation|violence|violently|voters|vulnerabilities|vulnerability|vulnerable|war|warn|warned|warning|warns|warrant|wars|waste|weaknesses|weapons|weather|weird|wicked|wildfires|windows|winning|winter|witness|wnba|woes|woke|word|wordpress\.com|world\ of\ warcraft|worn|worried|worrying|worse|worsens|worst|worthless|wrestlemania|wrestling|wrong|wwe|xbox|zelenskyy)"

In [4]:
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

def extract_keywords(feed_entries):
    """
    Step 1: Build the set of distinct, non-stop-word tokens found in each feed's titles.

    Titles are lowercased and tokenized with the regex \\b\\w+\\b; tokens that
    appear in NLTK's English stop-word list are discarded.

    Args:
        feed_entries (dict): Maps a feed ID to a list of article titles.

    Returns:
        dict: Maps each feed ID that had at least one title to a list of its
        distinct keywords (order is whatever the underlying set yields).
    """
    english_stop_words = set(stopwords.words('english'))
    tokens_by_feed = {}

    for feed_id, titles in feed_entries.items():
        for title in titles:
            # The bucket is created on the first title, so feeds with no
            # titles never appear in the result.
            bucket = tokens_by_feed.setdefault(feed_id, set())
            for token in re.findall(r"\b\w+\b", title.lower()):
                if token not in english_stop_words:
                    bucket.add(token)

    result = {}
    for feed_id, tokens in tokens_by_feed.items():
        result[feed_id] = list(tokens)
    return result

def combine_and_deduplicate_keywords(keywords_per_feed):
    """
    Step 2 & 3: Merge the per-feed keyword lists into one list without repeats.

    No filtering is applied here; the keywords are taken as given.

    Args:
        keywords_per_feed (dict): Maps a feed ID to a list of keywords.

    Returns:
        list: Every distinct keyword across all feeds, in no particular order.
    """
    merged = {
        keyword
        for feed_keywords in keywords_per_feed.values()
        for keyword in feed_keywords
    }
    return list(merged)

def get_negative_keywords(all_keywords):
    """
    Step 4: Keep only the keywords that NLTK's SentimentIntensityAnalyzer scores as negative.

    Args:
        all_keywords (list): Combined, deduplicated keywords.

    Returns:
        list: The keywords whose 'compound' polarity score is below zero,
        in their original order.
    """
    vader = SentimentIntensityAnalyzer()

    negatives = []
    for word in all_keywords:
        compound = vader.polarity_scores(word)['compound']
        if compound < 0:
            negatives.append(word)
    return negatives

def process_feeds(feed_ids):
    """
    Fetch titles for the given feeds and derive the negative-sentiment keywords.

    Pipeline: fetch entries per feed -> extract keywords per feed -> merge and
    deduplicate -> keep keywords with negative sentiment. The result is also
    printed.

    Args:
        feed_ids (list): List of feed IDs.

    Returns:
        list: The negative-sentiment keywords.
    """
    titles_by_feed = {}
    for feed_id in feed_ids:
        response = client.get_feed_entries(feed_id=feed_id, limit=10000, starred=False, status=["read", "unread"])

        # Feeds without entries contribute nothing and are left out.
        if not response["entries"]:
            continue
        titles_by_feed[feed_id] = [item["title"] for item in response["entries"]]

    # Steps 1-4 chained: per-feed keywords -> merged unique list -> negatives.
    per_feed = extract_keywords(titles_by_feed)
    merged = combine_and_deduplicate_keywords(per_feed)
    negative_keywords = get_negative_keywords(merged)

    print(f"Negative Keywords: {negative_keywords}")
    return negative_keywords

if __name__ == "__main__":
    # NOTE(review): only NameError is caught and it is merely printed. If it
    # fires, negative_keywords is never assigned, and the later cell that
    # builds the blocklist from negative_keywords will fail.
    try:
        # Fetch the IDs of all feeds from the Miniflux client
        feed_ids = get_feed_ids()

        # Process feeds to extract and analyze negative sentiment keywords
        negative_keywords = process_feeds(feed_ids)
    
    except NameError as e:
        print(f"Error: {e}")


Negative Keywords: ['craziest', 'defect', 'dispute', 'shaky', 'passionless', 'confrontation', 'uncomfortably', 'avert', 'complain', 'violated', 'wtf', 'derails', 'noisy', 'feuds', 'upset', 'hesitant', 'pesky', 'expelled', 'fiery', 'worries', 'harm', 'distorts', 'unsettled', 'screws', 'unemployment', 'hide', 'exclusion', 'ignorant', 'suicide', 'fault', 'charges', 'disappears', 'censored', 'difficulty', 'dismal', 'contentious', 'slam', 'weaken', 'neuroticism', 'insulted', 'wreck', 'uncertain', 'harsh', 'lies', 'backs', 'scold', 'severe', 'nuts', 'stunned', 'overlooked', 'defer', 'worry', 'disappointed', 'suspend', 'scandal', 'shortage', 'grievances', 'enemies', 'smear', 'frustrated', 'offending', 'puking', 'rages', 'unwelcome', 'lazy', 'colliding', 'wasting', 'bloody', 'gross', 'raging', 'greedy', 'weapon', 'graveyard', 'hating', 'distracted', 'crush', 'defiant', 'adversarial', 'chastises', 'misleading', 'degrade', 'uncontrollably', 'shaking', 'dodgy', 'difficult', 'freezing', 'beating',

In [8]:
def generate_miniflux_blocklist(bad_keywords):
    """
    Converts a list of bad_keywords into a Miniflux blocklist regex format.

    Each keyword is escaped so it is matched literally. Blank keywords are
    dropped and repeated keywords are emitted once (first occurrence wins):
    an empty alternative such as "()" or "a||b" matches every title, which
    would turn the rule into "block everything".

    Args:
        bad_keywords (list): A list of bad_keywords to include in the regex.

    Returns:
        str: A formatted Miniflux blocklist string of the form
        "EntryTitle=(?i)(kw1|kw2|...)", or an empty string when no usable
        keyword remains.
    """
    import re

    # dict.fromkeys de-duplicates while keeping the caller's order.
    usable_keywords = dict.fromkeys(
        keyword for keyword in bad_keywords if keyword and keyword.strip()
    )
    if not usable_keywords:
        # No rule at all is safer than a rule that matches every title.
        return ""

    # Escape special characters so each keyword is treated literally, then
    # join the alternatives with the regex OR operator '|'.
    regex_pattern = "|".join(re.escape(keyword) for keyword in usable_keywords)

    # Format the final output with case-insensitivity flag (?i)
    return f"EntryTitle=(?i)({regex_pattern})"

# Custom keywords to block. Despite the "_removal" suffix, these are extra
# keywords that get added to the blocklist (they are concatenated with
# negative_keywords below).
custom_keywords_removal = [
    'ripped',
    'despicable'
]


# negative_keywords is produced by the sentiment-analysis cell above; that
# cell must have run successfully before this one.
blocklist = generate_miniflux_blocklist(negative_keywords + custom_keywords_removal)
print(blocklist)

EntryTitle=(?i)(craziest|defect|dispute|shaky|passionless|confrontation|uncomfortably|avert|complain|violated|wtf|derails|noisy|feuds|upset|hesitant|pesky|expelled|fiery|worries|harm|distorts|unsettled|screws|unemployment|hide|exclusion|ignorant|suicide|fault|charges|disappears|censored|difficulty|dismal|contentious|slam|weaken|neuroticism|insulted|wreck|uncertain|harsh|lies|backs|scold|severe|nuts|stunned|overlooked|defer|worry|disappointed|suspend|scandal|shortage|grievances|enemies|smear|frustrated|offending|puking|rages|unwelcome|lazy|colliding|wasting|bloody|gross|raging|greedy|weapon|graveyard|hating|distracted|crush|defiant|adversarial|chastises|misleading|degrade|uncontrollably|shaking|dodgy|difficult|freezing|beating|hid|freaking|imperfect|foes|mistake|resigned|dominate|odd|lazier|abusive|restrict|disabling|blind|isolated|enemy|crap|lame|interruption|intimidate|discomfort|grave|indecisive|kia|faulty|hysteria|distorting|cries|provoked|imposes|criticizing|strangled|aggressive|cr

In [9]:
def _split_alternatives(pattern):
    """Split a regex alternation on '|' while leaving escaped pipes ('\\|') intact."""
    parts = []
    current = []
    escaped = False
    for char in pattern:
        if escaped:
            # Character following a backslash belongs to the same keyword.
            current.append(char)
            escaped = False
        elif char == "\\":
            current.append(char)
            escaped = True
        elif char == "|":
            parts.append("".join(current))
            current = []
        else:
            current.append(char)
    parts.append("".join(current))
    return parts


def combine_miniflux_blocklists(blocklists):
    """
    Combines multiple Miniflux blocklists into a single blocklist.

    Every input of the form "...(?i)(kw1|kw2|...)" contributes its
    alternatives; inputs that do not contain that shape are ignored. Keywords
    are lowercased (the rule is case-insensitive anyway), de-duplicated and
    sorted. Empty alternatives are discarded because an empty alternative
    matches every title. Nested groups inside the alternation are not
    understood; only a flat list of alternatives is supported.

    Args:
        blocklists (list): A list of Miniflux blocklist strings.

    Returns:
        str: A single combined Miniflux blocklist string, or an empty string
        when no keyword could be extracted.
    """
    import re

    # Extract keywords from each blocklist
    combined_keywords = []
    for blocklist in blocklists:
        # Extract the part inside the parentheses
        match = re.search(r"\(\?i\)\((.*)\)", blocklist)
        if match:
            combined_keywords.extend(_split_alternatives(match.group(1)))

    # Lowercase, drop empty alternatives, remove duplicates and sort
    unique_keywords = sorted({keyword.lower() for keyword in combined_keywords if keyword})

    if not unique_keywords:
        # No rule at all is safer than "EntryTitle=(?i)()", which matches everything.
        return ""

    # Join the keywords with the regex OR operator '|'
    combined_regex = '|'.join(unique_keywords)

    # Format the final output with case-insensitivity flag (?i)
    return f"EntryTitle=(?i)({combined_regex})"


# Merge the existing rule (current_blocklist, from the options cell) with the
# freshly generated one (blocklist, from the previous cell) into a single rule.
blocklists = [
    current_blocklist,
    blocklist
]

# combined_blocklist is consumed by the article-removal cell below.
combined_blocklist = combine_miniflux_blocklists(blocklists)
print(combined_blocklist)




In [None]:
import re

def remove_unwanted_articles(feed_ids, blocklist):
    """Process articles to remove those matching blocklist regex rules.

    The blocklist must have the form "EntryTitle=(?i)(...)". The part inside
    the outer parentheses is compiled as a case-insensitive regex and searched
    for in every entry title; matching entries are marked "removed" in bulk,
    one update call per feed.

    Nothing is fetched or removed (a message is printed instead) when:
      * the blocklist does not have the expected format,
      * the pattern is not a valid Python regex, or
      * the pattern can match the empty string (e.g. "()" or "a||b"), because
        such a pattern matches every title and would remove every entry.

    Args:
        feed_ids (list): Feed IDs whose entries are checked.
        blocklist (str): Miniflux block rule, "EntryTitle=(?i)(kw1|kw2|...)".

    Returns:
        None
    """
    rule_prefix = "EntryTitle=(?i)("

    # Clean the blocklist by removing "EntryTitle=(?i)" and the surrounding parentheses
    if not (blocklist.startswith(rule_prefix) and blocklist.endswith(")")):
        print("Invalid blocklist format. Ensure it follows 'EntryTitle=(?i)(...)'.")
        return
    cleaned_blocklist = blocklist[len(rule_prefix):-1]

    # Compile the cleaned regex pattern for case-insensitive matching
    try:
        regex = re.compile(cleaned_blocklist, re.IGNORECASE)
    except re.error as exc:
        print(f"Invalid blocklist regex: {exc}")
        return

    # Safety net: an unanchored search with a pattern that matches "" succeeds
    # on every title, which would wipe all fetched entries.
    if regex.search(""):
        print("Blocklist pattern matches the empty string and would remove every article. Nothing removed.")
        return

    total_removed = 0
    for feed_id in feed_ids:
        # Fetch entries from the feed
        entries = client.get_feed_entries(feed_id=feed_id, limit=10000, starred=False, status=["read", "unread"])

        # Collect the IDs of entries whose title matches, for bulk removal
        removed = [entry["id"] for entry in entries["entries"] if regex.search(entry["title"])]

        # Perform bulk removing of bad articles
        if removed:
            client.update_entries(removed, status="removed")
            total_removed += len(removed)
            print(f"Removed {len(removed)} articles from feed {feed_id}.")

    print(f"Finished processing feeds. Total removed: {total_removed}")


if __name__ == "__main__":

    # Get the feed_ids
    feed_ids = get_feed_ids()
    
    # Ensure 'client' is defined and authenticated before calling the function.
    # This call is destructive: matching entries are marked "removed".
    # combined_blocklist comes from the blocklist-merging cell above.
    remove_unwanted_articles(feed_ids, combined_blocklist)


Finished processing feeds. Total removed: 0
