In [1]:
from transformers import pipeline
import requests
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

def get_mastodon_data(port,size):
    fission_url = f"http://localhost:{port}/mastodon/{size}"
    response = requests.get(fission_url, verify = False)
    return response.json()

port = 9000
size = 10000
data = get_mastodon_data(port,size)

filtered_data = [{'time': item['_source']['created_at'], 'content': item['_source']['content']} for item in data]

crime_keywords = ['crime', 'theft', 'murder', 'assault', 'fraud', 'robbery', 'burglary', 'arson', 
                  'kidnapping', 'drug', 'trafficking', 'violence','vandalism', 'smuggling', 'extortion', 
    'blackmail', 'embezzlement', 'bribery', 'corruption', 'homicide', 'manslaughter', 'gang', 
    'terrorist', 'terrorism', 'assault', 'battery', 'abuse', 'harassment', 'molestation', 
    'rape', 'domestic violence', 'cybercrime', 'identity theft', 'forgery', 'counterfeit', 
    'human trafficking', 'organized crime', 'illegal', 'unlawfully', 'laundering', 
    'money laundering', 'perjury', 'prostitution', 'racket', 'racketeering', 'sabotage', 
    'scam', 'shoplifting', 'slander', 'stalking', 'swindle', 'terrorism', 'threat', 
    'trespassing', 'underworld', 'weapon', 'weapons', 'smuggling', 'conspiracy']

def check_crime_related(content, keywords):
    stemmer = PorterStemmer()
    
    keyword_stems = {stemmer.stem(keyword) for keyword in keywords}
    
    content_lower = content.lower()
    content_clean = content_lower.translate(str.maketrans('', '', string.punctuation))
    
    words = word_tokenize(content_clean)
    word_stems = {stemmer.stem(word) for word in words}
    
    return any(keyword in word_stems for keyword in keyword_stems)


crime_related_records = [item for item in filtered_data if check_crime_related(item['content'], crime_keywords)]

classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

results = classifier([item['content'] for item in crime_related_records])


if len(crime_related_records) == 0:
    print('None of',size,'are related to crime')
else:
    print(len(crime_related_records),'of',size,'may be related to crime', '\n')
    for record, result in zip(crime_related_records, results):
        print('Time:', record['time'][0:10], record['time'][11:18])
        print('Content:',record['content'])
        print('Sentiment:', result['label'], '\n')

37 of 10000 may be related to crime 

Time: 2024-05-19 17:58:0
Content: Corrupt government models greedy theft to corporations who rob U.K. blind. As all could see our bills are being hiked with no opposition. Expect teethless Keith or the greens to do anything about it? Dream on. UK profiteering as study finds margins rose 30% post-pandemic.https://www.theguardian.com/business/article/2024/may/15/uk-firms-accused-of-profiteering-study-finds-margins-rose-30-percent-post-pandemic
Time: 2024-05-19 17:55:1
Content: ICYMI: #AMUpdate #News Lawmakers, advocates renew calls to pass missing and murdered Black women and girls task force bill https://wausaupilotandreview.com/2024/05/17/lawmakers-advocates-renew-calls-to-pass-missing-and-murdered-black-women-and-girls-task-force-bill/?utm_source=dlvr.it&utm_medium=mastodon
Time: 2024-05-19 17:48:4
Content: Reasons to ban llm stuff, per Gentoo Council member Michał Górny:“1. Copyright concerns.  At this point, the copyright situation around genera