In [2]:
import joblib
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups
import joblib


def get_mastodon_data(port, size, timeout=60):
    """Fetch Mastodon records from the service listening on localhost.

    Args:
        port: TCP port of the local HTTP endpoint.
        size: Value substituted into the ``/mastodon/{size}`` path segment
            (presumably the number of records requested -- confirm against
            the service).
        timeout: Seconds ``requests`` waits for the connection and for each
            read before giving up. Without it a stalled service would block
            the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status.
        requests.RequestException: The connection failed or timed out.
        ValueError: The response body is not valid JSON.
    """
    fission_url = f"http://localhost:{port}/mastodon/{size}"
    # No `verify` argument: the URL is plain HTTP, so there is no TLS
    # certificate to check and disabling verification would be a no-op.
    response = requests.get(fission_url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error payload to
    # callers that expect a list of records.
    response.raise_for_status()
    return response.json()
    
# Train a keyword-labelled crime / non-crime text classifier on a
# 20-newsgroups subset, persist it, then apply it to Mastodon posts fetched
# from the local service.

port = 9000
size = 10000

# Mastodon records to classify. This name must stay bound to the Mastodon
# payload: the extraction step further down indexes each item by '_source'.
data = get_mastodon_data(port, size)

# Training corpus. Bound to its own name so it does not clobber `data`.
categories = ['rec.sport.hockey', 'sci.crypt', 'talk.politics.mideast', 'sci.space']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
X_train = newsgroups.data

crime_keywords = [
    'crime', 'theft', 'murder', 'assault', 'fraud', 'robbery', 'burglary', 'arson',
    'kidnapping', 'drug trafficking', 'violence', 'vandalism', 'smuggling', 'extortion',
    'blackmail', 'embezzlement', 'bribery', 'corruption', 'homicide', 'manslaughter',
    'gang activity', 'terrorism', 'battery', 'abuse', 'harassment',
    'molestation', 'rape', 'domestic violence', 'cybercrime', 'identity theft',
    'forgery', 'counterfeit', 'human trafficking', 'organized crime', 'illegal activity',
    'money laundering', 'perjury', 'prostitution', 'racketeering', 'sabotage', 'scam',
    'shoplifting', 'slander', 'stalking', 'swindle', 'threat',
    'trespassing', 'underworld activity', 'weapon trafficking', 'conspiracy',
]

# Weak labels: a document is 'crime' when any keyword occurs in it as a
# case-insensitive substring. Each document is lower-cased once rather than
# once per keyword.
# NOTE(review): substring matching also fires inside longer words (e.g.
# 'rape' in 'grape'), so these labels are noisy.
y_train = [
    'crime' if any(keyword in lowered for keyword in crime_keywords) else 'non-crime'
    for lowered in (text.lower() for text in X_train)
]

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipeline.fit(X_train, y_train)

# Persist the two fitted steps separately so they can be reloaded on their own.
joblib.dump(pipeline.named_steps['tfidfvectorizer'], 'tfidf_vectorizer.joblib')
joblib.dump(pipeline.named_steps['logisticregression'], 'text_classifier.joblib')

# Keep only the timestamp and text of each record.
# Assumes `data` is a list of dicts shaped like {'_source': {'created_at': ...,
# 'content': ...}} -- confirm against the service's response format.
filtered_data = [
    {'time': item['_source']['created_at'], 'content': item['_source']['content']}
    for item in data
]

# Reload from disk so that inference runs on the persisted artifacts.
vectorizer = joblib.load('tfidf_vectorizer.joblib')
classifier = joblib.load('text_classifier.joblib')

contents = [item['content'] for item in filtered_data]

if contents:
    X = vectorizer.transform(contents)
    predictions = classifier.predict(X)
else:
    # scikit-learn estimators raise on zero-sample input, so an empty fetch
    # is reported as "no predictions" instead of crashing.
    X = None
    predictions = []

crime_related_records = [content for content, pred in zip(contents, predictions) if pred == 'crime']

print(len(crime_related_records))

for content, label in zip(contents, predictions):
    print(content)
    print(label, '\n')



#for record in crime_related_records:
#    print(record)


0
Ich würde mich ja nicht allein auf den Wahlomat verlassen, aber ich bin ganz froh, dass ich am wenigsten mit der AfD übereinstimme.
non-crime 

Loneliness Is a Problem That A.I. Won’t SolveHuman touch matters. #presshttps://www.nytimes.com/2024/05/18/opinion/artificial-intelligence-loneliness.html?utm_source=press.coop
non-crime 

Disneyland character and parade performers in California vote to join labor unionDisneyland performers who help bring Mickey Mouse, Cinderella and other beloved characters to life have chosen to unionize following a three-day vote #presshttps://www.independent.co.uk/news/disneyland-ap-california-anaheim-cinderella-b2547455.html?utm_source=press.coop
non-crime 

Artist: notafurrytho https://www.furaffinity.net/view/56674451/ #furry #yiff #furryartwork
non-crime 


non-crime 

https://www.europesays.com/1226027/ MP @ Szentendre #hungary
non-crime 

sunbeams flicker offbirds flying into the clouds -fleeting vision ends#haiku #smallpoems Kigo: 鳥雲に入る (Tori kumo 