# Classification
The first cell classifies all the messages from the targeted English-language cybersecurity groups in the GroupMonitoringRelease database and, for each group, saves the messages scored as cyber-relevant to the corresponding CSV file.

In [None]:
import os
import re
import torch
import emoji
import csv
import pandas as pd
from pymongo import MongoClient
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
# MongoDB connection string, read from the process environment
# (load_dotenv() is called right after the imports).
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = "GroupMonitoringRelease"
# Path handed to `from_pretrained` for both the tokenizer and the model.
PERCORSO_MODELLO = "/kaggle/input/model-securebert/pytorch/default/1/modello_finale"
# Minimum class-1 probability for a message to be kept in the results.
# NOTE(review): the original note says the threshold was "raised slightly" to
# reduce false positives ("Replied", etc.) -- confirm 0.5 is the intended value.
SOGLIA_CYBER = 0.5

# Output folder: one CSV per analysed group is written here.
OUTPUT_FOLDER = "Results_CSV"
# exist_ok=True avoids the check-then-create race and fails loudly if the path
# exists but is not a directory.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# FULL LIST OF (TARGETED) ENGLISH-LANGUAGE CYBER GROUPS
# Each entry is matched against the `chat_name` field of the `groups` collection.
# NOTE(review): several names below look mis-encoded (mojibake) in this copy of
# the file; they must match the stored `chat_name` exactly -- confirm the file encoding.
GRUPPI_DA_ANALIZZARE = [
    # 1. "Core" Cybersecurity & InfoSec groups
    "Cyber Security - Information Security - IT Security - Experts",
    "Hacking Realm",
    "VirusCheck Chat",  # Great for malware hashes
    
    # 2. Network Security & Attacks (DDoS, Tunneling)
    "DDOS‰∫§ÊµÅÊîªÂáª",     # DDoS attacks (often full of target IPs)
    "HTTP Injector",    # Network tunneling (often used for bypassing)
    "HTTP Injector Chat",
    "Mikrotik-Training", # Router/firewall configuration (often discussions about attacks)
    
    # 3. Darknet & Underground (Russian/English mixed)
    "Only Dark",
    "–¢–µ–Ω–µ–≤–æ–π –î–∞—Ä–∫–Ω–µ—Ç –ß–∞—Ç", # "Shadow Darknet Chat"
    "–•–∞–∫–µ—Ä—ã |–ß–∞—Ç| ùìóùì™ùì¨ùì¥ùìÆùìªùìº ùì¨ùì±ùì™ùìΩ",
    
    # 4. Specific malicious actors
    "–ß–∞—Ç –¥–æ–∫—Å–µ—Ä–æ–≤",     # Doxing (they often share leaks of personal data)
    "HackDroids ‚Äî –ß–∞—Ç"  # Android hacking / mobile malware
]

# Maximum number of messages fetched per group; None means the whole history is analysed.
LIMITE_MESSAGGI_PER_GRUPPO = None # Analyse the full history

# ==============================================================================
# 2. MODEL LOADING (GPU CHECK)
# ==============================================================================
# Use the GPU when PyTorch reports one as available, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚öôÔ∏è Caricamento modello su: {device.upper()}...")

try:
    tokenizer = RobertaTokenizer.from_pretrained(PERCORSO_MODELLO)
    model = RobertaForSequenceClassification.from_pretrained(PERCORSO_MODELLO).to(device)
    # The model is only used for inference, so switch it to evaluation mode.
    model.eval()
    print("‚úÖ Modello caricato e pronto.\n")
except Exception as e:
    print(f"‚ùå ERRORE: Impossibile caricare il modello. {e}")
    # Nothing below can run without the model. Raise SystemExit directly
    # instead of calling the interactive-only `exit()` helper, and use a
    # non-zero status so callers can tell the run failed.
    raise SystemExit(1) from e

# ==============================================================================
# 3. FUNCTIONS (Clean & Classify - NO TRANSLATION)
# ==============================================================================
def clean_and_mask(text):
    """Strip emoji from a message and replace volatile tokens with placeholders.

    Args:
        text: The raw message. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned text, with CVE identifiers, Telegram links, http(s) URLs,
        dotted groups of four 1-3 digit numbers and domains ending in one of
        the listed TLDs replaced by ``[CVE]``, ``[TG_LINK]``, ``[URL]``,
        ``[IP]`` and ``[DOMAIN]`` respectively, runs of whitespace collapsed
        to a single space and leading/trailing whitespace removed.
    """
    if not isinstance(text, str):
        return ""

    masked = emoji.replace_emoji(text, replace='')

    # (pattern, replacement, flags), applied strictly in this order: Telegram
    # links are masked before generic URLs, and URLs before bare IPs/domains.
    rules = (
        (r'CVE-\d{4}-\d+', '[CVE]', re.IGNORECASE),
        (r'(?:https?://)?(?:www\.)?(?:t\.me|telegram\.me)/[a-zA-Z0-9_]+', '[TG_LINK]', 0),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '[URL]', 0),
        (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '[IP]', 0),
        (r'\b(?:[a-zA-Z0-9-]+\.)+(?:com|org|net|io|ru|cn|it|uk|gov)\b', '[DOMAIN]', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in rules:
        masked = re.sub(pattern, replacement, masked, flags=flags)

    return masked.strip()

def classifica_messaggio_batch(testi):
    """Classify a list of messages in a single forward pass (batch processing).

    Every text is first normalised with ``clean_and_mask``, then the whole list
    is tokenised together (padded, truncated to 128 tokens) and run through the
    module-level ``model`` on ``device``.

    Args:
        testi: Sequence of raw message strings.

    Returns:
        A list of floats, one per input text: the softmax probability of
        class index 1 (the "cyber" class according to the original author's
        note). An empty input returns ``[]``. If tokenisation or inference
        fails, the error is reported and every text in the batch gets ``0.0``.
    """
    # Nothing to score: skip the tokenizer, which cannot handle an empty batch.
    if not testi:
        return []

    testi_puliti = [clean_and_mask(t) for t in testi]

    try:
        # Batch tokenisation
        inputs = tokenizer(testi_puliti, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Column 1 of the probabilities, converted to a plain Python list.
        return probs[:, 1].tolist()
    except Exception as e:
        # Best-effort fallback: keep the run alive, but report the failure so a
        # dropped batch is not mistaken for "no cyber content found".
        print(f"[WARN] Classificazione fallita per un batch di {len(testi)} messaggi (score impostati a 0.0): {e!r}")
        return [0.0] * len(testi)

# ==============================================================================
# 4. DB CONNECTION & CORE LOGIC
# ==============================================================================
print("üîå Connessione a MongoDB...")
try:
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    db = client[DB_NAME]
    groups_coll = db["groups"]
    # Force a round trip now so an unreachable server is detected here
    # rather than at the first query.
    client.server_info()
except Exception as e:
    print(f"‚ùå Errore connessione DB: {e}")
    print("‚ö†Ô∏è Se sei su Kaggle, verifica che l'IP dell'Universit√† accetti connessioni esterne!")
    # Raise SystemExit directly instead of calling the interactive-only
    # `exit()` helper, and use a non-zero status so the failure is visible
    # to callers.
    raise SystemExit(1) from e

# Select only the group documents whose chat_name is in the target list.
query_gruppi = {"chat_name": {"$in": GRUPPI_DA_ANALIZZARE}}
target_groups = list(groups_coll.find(query_gruppi))
print(f"üìã Trovati {len(target_groups)} gruppi target.")

# ==============================================================================
# 5. OPTIMISED ANALYSIS LOOP (BATCHING)
# ==============================================================================
BATCH_SIZE = 32  # Number of messages scored together in one model call

# Only these message fields are read below; projecting them keeps the
# in-memory message list as small as possible.
MESSAGE_FIELDS = {"id": 1, "date": 1, "from_id": 1, "message": 1}

try:
    for group_doc in target_groups:
        # A group without a message collection cannot be analysed.
        coll_name = group_doc.get("collection_name")
        if not coll_name:
            continue
        chat_name = group_doc.get("chat_name", "Unknown")

        # Build the file name: drop characters that are illegal on
        # Windows/Linux and turn spaces into underscores.
        safe_filename = re.sub(r'[\\/*?:"<>|]', "", chat_name).replace(" ", "_")
        csv_path = os.path.join(OUTPUT_FOLDER, f"{safe_filename}.csv")

        print(f"\n{'='*60}")
        print(f"üìÇ Gruppo: {chat_name}")

        # Newest messages first, optionally capped per group.
        msg_coll = db[coll_name]
        cursor = msg_coll.find({}, MESSAGE_FIELDS).sort("date", -1)
        if LIMITE_MESSAGGI_PER_GRUPPO:
            cursor = cursor.limit(LIMITE_MESSAGGI_PER_GRUPPO)

        # Everything is loaded in memory so it can be sliced into batches
        # (original note: with less than 16GB of RAM, lower the query limit).
        msgs = list(cursor)
        total_msgs = len(msgs)
        print(f"   Messaggi totali: {total_msgs}")

        if total_msgs == 0:
            continue

        risultati_gruppo = []

        # Process the messages in blocks of BATCH_SIZE.
        for start in tqdm(range(0, total_msgs, BATCH_SIZE), desc="GPU Processing"):
            batch_docs = msgs[start : start + BATCH_SIZE]

            # Keep only documents whose text is a string longer than 4
            # characters; media-only, very short or malformed messages are skipped.
            valid_docs = [
                doc for doc in batch_docs
                if isinstance(doc.get("message"), str) and len(doc["message"]) > 4
            ]
            if not valid_docs:
                continue
            valid_texts = [doc["message"] for doc in valid_docs]

            # Model inference
            scores = classifica_messaggio_batch(valid_texts)

            # Documents, texts and scores are aligned by position, so they can
            # be walked together without any index lookup.
            for doc, text, score in zip(valid_docs, valid_texts, scores):
                if score >= SOGLIA_CYBER:
                    risultati_gruppo.append({
                        "message_id": doc.get("id"),
                        "date": doc.get("date"),
                        "user": doc.get("from_id"),  # or the username, if available
                        "text": text,  # original (uncleaned) text
                        "score": round(score, 4)
                    })

        # Write this group's CSV
        if risultati_gruppo:
            df_res = pd.DataFrame(risultati_gruppo)
            df_res.to_csv(csv_path, index=False)
            print(f"   ‚úÖ Salvato: {csv_path} ({len(risultati_gruppo)} minacce)")
        else:
            print("   üí§ Nessuna minaccia trovata in questo gruppo.")

    print("\n" + "="*60)
    print(f"üèÅ Finito! Controlla la cartella '{OUTPUT_FOLDER}'")
finally:
    # Release the MongoDB connection even if a group fails midway.
    client.close()

2025-11-25 14:04:14.261520: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764079454.459540      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764079454.514956      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

‚öôÔ∏è Caricamento modello su: CUDA...
‚úÖ Modello caricato e pronto.

üîå Connessione a MongoDB...
üìã Trovati 12 gruppi target.

üìÇ Gruppo: HTTP Injector Chat
   Messaggi totali: 25042


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 783/783 [00:54<00:00, 14.40it/s]


   ‚úÖ Salvato: Results_CSV/HTTP_Injector_Chat.csv (3114 minacce)

üìÇ Gruppo: Cyber Security - Information Security - IT Security - Experts
   Messaggi totali: 9489


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 297/297 [00:29<00:00, 10.03it/s]


   ‚úÖ Salvato: Results_CSV/Cyber_Security_-_Information_Security_-_IT_Security_-_Experts.csv (351 minacce)

üìÇ Gruppo: Hacking Realm
   Messaggi totali: 4128


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 129/129 [00:27<00:00,  4.66it/s]


   ‚úÖ Salvato: Results_CSV/Hacking_Realm.csv (145 minacce)

üìÇ Gruppo: HTTP Injector
   Messaggi totali: 37359


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1168/1168 [02:30<00:00,  7.74it/s]


   ‚úÖ Salvato: Results_CSV/HTTP_Injector.csv (5095 minacce)

üìÇ Gruppo: Mikrotik-Training
   Messaggi totali: 14735


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461/461 [01:41<00:00,  4.54it/s]


   ‚úÖ Salvato: Results_CSV/Mikrotik-Training.csv (2909 minacce)

üìÇ Gruppo: Only Dark
   Messaggi totali: 168934


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5280/5280 [23:37<00:00,  3.73it/s]


   ‚úÖ Salvato: Results_CSV/Only_Dark.csv (41580 minacce)

üìÇ Gruppo: –•–∞–∫–µ—Ä—ã |–ß–∞—Ç| ùìóùì™ùì¨ùì¥ùìÆùìªùìº ùì¨ùì±ùì™ùìΩ
   Messaggi totali: 61326


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1917/1917 [04:56<00:00,  6.46it/s]


   ‚úÖ Salvato: Results_CSV/–•–∞–∫–µ—Ä—ã_–ß–∞—Ç_ùìóùì™ùì¨ùì¥ùìÆùìªùìº_ùì¨ùì±ùì™ùìΩ.csv (22401 minacce)

üìÇ Gruppo: –¢–µ–Ω–µ–≤–æ–π –î–∞—Ä–∫–Ω–µ—Ç –ß–∞—Ç
   Messaggi totali: 74962


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2343/2343 [09:37<00:00,  4.06it/s]


   ‚úÖ Salvato: Results_CSV/–¢–µ–Ω–µ–≤–æ–π_–î–∞—Ä–∫–Ω–µ—Ç_–ß–∞—Ç.csv (13546 minacce)

üìÇ Gruppo: DDOS‰∫§ÊµÅÊîªÂáª
   Messaggi totali: 42272


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1321/1321 [06:55<00:00,  3.18it/s]


   ‚úÖ Salvato: Results_CSV/DDOS‰∫§ÊµÅÊîªÂáª.csv (3598 minacce)

üìÇ Gruppo: –ß–∞—Ç –¥–æ–∫—Å–µ—Ä–æ–≤
   Messaggi totali: 316386


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9888/9888 [22:34<00:00,  7.30it/s]  


   ‚úÖ Salvato: Results_CSV/–ß–∞—Ç_–¥–æ–∫—Å–µ—Ä–æ–≤.csv (120895 minacce)

üìÇ Gruppo: HackDroids ‚Äî –ß–∞—Ç
   Messaggi totali: 654


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:04<00:00,  4.80it/s]


   ‚úÖ Salvato: Results_CSV/HackDroids_‚Äî_–ß–∞—Ç.csv (255 minacce)

üìÇ Gruppo: VirusCheck Chat
   Messaggi totali: 18505


GPU Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 579/579 [01:30<00:00,  6.43it/s]


   ‚úÖ Salvato: Results_CSV/VirusCheck_Chat.csv (5838 minacce)

üèÅ Finito! Controlla la cartella 'Results_CSV'
