In [1]:
import pandas as pd
import requests
import time
import os
import base64
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# VirusTotal API key, read from the VIRUS_API_KEY environment variable.
# os.getenv returns None when the variable is not set.
VT_API_KEY = os.getenv("VIRUS_API_KEY")

# ================= CONFIGURATION =================
INPUT_CSV = "IOC_DATASET.csv"
OUTPUT_CSV = "ENRICHED_IOC_DATASET.csv" # Final file, used for the charts

# IOC types to analyze. Note: get_vt_type() also accepts 'hash', which is
# not listed here, so rows typed 'hash' are filtered out before scanning.
TARGET_TYPES = ['ip4', 'md5', 'sha1', 'sha256', 'url', 'fqdn']

# ================= FUNZIONI =================
def get_vt_type(row_type):
    """Translate a dataset IOC type label into a VirusTotal v3 collection name.

    Args:
        row_type: Type label taken from the dataset (e.g. 'ip4', 'sha256').

    Returns:
        'ip_addresses', 'files', 'urls' or 'domains' for the handled labels,
        otherwise None.
    """
    # Ordered (labels, collection) pairs; the first group containing the
    # label decides the result.
    collections = (
        (('ip4',), 'ip_addresses'),
        (('md5', 'sha1', 'sha256', 'hash'), 'files'),
        (('url',), 'urls'),
        (('fqdn',), 'domains'),
    )
    for labels, collection in collections:
        if row_type in labels:
            return collection
    return None

def check_virustotal(ioc, ioc_type, timeout=30):
    """Look up one IOC on the VirusTotal v3 API and summarize its last analysis.

    Args:
        ioc: Indicator value (IP address, file hash, URL or domain) as a string.
        ioc_type: Dataset type label; mapped to a VT collection by get_vt_type().
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        - On HTTP 200: a dict with ``ioc_value``, ``vt_malicious``,
          ``vt_total_engines``, ``vt_scan_date`` and ``vt_permalink``.
        - On HTTP 404: a dict with zeroed counters and
          ``vt_status == "not_found"``.
        - On HTTP 429: the string ``"RATE_LIMIT"``.
        - Otherwise ``None``: unsupported type, missing or non-string value,
          any other HTTP status, network failure, or an unexpected response
          body.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    vt_type = get_vt_type(ioc_type)
    if not vt_type:
        return None

    # Missing CSV cells arrive as NaN (a float); they cannot be looked up and
    # would otherwise crash on .encode() below.
    if not isinstance(ioc, str) or not ioc:
        return None

    if vt_type == 'urls':
        # VT identifies a URL by its URL-safe base64 encoding without padding.
        endpoint = base64.urlsafe_b64encode(ioc.encode()).decode().strip("=")
    else:
        # Escape characters such as '/' or '?' so a malformed value cannot
        # change the request path.
        endpoint = quote(ioc, safe="")

    url = f"https://www.virustotal.com/api/v3/{vt_type}/{endpoint}"
    headers = {"x-apikey": VT_API_KEY}

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    status = response.status_code
    if status == 429:
        return "RATE_LIMIT"
    if status == 404:
        return {"ioc_value": ioc, "vt_malicious": 0, "vt_total_engines": 0, "vt_status": "not_found"}
    if status != 200:
        return None

    try:
        # Parse the body once and reuse it for both attributes and links.
        payload = response.json()['data']
        attributes = payload['attributes']
        stats = attributes['last_analysis_stats']
        return {
            "ioc_value": ioc,  # Merge key
            "vt_malicious": stats['malicious'],
            "vt_total_engines": sum(stats.values()),
            "vt_scan_date": attributes.get('last_analysis_date', 0),
            "vt_permalink": payload['links']['self'],
        }
    except (ValueError, KeyError, TypeError, AttributeError):
        # Body was not JSON or lacked the expected fields: treat as a skip.
        return None

# ================= MAIN =================
# Fail fast: without an API key every request is rejected and the run would
# spend hours collecting nothing.
if not VT_API_KEY:
    raise SystemExit("VIRUS_API_KEY non impostata: impossibile interrogare VirusTotal.")

print("üìÇ Caricamento Dataset originale...")
df = pd.read_csv(INPUT_CSV)

# 1. Initial filter: keep only rows whose IOC type is in TARGET_TYPES
df_filtered = df[df['ioc_type'].isin(TARGET_TYPES)].copy()

# 2. Select the top 500 unique IOC values by cyber_score.
#    Rows with a missing ioc_value are dropped first: they cannot be scanned.
df_unique_targets = (
    df_filtered.dropna(subset=['ioc_value'])
    .sort_values(by='cyber_score', ascending=False)
    .drop_duplicates(subset=['ioc_value'])
    .head(500)
)
total_targets = len(df_unique_targets)

print(f"üéØ Target unici da scansionare: {total_targets}")

# Accumulates one result dict per successfully queried IOC
vt_results_list = []

# 3. Scan
count = 0
for count, (_, row) in enumerate(df_unique_targets.iterrows(), start=1):
    ioc = row['ioc_value']
    itype = row['ioc_type']

    # Show the real number of targets, which may be fewer than 500.
    print(f"[{count}/{total_targets}] Scanning: {ioc}...", end=" ", flush=True)

    result = check_virustotal(ioc, itype)

    # Rate-limit handling: wait, then retry the same IOC once
    if result == "RATE_LIMIT":
        print("‚è≥ RATE LIMIT. Pausa 65s...")
        time.sleep(65)
        result = check_virustotal(ioc, itype)

    if isinstance(result, dict):
        if result.get('vt_malicious', 0) > 0:
            print(f"üö® POSITIVO ({result['vt_malicious']})")
        else:
            print("‚úÖ CLEAN")
        vt_results_list.append(result)
    else:
        print("‚ö†Ô∏è Errore/Skip")

    # Free-tier pacing between requests; no need to wait after the last one.
    if count < total_targets:
        time.sleep(15)

# ================= MERGE AND SAVE =================
print("\nüîÑ Unione dei dati VirusTotal con il contesto originale...")

# Build a DataFrame holding only the VT results. When nothing was collected,
# give it the key columns explicitly so the merge below does not raise KeyError.
if vt_results_list:
    df_vt = pd.DataFrame(vt_results_list)
else:
    df_vt = pd.DataFrame(columns=['ioc_value', 'vt_malicious', 'vt_total_engines'])

# Inner merge: keep only the rows of the filtered dataset whose ioc_value was
# actually scanned (the selected targets plus their duplicate context rows).
df_final = pd.merge(df_filtered, df_vt, on='ioc_value', how='inner')

# Save the enriched CSV
df_final.to_csv(OUTPUT_CSV, index=False)

print(f"üéâ Fatto! Dataset salvato in: {OUTPUT_CSV}")
print(f"   Righe originali (scansionate + contesto): {len(df_final)}")
print("   Ora puoi usare questo file per i tuoi grafici!")

üìÇ Caricamento Dataset originale...
üéØ Target unici da scansionare: 500
[1/500] Scanning: https://doxbin.com/upload/DOXXING20GUIDE... üö® POSITIVO (5)
[2/500] Scanning: 92.28.211.234... ‚úÖ CLEAN
[3/500] Scanning: host-66.120.12.111.ucom.com... ‚úÖ CLEAN
[4/500] Scanning: sof02s32-in-f14.1e100.net... ‚úÖ CLEAN
[5/500] Scanning: host-132.12.32.167.ucom.com... ‚úÖ CLEAN
[6/500] Scanning: ss://YWVzLTI1Ni1nY206ZTRGQ1dyZ3BramkzUVk=:None@23.154.136.149:9102#%40daredevill_404... ‚úÖ CLEAN
[7/500] Scanning: www.hetzner.com... ‚úÖ CLEAN
[8/500] Scanning: https://github.com/xai-org/grok-1... ‚úÖ CLEAN
[9/500] Scanning: vless://telegram-id-ArV2ray:None@185.146.173.58:80?path=%2Ftelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%2Ctelegram-ArV2ray%3Fed%3D2056&security=none&encryption=none&host=arv2ray.5.arv2ray.ir.nedayevarzesh.ir.newsless