In [30]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tf_keras.callbacks import EarlyStopping
from tf_keras.optimizers import Adam
import ast
import keras as keras

# Descargar recursos de NLTK si no los tienes
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:


def categorizar_articulo_subtemas_en(texto, palabras_clave, vectorizer, umbrales, top_n=3):
    """Categoriza un artículo en subtemas, con umbrales personalizados por subtema."""
    try:
        texto = texto.lower()
        texto = re.sub(r'[^\w\s]', '', texto)  # Elimina puntuación
        stop_words = set(stopwords.words('english'))  # Stopwords en inglés
        palabras = [palabra for palabra in texto.split() if palabra not in stop_words]

        # Lematización (opcional)
        lemmatizer = WordNetLemmatizer()
        palabras = [lemmatizer.lemmatize(palabra) for palabra in palabras]


        texto_limpio = " ".join(palabras)  # Une las palabras limpias en un texto
        vector_texto = vectorizer.transform([texto_limpio])  # Convierte el texto en un vector


        similitudes = {}
        for subtema, palabras_clave in palabras_clave.items():
            vector_palabras_clave = vectorizer.transform([" ".join(palabras_clave)])
            similitudes[subtema] = cosine_similarity(vector_texto, vector_palabras_clave)[0][0]

        # Ordena las categorías por similitud de mayor a menor
        categorias_ordenadas = sorted(similitudes.items(), key=lambda x: x[1], reverse=True)
        print(categorias_ordenadas)




        # Selecciona las N categorías principales que superan el umbral
        top_categorias = []
        for categoria, similitud in categorias_ordenadas:
            umbral_categoria = umbrales.get(categoria, 0.1)  # Busca el umbral específico o usa 0.1 por defecto
            if similitud >= umbral_categoria:
                top_categorias.append(categoria)

        if not top_categorias:
            top_categorias.append("No specific subtopic")

        return top_categorias[:top_n]  # Retorna las N categorías principales

    except Exception as e:
        print(f"Error al procesar el texto: {e}")
        return ["Error"]  # Devuelve "Error" en caso de excepción


# Carga del CSV (¡ADAPTA LA RUTA!)
try:
    df = pd.read_csv("clean_data.csv", encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv("other_data.csv", encoding='latin-1') # o 'ISO-8859-1'

# Definir palabras clave para cada subtema (¡ADAPTA ESTO!)
subtemas_palabras_clave = {
    "Malware": [
        "virus", "worm", "trojan", "rootkit", "spyware", "adware", "keylogger", "malicious code",
        "payload", "botnet", "exploit kit", "fileless malware", "polymorphic malware", "ransomware"
    ],
    "Ransomware": [
        "ransomware", "kidnapping", "ransom", "cryptolocker", "decrypt", "encryption",
        "locker ransomware", "crypto-ransomware", "double extortion", "data hostage", "ransom note", "payment demand"
    ],
    "Phishing": [
        "phishing", "spoofing", "scam", "email", "malicious link", "credential theft", "fraudulent",
        "account", "password", "login", "verify", "suspicious", "security", "alert", "warning",
        "sensitive", "update", "confirm", "unexpected", "urgent", "unauthorized", "suspicious activity",
        "click here", "safeguard", "risk", "free", "limited time", "offer", "act now", "bank account",
        "transfer", "secure your account", "immediately", "suspended", "blocked", "unusual activity",
        "click this link", "incorrect", "password reset", "customer support", "technical support",
        "update your info", "danger", "incomplete", "unclaimed", "recover", "payment issue", "unpaid",
        "compromise", "authenticate", "validation", "claim now", "prize", "recovery", "security breach",
        "inbox", "suspicious email", "confirm your identity", "personal information", "breach", "click to confirm"
    ],
    "Vulnerabilities": [
        "vulnerability", "exploit", "patch", "zero-day", "CVE", "bug", "flaw", "weakness", "attack",
        "breach", "compromise", "buffer overflow", "SQL injection", "cross-site scripting", "XSS",
        "remote code execution", "RCE", "privilege escalation", "denial of service", "DoS",
        "distributed denial of service", "DDoS", "man-in-the-middle", "MITM", "malware", "trojan",
        "ransomware", "rootkit", "backdoor", "access control", "credential stuffing", "password cracking",
        "phishing attack", "social engineering", "brute force", "exploit code", "shellshock", "heartbleed",
        "log4j", "security hole", "patch management", "unauthorized access", "security flaw", "code injection",
        "memory corruption", "session hijacking", "insider threat", "CVSS", "security misconfiguration",
        "unpatched", "unsecured", "insecure", "vulnerable", "hacker", "malicious", "exploit kit", "spoofing",
        "root access", "backdoor access", "cyberattack", "XSRF", "CSRF", "unauthorized privilege",
        "unauthenticated", "exposed port", "leak", "unverified", "patch bypass", "security loophole", "signature bypass"
    ],
    "Attacks": [
        "attack", "hacker", "denial of service", "DDoS", "intrusion", "breach", "cyberattack", "cybercrime",
        "brute force", "social engineering", "credential stuffing", "man-in-the-middle", "MITM",
        "zero-day exploit", "advanced persistent threat", "APT", "insider threat", "session hijacking"
    ],
    "Privacy": [
        "privacy", "personal data", "GDPR", "data protection", "consent", "tracking", "surveillance",
        "data breach", "encryption", "anonymity", "data minimization", "right to be forgotten",
        "cookie consent", "data sovereignty", "data leakage", "identity theft"
    ],
    "tips": [
        "tip", "recommendation", "guide", "tutorial", "best practices", "security awareness",
        "password hygiene", "multi-factor authentication", "MFA", "backup strategy", "incident response",
        "security training", "phishing prevention", "secure browsing", "firewall configuration", "regular updates"
    ],
    "Software": [
        "software", "program", "application", "operating system", "code", "script", "firmware",
        "open source", "proprietary software", "software development", "SDLC", "version control",
        "debugging", "API security", "containerization", "virtualization", "cloud software", "end-of-life software"
    ]
}


umbrales_personalizados = {
    "Malware": 0.01,
    "Ransomware": 0.01,
    "Phishing": 0.001,
    "Vulnerabilities": 0.03,
    "Attacks": 0.04,
    "Privacy": 0.01,
    "tips": 0.01,
    "Software": 0.03
}



# Preprocesamiento y Vectorización con TF-IDF
vectorizer = TfidfVectorizer()
corpus = df["text"].tolist()  # Lista de todos los textos
vectorizer.fit(corpus)  # Ajusta el vectorizador al corpus

# Aplica la función de categorización a cada artículo
df["label"] = df["text"].apply(lambda x: categorizar_articulo_subtemas_en(x, subtemas_palabras_clave, vectorizer, umbrales_personalizados, top_n=3))

# Guarda el DataFrame con las categorías (¡ADAPTA LA RUTA!)
df.to_csv("labelled_data.csv", index=False, encoding='utf-8')

print("Categorización completada. Archivo guardado como noticias_tecnologia_subtemas_en.csv")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[('Ransomware', 0.16626516205117295), ('Malware', 0.05636130854978169), ('Vulnerabilities', 0.0245362639554904), ('Phishing', 0.016291708209638166), ('Software', 0.012195379802705734), ('tips', 0.006687641891640585), ('Attacks', 0.0014144770631279856), ('Privacy', 0.0)]
[('Software', 0.030840507841103138), ('Phishing', 0.028814324913602047), ('Privacy', 0.021623404458070702), ('Vulnerabilities', 0.016636494937499472), ('Malware', 0.00889202957650127), ('Attacks', 0.007229886492352144), ('Ransomware', 0.00698350211400657), ('tips', 0.00381852717852427)]
[('Malware', 0.05062179144584677), ('Software', 0.04049448870527976), ('Vulnerabilities', 0.028253545709454), ('Attacks', 0.014192673563173214), ('tips', 0.012792047178693686), ('Phishing', 0.01265289799320389), ('Privacy', 0.004463787404513835), ('Ransomware', 0.0)]
[('Ransomware', 0.2606512235436881), ('Malware', 0.08581143800523654), ('Vulnerabilities', 0.044232378491987

In [29]:


keras.backend.clear_session()


#df = pd.read_csv('clean_data.csv')
df = pd.read_csv('/content/labelled_data.csv')
# Convertir etiquetas a listas
#df['label'] = df['text'].apply(categorize)

df["label"] = df["label"].apply(ast.literal_eval)



# Binarizar las etiquetas
mlb = MultiLabelBinarizer()
etiquetas_binarias = mlb.fit_transform(df['label'])
etiquetas_binarias_df = pd.DataFrame(etiquetas_binarias, columns=mlb.classes_)

etiquetas_binarias_df.to_csv('binarias.csv', index=False)
posi = etiquetas_binarias_df.sum()
print(posi)

# Dividir los datos en características (X) y etiquetas (y)
X = df['text']
y = etiquetas_binarias_df

# Dividir los datos en entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenizar el texto
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

train_encodings = tokenize_function(x_train)
val_encodings = tokenize_function(x_test)

# Cargar el modelo BERT preentrenado
#config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=y.shape[1], dropout=0.1)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y.shape[1])
# Crear un Dataset de TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).shuffle(1000).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_test)).batch(32)

def f1_score(y_true, y_pred):
    y_pred = tf.round(y_pred)
    true_positives = tf.reduce_sum(tf.cast(y_true * y_pred, tf.float32))
    predicted_positives = tf.reduce_sum(tf.cast(y_pred, tf.float32))
    possible_positives = tf.reduce_sum(tf.cast(y_true, tf.float32))

    precision = true_positives / (predicted_positives + tf.keras.backend.epsilon())
    recall = true_positives / (possible_positives + tf.keras.backend.epsilon())
    f1 = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())
    return f1

def recall(y_true, y_pred):
    y_pred = tf.round(y_pred)
    true_positives = tf.reduce_sum(tf.cast(y_true * y_pred, tf.float32))
    possible_positives = tf.reduce_sum(tf.cast(y_true, tf.float32))
    recall = true_positives / (possible_positives + tf.keras.backend.epsilon())
    return recall

# Compilar el modelo con binary_crossentropy
optimizer = Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy',f1_score,recall])

es=EarlyStopping(monitor='val_loss',patience=5, restore_best_weights=True)



Attacks                  921
Malware                 2165
No specific subtopic     145
Phishing                4220
Privacy                 1335
Ransomware               632
Software                1337
Vulnerabilities         3501
tips                     836
dtype: int64


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
model.fit(train_dataset, validation_data=val_dataset, validation_split = 0.2, epochs=7, callbacks=[es])


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tf_keras.src.callbacks.History at 0x7afed46bc4d0>

In [32]:
model.save('trained_model', save_format='tf')

In [6]:
dfdf = pd.read_csv('labelled_data.csv')
print(dfdf)

                                                   text  \
0     Cloud infrastructure security company Wiz on T...   
1     ShinyHunters a notorious cybercriminal undergr...   
2     Even as a massive data breach affecting Air In...   
3     Bolstering password policies in your organizat...   
4     Protection against insider risks works when th...   
...                                                 ...   
5881  The US Intelligence Agency NSA has been report...   
5882   CNNFootball has never been just a hobby for J...   
5883  Story highlightsCNN Heroes 10 Years On is cele...   
5884  The secure messaging app used by staffers in t...   
5885  Domain name registrar and website hosting prov...   

                                           label  
0     ['Phishing', 'Vulnerabilities', 'Privacy']  
1             ['Attacks', 'Privacy', 'Phishing']  
2      ['Attacks', 'Privacy', 'Vulnerabilities']  
3        ['tips', 'Phishing', 'Vulnerabilities']  
4            ['Privacy', 'Software',