#  KONBRIEFING


## Importación de librerías

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import country_converter as coco
import matplotlib.pyplot as plt
import re
import hashlib
from urllib.parse import urlparse
import os
import json

## Visualización de los datos

In [None]:

# Intentar cargar el archivo CSV
file_path = 'KONBRIEFING.csv'

if os.path.exists(file_path):
    try:
        df = pd.read_csv(file_path)
        print("Archivo cargado exitosamente:")
        df.head() # Mostrar las primeras filas
    except Exception as e:
        print(f"Error al cargar el archivo: {e}")
else:
    print(f"Error: El archivo '{file_path}' no existe en la ubicación actual: {os.getcwd()}")



Error: El archivo 'KONBRIEFING.csv' no existe en la ubicación actual: c:\Users\lucia\OneDrive\Documentos\DeNexus_CSV-Analysis


## Limpieza de los datos

In [None]:
# Cargar el archivo CSV
data = pd.read_csv(file_path)

# Eliminar valores duplicados
data_cleaned = data.drop_duplicates()

# Eliminar filas con valores nulos
data_cleaned = data_cleaned.dropna()

# Guardar el archivo limpio
cleaned_file_path = 'KONBRIEFING_cleaned.csv'
data_cleaned.to_csv(cleaned_file_path, index=False)

# Mostrar información después de la limpieza
print(data_cleaned.info())

# Mostrar las primeras filas del archivo limpio
data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Index: 66894 entries, 0 to 67357
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           66894 non-null  object
 1   title          66894 non-null  object
 2   description    66894 non-null  object
 3   references     66894 non-null  object
 4   date_uploaded  66894 non-null  object
dtypes: object(5)
memory usage: 3.1+ MB
None
             date                                              title  \
0      April 2024  Facebook page of the local branch of a politic...   
1  April 25, 2024  Cyber attack on a construction company in Germany   
2  April 25, 2024        Cyber attack on a city government in France   
3  April 25, 2024  Cyber attack on a traffic management system in...   
4  April 24, 2024  Mail account of a city government in Belgium h...   

                                         description  \
0   SPÖ Müllendorf - Müllendorf, Burgenland, Austria   
1  Max

## Tratamiento de las columnas


### Date

In [3]:
# Obtener los nombres de las columnas
column_names = data.columns.tolist()

# Mostrar los nombres de las columnas
print("Nombres de las columnas:",column_names)

Nombres de las columnas: ['date', 'title', 'description', 'references', 'date_uploaded']


In [None]:
# Verificar si la columna 'date' existe
if 'date' in data_cleaned.columns:
    print("La columna 'date' existe.")
    
    # Convertir la columna 'date' a formato de fecha, manejando posibles errores
    data_cleaned['date'] = pd.to_datetime(data_cleaned['date'], errors='coerce')
    
    # Crear columnas para Año, Mes y Día
    data_cleaned['year'] = data_cleaned['date'].dt.year
    data_cleaned['month'] = data_cleaned['date'].dt.month
    data_cleaned['day'] = data_cleaned['date'].dt.day
    
    # Eliminar la columna original 'date'
    data_cleaned = data_cleaned.drop(columns=['date'])
    
    # Mostrar una vista previa del DataFrame modificado
    data_cleaned.head()
else:
    print("La columna 'date' no existe en el DataFrame.")

La columna 'date' existe.
                                               title  \
0  Facebook page of the local branch of a politic...   
1  Cyber attack on a construction company in Germany   
2        Cyber attack on a city government in France   
3  Cyber attack on a traffic management system in...   
4  Mail account of a city government in Belgium h...   

                                         description  \
0   SPÖ Müllendorf - Müllendorf, Burgenland, Austria   
1  Max Wild GmbH - Berkheim, Baden-Württemberg, G...   
2  Ville de Gravelines - Gravelines, Hauts-de-Fra...   
3  KC Scout - Kansas City, Missouri, USA (Jackson...   
4                    Deinze, Flemish Region, Belgium   

                                          references date_uploaded    year  \
0  [{"title":"Hackerangriff auf Facebookseite der...    2024-04-29  2024.0   
1  [{"title":"Cyberangriff auf Max Wild GmbH","ur...    2024-04-29  2024.0   
2  [{"title":"Notre ville est actuellement victim...    2024-04-29

  data_cleaned['date'] = pd.to_datetime(data_cleaned['date'], errors='coerce')


In [None]:
data_cleaned.head()

                                               title  \
0  Facebook page of the local branch of a politic...   
1  Cyber attack on a construction company in Germany   
2        Cyber attack on a city government in France   
3  Cyber attack on a traffic management system in...   
4  Mail account of a city government in Belgium h...   

                                         description  \
0   SPÖ Müllendorf - Müllendorf, Burgenland, Austria   
1  Max Wild GmbH - Berkheim, Baden-Württemberg, G...   
2  Ville de Gravelines - Gravelines, Hauts-de-Fra...   
3  KC Scout - Kansas City, Missouri, USA (Jackson...   
4                    Deinze, Flemish Region, Belgium   

                                          references date_uploaded    year  \
0  [{"title":"Hackerangriff auf Facebookseite der...    2024-04-29  2024.0   
1  [{"title":"Cyberangriff auf Max Wild GmbH","ur...    2024-04-29  2024.0   
2  [{"title":"Notre ville est actuellement victim...    2024-04-29  2024.0   
3  [{"title":"

In [6]:
# Eliminar cualquier columna que represente la fecha completa si existe
if 'full_date' in data_cleaned.columns:
    data_cleaned = data_cleaned.drop(columns=['full_date'])

# Mostrar una vista previa del DataFrame después de eliminar la columna
data_cleaned.head()

Unnamed: 0,title,description,references,date_uploaded,year,month,day
0,Facebook page of the local branch of a politic...,"SPÖ Müllendorf - Müllendorf, Burgenland, Austria","[{""title"":""Hackerangriff auf Facebookseite der...",2024-04-29,2024.0,4.0,1.0
1,Cyber attack on a construction company in Germany,"Max Wild GmbH - Berkheim, Baden-Württemberg, G...","[{""title"":""Cyberangriff auf Max Wild GmbH"",""ur...",2024-04-29,2024.0,4.0,25.0
2,Cyber attack on a city government in France,"Ville de Gravelines - Gravelines, Hauts-de-Fra...","[{""title"":""Notre ville est actuellement victim...",2024-04-29,2024.0,4.0,25.0
3,Cyber attack on a traffic management system in...,"KC Scout - Kansas City, Missouri, USA (Jackson...","[{""title"":""KC Scout Alert"",""url"":""https://www....",2024-04-29,2024.0,4.0,25.0
4,Mail account of a city government in Belgium h...,"Deinze, Flemish Region, Belgium","[{""title"":""Stadsdiensten van Deinze geplaagd d...",2024-04-29,2024.0,4.0,24.0


In [7]:
# Rellenar valores nulos con 0 y convertir a enteros
data_cleaned['year'] = data_cleaned['year'].fillna(0).astype('int')
data_cleaned['month'] = data_cleaned['month'].fillna(0).astype('int')
data_cleaned['day'] = data_cleaned['day'].fillna(0).astype('int')

# Mostrar una vista previa del DataFrame modificado
data_cleaned.head()

Unnamed: 0,title,description,references,date_uploaded,year,month,day
0,Facebook page of the local branch of a politic...,"SPÖ Müllendorf - Müllendorf, Burgenland, Austria","[{""title"":""Hackerangriff auf Facebookseite der...",2024-04-29,2024,4,1
1,Cyber attack on a construction company in Germany,"Max Wild GmbH - Berkheim, Baden-Württemberg, G...","[{""title"":""Cyberangriff auf Max Wild GmbH"",""ur...",2024-04-29,2024,4,25
2,Cyber attack on a city government in France,"Ville de Gravelines - Gravelines, Hauts-de-Fra...","[{""title"":""Notre ville est actuellement victim...",2024-04-29,2024,4,25
3,Cyber attack on a traffic management system in...,"KC Scout - Kansas City, Missouri, USA (Jackson...","[{""title"":""KC Scout Alert"",""url"":""https://www....",2024-04-29,2024,4,25
4,Mail account of a city government in Belgium h...,"Deinze, Flemish Region, Belgium","[{""title"":""Stadsdiensten van Deinze geplaagd d...",2024-04-29,2024,4,24


### Title

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Crear el vectorizador TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Limitar a las 100 características más relevantes

# Ajustar y transformar la columna 'title'
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned['title'].fillna(''))

# Convertir el resultado en un DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combinar las nuevas características con el DataFrame original
data_cleaned = pd.concat([data_cleaned.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Eliminar la columna original 'title'
data_cleaned = data_cleaned.drop(columns=['title'])

# Mostrar una vista previa del DataFrame modificado
data_cleaned.head()

Unnamed: 0,description,references,date_uploaded,year,month,day,000,access,account,administration,...,the,to,uk,unauthorized,united,university,us,usa,website,websites
0,"SPÖ Müllendorf - Müllendorf, Burgenland, Austria","[{""title"":""Hackerangriff auf Facebookseite der...",2024-04-29,2024,4,1,0.0,0.0,0.0,0.0,...,0.26839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Max Wild GmbH - Berkheim, Baden-Württemberg, G...","[{""title"":""Cyberangriff auf Max Wild GmbH"",""ur...",2024-04-29,2024,4,25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Ville de Gravelines - Gravelines, Hauts-de-Fra...","[{""title"":""Notre ville est actuellement victim...",2024-04-29,2024,4,25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"KC Scout - Kansas City, Missouri, USA (Jackson...","[{""title"":""KC Scout Alert"",""url"":""https://www....",2024-04-29,2024,4,25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401641,0.0,0.0
4,"Deinze, Flemish Region, Belgium","[{""title"":""Stadsdiensten van Deinze geplaagd d...",2024-04-29,2024,4,24,0.0,0.0,0.502403,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Description

In [None]:
from sklearn.preprocessing import OneHotEncoder
import re

# Extraer región de 'description' y crear la columna 'Region'
def extract_region(description):
    regions = ["Africa", "Asia", "Europe", "North America", "South America", "Australia", "Antarctica"]
    for region in regions:
        if re.search(region, description, re.IGNORECASE):
            return region
    return "Unknown"

data_cleaned['Region'] = data_cleaned['description'].apply(extract_region)

# Agrupar regiones por continentes
continent_map = {
    "Africa": "Africa",
    "Asia": "Asia",
    "Europe": "Europe",
    "North America": "North America",
    "South America": "South America",
    "Australia": "Australia",
    "Antarctica": "Antarctica",
    "Unknown": "Unknown"
}

data_cleaned['Continent'] = data_cleaned['Region'].map(continent_map)

# Aplicar One-Hot Encoding a 'Continent'
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # Argumento actualizado
continent_encoded = one_hot_encoder.fit_transform(data_cleaned[['Continent']])

# Agregar las columnas codificadas al DataFrame
continent_columns = one_hot_encoder.get_feature_names_out(['Continent'])
continent_df = pd.DataFrame(continent_encoded, columns=continent_columns)

data_cleaned = pd.concat([data_cleaned.reset_index(drop=True), continent_df.reset_index(drop=True)], axis=1)

# Eliminar columnas redundantes
data_cleaned = data_cleaned.drop(columns=['Region', 'Continent'])

# Convertir 'description' en características numéricas usando TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Limitar a 100 características
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned['description'].fillna(''))
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

data_cleaned = pd.concat([data_cleaned.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Eliminar la columna original 'description'
data_cleaned = data_cleaned.drop(columns=['description'])

data_cleaned.head()

                                          references date_uploaded  year  \
0  [{"title":"Hackerangriff auf Facebookseite der...    2024-04-29  2024   
1  [{"title":"Cyberangriff auf Max Wild GmbH","ur...    2024-04-29  2024   
2  [{"title":"Notre ville est actuellement victim...    2024-04-29  2024   
3  [{"title":"KC Scout Alert","url":"https://www....    2024-04-29  2024   
4  [{"title":"Stadsdiensten van Deinze geplaagd d...    2024-04-29  2024   

   month  day  000  access   account  administration  affected  ...       usa  \
0      4    1  0.0     0.0  0.000000             0.0       0.0  ...  0.000000   
1      4   25  0.0     0.0  0.000000             0.0       0.0  ...  0.000000   
2      4   25  0.0     0.0  0.000000             0.0       0.0  ...  0.000000   
3      4   25  0.0     0.0  0.000000             0.0       0.0  ...  0.313808   
4      4   24  0.0     0.0  0.502403             0.0       0.0  ...  0.000000   

   victoria  virginia  wales  washington  westphalia  wü

### References


In [None]:
def extract_title_url(reference_column):
    titles, urls = [], []

    for ref in reference_column:
        if not ref:
            titles.append("")
            urls.append("")
            continue

        try:

            ref_json = json.loads(ref)


            if isinstance(ref_json, list) and len(ref_json) > 0:
                first_ref = ref_json[0]


                title = first_ref.get("title", "")
                url = first_ref.get("url", "")


                site_name = urlparse(url).netloc.replace('www.', '') if url else ""

                titles.append(title)
                urls.append(site_name)
            else:
                titles.append("")
                urls.append("")
        except (json.JSONDecodeError, IndexError, TypeError) as e:

            titles.append("")
            urls.append("")

    return titles, urls

data_cleaned['title_extracted'], data_cleaned['site_name_extracted'] = extract_title_url(data_cleaned['references'])

data_cleaned.head(10)


                                                   title  \
0      Hackerangriff auf Facebookseite der SPÖ Müllen...   
1                         Cyberangriff auf Max Wild GmbH   
2      Notre ville est actuellement victime d’une cyb...   
3                                         KC Scout Alert   
4      Stadsdiensten van Deinze geplaagd door cyberaa...   
...                                                  ...   
66889  Curtea de Conturi anunță că pagina web a insti...   
66890  Cyber Attack Hits Agency That Oversees Illinoi...   
66891        L’Inserm victime d’un piratage informatique   
66892  Russian defence ministry says its website hit ...   
66893                                      NCT IT Issues   

                                                     url  
0      https://www.meinbezirk.at/eisenstadt/c-lokales...  
1      https://www.maxwild.com/unternehmen/news/cyber...  
2      https://www.facebook.com/villedegravelines/pos...  
3      https://www.facebook.com/MoDOTStatew

In [None]:
data_cleaned.drop(columns=['references'], inplace=True)

In [None]:
# Columnas tratadas
data_cleaned.head()

   year  month  day  000  access   account  administration  affected  agency  \
0  2024      4    1  0.0     0.0  0.000000             0.0       0.0     0.0   
1  2024      4   25  0.0     0.0  0.000000             0.0       0.0     0.0   
2  2024      4   25  0.0     0.0  0.000000             0.0       0.0     0.0   
3  2024      4   25  0.0     0.0  0.000000             0.0       0.0     0.0   
4  2024      4   24  0.0     0.0  0.502403             0.0       0.0     0.0   

    an  ...  virginia  wales  washington  westphalia  württemberg  york  \
0  0.0  ...       0.0    0.0         0.0         0.0     0.000000   0.0   
1  0.0  ...       0.0    0.0         0.0         0.0     0.474777   0.0   
2  0.0  ...       0.0    0.0         0.0         0.0     0.000000   0.0   
3  0.0  ...       0.0    0.0         0.0         0.0     0.000000   0.0   
4  0.0  ...       0.0    0.0         0.0         0.0     0.000000   0.0   

   zealand  île                                              title  

## Variables categóricas