##### Importações

In [6]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from scipy.stats import gaussian_kde
from plotly.subplots import make_subplots

In [7]:
df = pd.read_csv("../data/transformed/transformed_cybersecurity_intrusion_data.csv")

In [8]:
df.head()

Unnamed: 0,session_id,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected,long_session,protocol_encrypt,large_packet,risk_score
0,SID_00001,599.0,TCP,4.0,492.983263,DES,0.606818,1.0,Edge,0.0,1.0,0,TCP_DES,1,0.724773
1,SID_00002,472.0,TCP,3.0,1557.996461,DES,0.301569,0.0,Firefox,0.0,0.0,1,TCP_DES,0,0.211098
2,SID_00003,629.0,TCP,3.0,75.044262,DES,0.739164,2.0,Chrome,0.0,1.0,0,TCP_DES,1,1.117415
3,SID_00005,453.0,TCP,5.0,532.540888,AES,0.054874,1.0,Firefox,0.0,0.0,0,TCP_AES,0,0.338412
4,SID_00006,453.0,UDP,5.0,380.47155,AES,0.422486,2.0,Chrome,1.0,0.0,0,UDP_AES,0,0.89574


##### Análise Univariada

In [9]:
pio.templates.default = "plotly_dark"

In [10]:
categorical_cols = ["protocol_type", "encryption_used", "browser_type", "unusual_time_access", "attack_detected", "long_session", "protocol_encrypt", "large_packet", "failed_logins",]

numerical_cols = ["network_packet_size", "login_attempts", "session_duration", "ip_reputation_score", "risk_score"]

In [None]:
def plot_multiple_categorical(df, cols):
    n = len(cols)
    cols_per_row = 3
    rows = math.ceil(n / cols_per_row)

    fig = make_subplots(
        rows=rows, 
        cols=cols_per_row,
        subplot_titles=[col.replace("_", " ").title() for col in cols],
        horizontal_spacing=0.08,
        vertical_spacing=0.12
    )

    palette = px.colors.qualitative.Bold  

    for i, col in enumerate(cols):
        row = i // cols_per_row + 1
        col_pos = i % cols_per_row + 1

        counts = df[col].value_counts().reset_index()
        counts.columns = [col, "count"]
        counts["percent"] = counts["count"] / counts["count"].sum() * 100

        fig.add_trace(
            go.Bar(
                x=counts[col],
                y=counts["count"],
                text=[f"{c} ({p:.1f}%)" for c, p in zip(counts["count"], counts["percent"])],
                textposition="auto",
                marker_color=palette[i % len(palette)],
                name=col.replace("_", " ").title()
            ),
            row=row, col=col_pos
        )

    fig.update_layout(
        height=400*rows, 
        width=2100,
        title_text="Distribuição de Variáveis Categóricas",
        title_x=0.5,
        title_font=dict(size=24),
        legend_title="Variável",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5
        ),
        margin=dict(l=50, r=50, t=80, b=50)
    )

    fig.show()

In [59]:
def plot_multiple_numerical(df, cols):
    n = len(cols)
    cols_per_row = 3
    rows = math.ceil(n / cols_per_row)

    fig = make_subplots(
        rows=rows, 
        cols=cols_per_row,
        subplot_titles=[
            f"{col.replace('_', ' ').title()}<br>Média: {df[col].mean():.2f} | Mediana: {df[col].median():.2f}"
            for col in cols
        ],
        horizontal_spacing=0.08,
        vertical_spacing=0.25
    )

    palette = px.colors.qualitative.Dark24  

    for i, col in enumerate(cols):
        row = i // cols_per_row + 1
        col_pos = i % cols_per_row + 1

        fig.add_trace(
            go.Histogram(
                x=df[col],
                nbinsx=100,
                marker_color=palette[i % len(palette)],
                name=col.replace("_", " ").title(),
                opacity=0.75
            ),
            row=row, col=col_pos
        )

        try:
            kde = gaussian_kde(df[col].dropna())
            x_range = np.linspace(df[col].min(), df[col].max(), 200)
            fig.add_trace(
                go.Scatter(
                    x=x_range,
                    y=kde(x_range) * len(df[col]) * (df[col].max()-df[col].min()) / 100,  
                    mode="lines",
                    line=dict(color="white", width=2),
                    name=f"Densidade {col.replace('_', ' ').title()}",
                    showlegend=False
                ),
                row=row, col=col_pos
            )
        except Exception as e:
            pass  
        
    fig.update_layout(
        height=450*rows,
        width=2100,
        title_text="Distribuição de Variáveis Numéricas",
        title_x=0.5,
        title_font=dict(size=24),
        legend_title="Variável",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5
        ),
        margin=dict(l=50, r=50, t=80, b=50)
    )

    fig.show()

In [56]:
plot_multiple_categorical(df, categorical_cols)

In [60]:
plot_multiple_numerical(df, numerical_cols)

In [45]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Network Packet Size", "Login Attempts", "Session Duration")
)

colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]

fig.add_trace(go.Box(y=df["network_packet_size"], x=df["large_packet"],
                     name="Network Packet Size", boxmean='sd', marker_color=colors[0],
                     hoverinfo='x+y+name'), row=1, col=1)

fig.add_trace(go.Box(y=df["login_attempts"], x=df["attack_detected"],
                     name="Login Attempts", boxmean='sd', marker_color=colors[1],
                     hoverinfo='x+y+name'), row=1, col=2)

fig.add_trace(go.Box(y=df["session_duration"], x=df["long_session"],
                     name="Session Duration", boxmean='sd', marker_color=colors[2],
                     hoverinfo='x+y+name'), row=1, col=3)

fig.update_layout(
    template="plotly_dark", 
    height=600, width=1800,
    showlegend=False,
    title_text="Distribuições das Métricas: Tamanho de Pacote da Rede, Tentivas de Login e Duração de Sessão de Acordo com a Detecção ou Ausência de Ataque",
    title_x=0.5,
    font=dict(family="Arial", size=12, color="white"),
    margin=dict(l=40, r=40, t=80, b=40)
)

for i in range(1, 4):
    fig.update_xaxes(title_text="", row=1, col=i, showgrid=True, gridcolor="gray")
    fig.update_yaxes(title_text="Valor", row=1, col=i, showgrid=True, gridcolor="gray")

fig.show()

In [71]:
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("IP Reputation vs Risk", "Failed Logins vs Risk")
)

fig.add_trace(
    go.Scatter(
        x=df["ip_reputation_score"], 
        y=df["risk_score"],
        mode='markers',
        name="IP Reputation",
        marker=dict(color="#1f77b4", size=8, opacity=0.7, line=dict(width=0.5, color="white")),
        hovertemplate="IP Reputation: %{x}<br>Risk Score: %{y}<extra></extra>"
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(
        x=df["failed_logins"], 
        y=df["risk_score"],
        mode='markers',
        name="Failed Logins",
        marker=dict(color="#ff7f0e", size=8, opacity=0.7, line=dict(width=0.5, color="white")),
        hovertemplate="Failed Logins: %{x}<br>Risk Score: %{y}<extra></extra>"
    ),
    row=1, col=2
)

fig.update_layout(
    template="plotly_dark",
    height=600, width=1800,
    showlegend=False,
    title_text="Reputação do IP e Tentativas Falhas de Login vs Score de Risco",
    title_x=0.5,
    font=dict(family="Arial", size=12, color="white"),
    margin=dict(l=40, r=40, t=80, b=40)
)

for i in range(1, 3):
    fig.update_xaxes(title_text="Valor da Métrica", row=1, col=i, showgrid=True, gridcolor="gray")
    fig.update_yaxes(title_text="Score de Risco", row=1, col=i, showgrid=True, gridcolor="gray")

fig.show()

In [72]:
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Failed Logins vs Attack", "Risk Score vs Attack")
)

colors = ["#ff7f0e", "#1f77b4"]

fig.add_trace(
    go.Violin(
        y=df["failed_logins"], x=df["attack_detected"],
        box_visible=True, meanline_visible=True,
        name="Failed Logins",
        line_color=colors[0],
        fillcolor=colors[0],
        opacity=0.7,
        hovertemplate="Attack Detected: %{x}<br>Failed Logins: %{y}<extra></extra>"
    ),
    row=1, col=1
)

fig.add_trace(
    go.Violin(
        y=df["risk_score"], x=df["attack_detected"],
        box_visible=True, meanline_visible=True,
        name="Risk Score",
        line_color=colors[1],
        fillcolor=colors[1],
        opacity=0.7,
        hovertemplate="Attack Detected: %{x}<br>Risk Score: %{y}<extra></extra>"
    ),
    row=1, col=2
)

fig.update_layout(
    template="plotly_dark",
    height=600, width=1800,
    showlegend=False,
    title_text="Distribuição de Tentativas Falhas de Login e Score de Risco de Acordo com a Detecção ou não de Ataque",
    title_x=0.5,
    font=dict(family="Arial", size=12, color="white"),
    margin=dict(l=40, r=40, t=80, b=40)
)

for i in range(1, 3):
    fig.update_xaxes(title_text="Attack Detected", row=1, col=i, showgrid=True, gridcolor="gray")
    fig.update_yaxes(title_text="Valor", row=1, col=i, showgrid=True, gridcolor="gray")

fig.show()