##### Importações

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [3]:
# Load the transformed intrusion dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    "../data/transformed/transformed_cybersecurity_intrusion_data.csv"
)

In [4]:
# Preview the first five rows (the default for ``head``) to check that the
# file parsed into the expected columns.
df.head(n=5)

Unnamed: 0,session_id,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected,long_session,protocol_encrypt,large_packet,risk_score
0,SID_00001,599.0,TCP,4.0,492.983263,DES,0.606818,1.0,Edge,0.0,1.0,0,TCP_DES,1,0.724773
1,SID_00002,472.0,TCP,3.0,1557.996461,DES,0.301569,0.0,Firefox,0.0,0.0,1,TCP_DES,0,0.211098
2,SID_00003,629.0,TCP,3.0,75.044262,DES,0.739164,2.0,Chrome,0.0,1.0,0,TCP_DES,1,1.117415
3,SID_00005,453.0,TCP,5.0,532.540888,AES,0.054874,1.0,Firefox,0.0,0.0,0,TCP_AES,0,0.338412
4,SID_00006,453.0,UDP,5.0,380.47155,AES,0.422486,2.0,Chrome,1.0,0.0,0,UDP_AES,0,0.89574


##### Análise Estatística

In [5]:
# Keep only the numeric columns; the "number" selector is pandas' string
# alias for np.number.
df_numeric = df.select_dtypes(include="number")

In [28]:
# Report the dataset dimensions (rows, columns) as a quick sanity check.
print(f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}")

Number of rows: 7174, Number of columns: 15


In [24]:
def describe_plus(df, cmap="Blues", decimals=2):
    """Build a styled summary table for the numeric columns of ``df``.

    The table is ``DataFrame.describe()`` (transposed, one row per column)
    extended with extra dispersion and shape statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.
    cmap : str, default "Blues"
        Colormap name passed to ``Styler.background_gradient``.
    decimals : int, default 2
        Number of decimal places used when formatting every cell.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styled table with the ``describe()`` columns followed by
        ``median``, ``iqr``, ``cv``, ``skew``, ``kurtosis``, ``amplitude``
        and ``coef_var``.
    """
    numeric = df.select_dtypes(include=[np.number])

    lower_quartile = numeric.quantile(0.25)
    upper_quartile = numeric.quantile(0.75)

    def _variation_without_nan(column):
        # scipy's coefficient of variation, computed on the non-missing values.
        return stats.variation(column.dropna())

    extra_columns = {
        "median": numeric.median(),
        "iqr": upper_quartile - lower_quartile,
        # "cv" uses pandas' std/mean; "coef_var" uses scipy.stats.variation.
        "cv": numeric.std() / numeric.mean(),
        "skew": numeric.skew(),
        "kurtosis": numeric.kurtosis(),
        "amplitude": numeric.max() - numeric.min(),
        "coef_var": numeric.apply(_variation_without_nan),
    }

    base_table = numeric.describe().T
    summary = pd.concat([base_table, pd.DataFrame(extra_columns)], axis=1)

    number_format = "{:." + str(decimals) + "f}"
    styled = summary.style.background_gradient(cmap=cmap)
    return styled.format(number_format)

In [25]:
# Show the extended summary of the numeric columns, shaded with the
# "plasma" colormap.
describe_plus(df, cmap="plasma")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,median,iqr,cv,skew,kurtosis,amplitude,coef_var
network_packet_size,7174.0,499.98,198.35,64.0,364.0,500.0,633.0,1270.0,500.0,269.0,0.4,0.09,-0.18,1206.0,0.4
login_attempts,7174.0,4.03,1.97,1.0,3.0,4.0,5.0,13.0,4.0,2.0,0.49,0.59,0.35,12.0,0.49
session_duration,7174.0,788.45,789.41,0.5,230.58,546.85,1102.11,7190.39,546.85,871.53,1.0,2.1,6.92,7189.89,1.0
ip_reputation_score,7174.0,0.33,0.18,0.0,0.19,0.31,0.46,0.92,0.31,0.27,0.54,0.46,-0.41,0.92,0.54
failed_logins,7174.0,1.52,1.03,0.0,1.0,1.0,2.0,5.0,1.0,1.0,0.68,0.4,-0.18,5.0,0.68
unusual_time_access,7174.0,0.15,0.36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.36,1.93,1.73,1.0,2.36
attack_detected,7174.0,0.43,0.49,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.16,0.3,-1.91,1.0,1.16
long_session,7174.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0,0.5,1.0,1.0,0.0,-2.0,1.0,1.0
large_packet,7174.0,0.5,0.5,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.0,-2.0,1.0,1.0
risk_score,7174.0,0.69,0.34,0.01,0.44,0.68,0.91,1.99,0.68,0.47,0.49,0.35,-0.1,1.98,0.49


In [55]:
# Pairwise Pearson correlation between the numeric columns, displayed as a
# heat-mapped table with three decimals.
corr_matrix = df_numeric.corr(method="pearson")
(
    corr_matrix.style
    .background_gradient(cmap="coolwarm")
    .format("{:.3f}")
)

Unnamed: 0,network_packet_size,login_attempts,session_duration,ip_reputation_score,failed_logins,unusual_time_access,attack_detected,long_session,large_packet,risk_score
network_packet_size,1.0,-0.004,0.022,0.013,-0.002,-0.0,-0.005,0.008,0.801,0.003
login_attempts,-0.004,1.0,0.02,0.001,-0.021,0.013,0.264,0.021,0.002,-0.019
session_duration,0.022,0.02,1.0,0.003,0.033,0.009,0.055,0.689,0.021,0.031
ip_reputation_score,0.013,0.001,0.003,1.0,0.013,-0.013,0.225,-0.008,0.013,0.384
failed_logins,-0.002,-0.021,0.033,0.013,1.0,0.01,0.381,0.008,0.001,0.929
unusual_time_access,-0.0,0.013,0.009,-0.013,0.01,1.0,0.008,0.007,0.017,0.005
attack_detected,-0.005,0.264,0.055,0.225,0.381,0.008,1.0,0.014,-0.001,0.435
long_session,0.008,0.021,0.689,-0.008,0.008,0.007,0.014,1.0,0.006,0.004
large_packet,0.801,0.002,0.021,0.013,0.001,0.017,-0.001,0.006,1.0,0.005
risk_score,0.003,-0.019,0.031,0.384,0.929,0.005,0.435,0.004,0.005,1.0


In [53]:
# Load the processed dataset. NOTE(review): `df_correlation` is not used in
# the cells visible here — presumably consumed further down; confirm.
df_correlation = pd.read_csv(
    "../data/processed/processed_cybersecurity_intrusion_data.csv"
)

In [41]:
def _anderson_critical_value(result, alpha):
    """Return ``(critical_value, level_pct)`` from an Anderson-Darling result.

    Looks up the tabulated significance level equal to ``alpha * 100``; if
    that level is not tabulated, falls back to the 5% level. Returns
    ``(None, None)`` when neither level is present in the table.
    """
    levels = np.asarray(result.significance_level, dtype=float)
    critical_values = np.asarray(result.critical_values, dtype=float)
    for target in (alpha * 100, 5.0):
        matches = np.flatnonzero(np.isclose(levels, target))
        if matches.size:
            idx = matches[0]
            return critical_values[idx], levels[idx]
    return None, None


def normality_test_fluxogram(series, alpha=0.05):
    """Choose and run a normality test based on sample size and outliers.

    Decision flow:
      1. Drop missing values; with fewer than 3 observations no test is run.
      2. Flag outliers with Tukey's fences (1.5 * IQR beyond the quartiles).
      3. If more than 1% of the observations are outliers, run
         Anderson-Darling and compare the statistic with the tabulated
         critical value.
      4. Otherwise run Shapiro-Wilk for n <= 5000 and D'Agostino-Pearson
         for larger samples, comparing the p-value with ``alpha``.

    Parameters
    ----------
    series : pandas.Series or one-dimensional array-like
        Numeric sample; NaN values are ignored.
    alpha : float, default 0.05
        Significance level. In the Anderson-Darling branch it selects the
        tabulated level equal to ``alpha * 100`` and falls back to 5% when
        that level is not tabulated; the interpretation text states the
        level actually used.

    Returns
    -------
    dict
        ``'teste'``: name of the test (``None`` if the sample is too small);
        ``'estatistica'``: test statistic (``None`` if too small);
        ``'p_valor'``: p-value formatted with 4 decimals, ``'N/A'`` for
        Anderson-Darling, ``None`` if too small;
        ``'interpretacao'``: human-readable verdict.
    """
    # Accept plain lists / arrays as well as Series.
    x = pd.Series(series).dropna()
    n = len(x)
    if n < 3:
        return {'teste': None, 'estatistica': None, 'p_valor': None,
                'interpretacao': 'Amostra muito pequena para testes de normalidade.'}

    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    prop_outliers = ((x < lower_bound) | (x > upper_bound)).sum() / n

    muitos_outliers = prop_outliers > 0.01

    if muitos_outliers:
        # scipy.stats has no `lilliefors` function (it lives in statsmodels),
        # so the former `try: stats.lilliefors(...)` always raised and a bare
        # `except:` silently fell through to Anderson-Darling. Call
        # Anderson-Darling directly so genuine errors are no longer swallowed.
        result = stats.anderson(x, dist='norm')
        stat = result.statistic
        crit, level = _anderson_critical_value(result, alpha)
        if level is None:
            # No usable table entry: keep the conservative "reject" verdict.
            level = 5.0
        if crit is not None and stat < crit:
            interpret = f'Provavelmente normal (não rejeita H0 no nível {level:g}%)'
        else:
            interpret = f'Provavelmente não normal (rejeita H0 no nível {level:g}%)'
        return {'teste': 'Anderson-Darling',
                'estatistica': stat,
                'p_valor': 'N/A',
                'interpretacao': interpret}

    if n <= 5000:
        stat, pval = stats.shapiro(x)
        teste = 'Shapiro-Wilk'
    else:
        # Shapiro-Wilk p-values may be inaccurate for n > 5000.
        stat, pval = stats.normaltest(x)
        teste = 'DAgostino-Pearson'

    if pval > alpha:
        interpret = 'Provavelmente normal (não rejeita H0)'
    else:
        interpret = 'Provavelmente não normal (rejeita H0)'

    return {'teste': teste,
            'estatistica': stat,
            'p_valor': f'{pval:.4f}',
            'interpretacao': interpret}

In [42]:
# Run the normality decision flow on every numeric column and print each
# field of the returned dict (test name, statistic, p-value, verdict).
for col in df.select_dtypes(include=[np.number]).columns:
    print(f'--- Coluna: {col} ---')
    resultado = normality_test_fluxogram(df[col], alpha=0.05)
    for k,v in resultado.items():
        print(f'{k}: {v}')
    # Blank separator between columns.
    print('\n')

--- Coluna: network_packet_size ---
teste: DAgostino-Pearson
estatistica: 21.51383340771568
p_valor: 0.0000
interpretacao: Provavelmente não normal (rejeita H0)


--- Coluna: login_attempts ---
teste: Anderson-Darling
estatistica: 104.03917350582924
p_valor: N/A
interpretacao: Provavelmente não normal (rejeita H0 no nível 5%)


--- Coluna: session_duration ---
teste: Anderson-Darling
estatistica: 331.29627471515414
p_valor: N/A
interpretacao: Provavelmente não normal (rejeita H0 no nível 5%)


--- Coluna: ip_reputation_score ---
teste: DAgostino-Pearson
estatistica: 311.7387739840571
p_valor: 0.0000
interpretacao: Provavelmente não normal (rejeita H0)


--- Coluna: failed_logins ---
teste: Anderson-Darling
estatistica: 289.68962874121644
p_valor: N/A
interpretacao: Provavelmente não normal (rejeita H0 no nível 5%)


--- Coluna: unusual_time_access ---
teste: Anderson-Darling
estatistica: 2116.62860675278
p_valor: N/A
interpretacao: Provavelmente não normal (rejeita H0 no nível 5%)


--