In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler




# Lodaing the df_log_clipped.csv data frame 
df = pd.read_csv('df_log_clipped.csv')

# Checking the data
df.head()
df.shape

(2027332, 43)

In [2]:
df.dtypes

proto                object
state                object
dur                 float64
sbytes                int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
sload               float64
dload               float64
spkts                 int64
dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smean                 int64
dmean                 int64
trans_depth           int64
res_bdy_len           int64
sjit                float64
djit                float64
Src_pkt_AT          float64
Dst_pkt_AT          float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports       int64
ct_state_ttl          int64
ct_flw_http_mthd      int64
is_ftp_login          int64
ct_ftp_cmd            int64
ct_srv_src            int64
ct_srv_dst          

In [3]:
# One-Hot Encoding (OHE) is usually better for Linear Regression.
# Because LR can't interpret integers as categories automatically.
# One-hot creates separate 0/1 columns for each category, which fits LR better.
################################################
# One Hot Encoding
################################################
 
# Identify categorical columns (excluding 'attack_cat')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if 'attack_cat' in categorical_cols:
    categorical_cols.remove('attack_cat')



# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



# Quick check
print(f"Original shape: {df.shape}")
print(f"Encoded shape: {df_encoded.shape}")


Original shape: (2027332, 43)
Encoded shape: (2027332, 200)


In [6]:
# Displaying the data after it's been One Hot Encoded

pd.set_option('display.max_columns', None)
df_encoded.head()


Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,sload,dload,spkts,dpkts,swin,dwin,stcpb,dtcpb,smean,dmean,trans_depth,res_bdy_len,sjit,djit,Src_pkt_AT,Dst_pkt_AT,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label,proto_aessp3d,proto_an,proto_any,proto_argus,proto_aris,proto_arp,proto_ax25,proto_bbnrcc,proto_bna,proto_brsatmon,proto_cbt,proto_cftp,proto_chaos,proto_compaqpeer,proto_cphb,proto_cpnx,proto_crtp,proto_crudp,proto_dcn,proto_ddp,proto_ddx,proto_dgp,proto_egp,proto_eigrp,proto_emcon,proto_encap,proto_esp,proto_etherip,proto_fc,proto_fire,proto_ggp,proto_gmtp,proto_gre,proto_hmp,proto_iatp,proto_ib,proto_icmp,proto_idpr,proto_idprcmtp,proto_idrp,proto_ifmp,proto_igmp,proto_igp,proto_il,proto_inlsp,proto_ip,proto_ipcomp,proto_ipcv,proto_ipip,proto_iplt,proto_ipnip,proto_ippc,proto_ipv6,proto_ipv6frag,proto_ipv6no,proto_ipv6opts,proto_ipv6route,proto_ipxnip,proto_irtp,proto_isis,proto_isoip,proto_isotp4,proto_kryptolan,proto_l2tp,proto_larp,proto_leaf1,proto_leaf2,proto_meritinp,proto_mfensp,proto_mhrp,proto_micp,proto_mobile,proto_mtp,proto_mux,proto_narp,proto_netblt,proto_nsfnetigp,proto_nvp,proto_ospf,proto_pgm,proto_pim,proto_pipe,proto_pnni,proto_prienc,proto_prm,proto_ptp,proto_pup,proto_pvp,proto_qnx,proto_rdp,proto_rsvp,proto_rtp,proto_rvd,proto_satexpak,proto_satmon,proto_sccopmce,proto_scps,proto_sctp,proto_sdrp,proto_securevmtp,proto_sep,proto_skip,proto_sm,proto_smp,proto_snp,proto_spriterpc,proto_sps,proto_srp,proto_st2,proto_stp,proto_sunnd,proto_swipe,proto_tcf,proto_tcp,proto_tlsp,proto_tp,proto_trunk1,proto_trunk2,proto_ttp,proto_udp,proto_udt,proto_unas,proto_uti,proto_vines,proto_visa,proto_vmtp,proto_vrrp,proto_wbexpak,proto_wbmon,proto_wsn,proto_xnet,proto_xnsidp,proto_xtp,proto_zero,state_CLO,state_CON,state_ECO,state_ECR,state_FIN,state_INT,state_MAS,state_NO,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,service_dns,service_ftp,service_http,service_irc,service_none,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl
0,0.001054,132,164,31,29,0,0,500473.9375,621800.9375,2,2,0,0,0,0,66,82,0,0,0.0,0.0,0.016857,0.012916,0.0,0.0,0.0,0,0,0,0,0,3,7,1,3,0.693147,1,1,normal,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,0.035496,528,304,31,29,0,0,87676.08594,50480.17188,4,4,0,0,0,0,132,76,0,0,2.387938,2.458112,2.080066,2.147606,0.0,0.0,0.0,0,0,0,0,0,2,4,2,3,0.693147,1,2,normal,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,0.001118,146,178,31,29,0,0,521894.5313,636282.375,2,2,0,0,0,0,73,89,0,0,0.0,0.0,0.016857,0.012916,0.0,0.0,0.0,0,0,0,0,0,12,8,1,2,0.693147,1,1,normal,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
3,0.001208,132,164,31,29,0,0,436724.5625,542597.1875,2,2,0,0,0,0,66,82,0,0,0.0,0.0,0.042101,0.013903,0.0,0.0,0.0,0,0,0,0,0,6,9,1,1,0.693147,1,1,normal,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
4,0.001168,146,178,31,29,0,0,499572.25,609067.5625,2,2,0,0,0,0,73,89,0,0,0.0,0.0,0.004988,0.002996,0.0,0.0,0.0,0,0,0,0,0,7,9,1,1,0.693147,1,1,normal,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False


In [9]:
# This will be our first Logistic Regression Model that predicts the Label (1/0)
# Later we'll create another multi-class predictor model using a subset of the data (where Label == 1) and try to predict the attack_cat

################################################
# Logistic Regression (Optimized with Grid Search)
################################################
# First Model: Label Prediction (1/0)
################################################


# Split features and label
X = df_encoded.drop(['label', 'attack_cat'], axis=1)
y_label = df_encoded['label']



# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.2, random_state=42, stratify=y_label)


# Scaling the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# This will be our first Logistic Regression Model that predicts the Label (1/0)
# Later we'll create another multi-class predictor model using a subset of the data (where Label == 1) and try to predict the attack_cat

################################################
# Logistic Regression (Optimized with Grid Search)
################################################
# First Model: Label Prediction (1/0)
################################################

# Set up Logistic Regression with proper solver
lr = LogisticRegression(
    max_iter=10000,             # More iterations to ensure convergence
    class_weight='balanced',
    solver='lbfgs',              # lbfgs is faster for medium to large data
    random_state=42
)

# Grid Search parameters (only search for C now)
param_grid = {
    'C': [0.1, 1]
}

# Grid Search setup
grid_search = GridSearchCV(
    lr,
    param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Train
grid_search.fit(X_train, y_train)

# Best model
best_lr_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predict
y_pred = best_lr_model.predict(X_test)

# Classification report
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Show report
print(report_df)

Fitting 3 folds for each of 2 candidates, totalling 6 fits




[CV] END ......................C=1, penalty=l2, solver=saga; total time=165.8min
[CV] END ....................C=0.1, penalty=l2, solver=saga; total time=165.8min
[CV] END ....................C=0.1, penalty=l2, solver=saga; total time=165.9min
[CV] END ....................C=0.1, penalty=l2, solver=saga; total time=165.9min
[CV] END ......................C=1, penalty=l2, solver=saga; total time=165.9min
[CV] END ......................C=1, penalty=l2, solver=saga; total time=165.9min




Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
              precision    recall  f1-score        support
0              0.993747  0.929545  0.960575  387782.000000
1              0.360733  0.871756  0.510302   17685.000000
accuracy       0.927025  0.927025  0.927025       0.927025
macro avg      0.677240  0.900651  0.735439  405467.000000
weighted avg   0.966138  0.927025  0.940936  405467.000000
