In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib
import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the input directory,
# in the same order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cicflowdump/flows.csv
/kaggle/input/xgboost-final/other/default/1/XGBoost-intersection.json


In [2]:
# Read the flows CSV into a DataFrame and preview its first rows
flows_csv_path = "/kaggle/input/cicflowdump/flows.csv"
data = pd.read_csv(flows_csv_path)
data.head()

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,timestamp,flow_duration,flow_byts_s,flow_pkts_s,fwd_pkts_s,...,bwd_pkts_b_avg,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,cwr_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts
0,192.168.1.4,52.71.2.195,58124,443,6,2024-08-05 09:05:15,16123247.0,133.0377,1.240445,0.620222,...,0.0,0.0,0.0,114.2,100.3,0,10,10,1142,1003
1,140.82.112.25,192.168.1.4,443,58116,6,2024-08-05 09:05:16,306324.0,1044.646,13.058069,6.529035,...,0.0,0.0,0.0,79.0,81.0,0,2,2,158,162
2,192.168.1.4,157.240.192.52,57348,5222,6,2024-08-05 09:05:17,282890.0,1435.187,14.139772,7.069886,...,0.0,0.0,0.0,101.0,102.0,0,2,2,202,204
3,192.168.1.3,192.168.1.255,40953,15600,17,2024-08-05 09:05:17,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,77.0,0.0,0,1,0,77,0
4,192.168.1.4,116.119.77.13,60179,443,17,2024-08-05 09:05:22,62280.0,17427810.0,14916.506101,1461.143224,...,23.742857,755453.501722,211408200.0,107.186813,1283.591885,0,91,838,9754,1075650


In [3]:
# Mapping from the raw column names in `data` to the feature names used for
# the model input frame. The dict order defines the column order of X.
column_mapping = {
    'dst_port': 'Dst Port',
    'totlen_fwd_pkts': 'TotLen Fwd Pkts',
    'flow_iat_mean': 'Flow IAT Mean',
    'flow_iat_max': 'Flow IAT Max',
    'fwd_iat_tot': 'Fwd IAT Tot',
    'fwd_iat_mean': 'Fwd IAT Mean',
    'fwd_iat_max': 'Fwd IAT Max',
    'fwd_iat_min': 'Fwd IAT Min',
    'fwd_header_len': 'Fwd Header Len',
    'fwd_pkts_s': 'Fwd Pkts/s',
    'bwd_pkts_s': 'Bwd Pkts/s',
    'subflow_fwd_byts': 'Subflow Fwd Byts',
    'init_fwd_win_byts': 'Init Fwd Win Byts',
    'init_bwd_win_byts': 'Init Bwd Win Byts',
    'timestamp': 'Timestamp',
    # 'Timestamp' is only an intermediate column: it is split into the
    # integer 'Date' and 'Time' columns below and then dropped.
}

# Select only the relevant columns
relevant_columns = list(column_mapping)

# Fail early with a message that names every absent column at once, instead
# of the generic KeyError raised by the column selection itself.
missing_columns = [col for col in relevant_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Input data is missing expected columns: {missing_columns}")

# Parse the timestamp once; values that do not match the format raise.
timestamps = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S')

# Build the feature frame without mutating `data`, so this cell can be re-run:
#   1. keep only the mapped columns and rename them,
#   2. drop the raw 'Timestamp' column,
#   3. append 'Date' (YYYYMMDD) and 'Time' (HHMMSS) as integers, using the
#      vectorized .dt.strftime accessor rather than a per-row Python lambda.
#      Leading zeros are lost in the integer form (e.g. 09:05:15 -> 90515).
X = (
    data[relevant_columns]
    .rename(columns=column_mapping)
    .drop(columns=['Timestamp'])
    .assign(
        Date=timestamps.dt.strftime('%Y%m%d').astype('int64'),
        Time=timestamps.dt.strftime('%H%M%S').astype('int64'),
    )
)

# Display the first few rows of the processed DataFrame
X.head()

Unnamed: 0,Dst Port,TotLen Fwd Pkts,Flow IAT Mean,Flow IAT Max,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Fwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Subflow Fwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Date,Time
0,443,1142,848591.947368,5006195.0,16123247.0,1791472.0,5006195.0,2186.0,200,0.620222,0.620222,1142,2048,425,20240805,90515
1,58116,158,102108.0,305998.0,0.0,0.0,0.0,0.0,40,6.529035,6.529035,158,74,2048,20240805,90516
2,5222,202,94296.666667,238460.0,0.0,0.0,0.0,0.0,40,7.069886,7.069886,202,2048,538,20240805,90517
3,15600,77,0.0,0.0,0.0,0.0,0.0,0.0,8,0.0,0.0,77,0,0,20240805,90517
4,443,9754,67.112069,13041.0,56790.0,631.0,14814.0,1.0,728,1461.143224,13455.362877,9754,0,0,20240805,90522


In [4]:
# Create an empty booster, then populate it from the saved JSON model file
model_path = '/kaggle/input/xgboost-final/other/default/1/XGBoost-intersection.json'
model = xgb.Booster()
model.load_model(model_path)

In [5]:
# Wrap the feature frame in a DMatrix, the input type passed to the booster's predict()
dmatrix = xgb.DMatrix(data=X)

In [6]:
# Score every row of the DMatrix with the loaded booster
predictions = model.predict(dmatrix)

# For each row, take the column index holding the largest score as its label
predicted_labels = predictions.argmax(axis=1)

# Show the resulting label for every row
print(predicted_labels)

[0 0 0 0 0 0 0]
