In [9]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

# Define column names
columns_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
"wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
"num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
"num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
"is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
"rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
"dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
"dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
"dst_host_rerror_rate", "dst_host_srv_rerror_rate", "attack", "last_flag"]

# Load the dataset
df = pd.read_csv("/content/Train.txt", header=None, names=columns_names)

# Keep only necessary columns
columns_to_include = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "count", "srv_count", "serror_rate",
    "rerror_rate", "same_srv_rate", "diff_srv_rate", "attack"
]
df_filtered = df[columns_to_include]

### ðŸ”¹ Step 1: Convert protocol_type (tcp, udp, icmp) to numeric
protocol_mapping = {"tcp": 6, "udp": 17, "icmp": 1}
df_filtered["protocol_type"] = df_filtered["protocol_type"].map(protocol_mapping)

# Fill unknown protocols with a placeholder (e.g., -1)
df_filtered["protocol_type"].fillna(-1, inplace=True)

df_filtered["attack"] = df_filtered["attack"].apply(lambda x: "normal" if x == "normal" else "malicious")

### ðŸ”¹ Step 2: One-Hot Encode `service` (Only Keep `http`)
df_filtered["service"] = df_filtered["service"].apply(lambda x: "http" if x == "http" else "other")

df_filtered = pd.get_dummies(df_filtered, columns=["service"], dtype=int)

### ðŸ”¹ Step 3: Handle `flag` column
# Fill missing flags with "unknown"
df_filtered["flag"] = df_filtered["flag"].fillna("OTH")

valid_flags = {"SF", "S0", "S1", "RSTR"}

df_filtered = df_filtered[df_filtered["flag"].isin(valid_flags)]


df_filtered = pd.get_dummies(df_filtered, columns=["flag"], dtype=int)

### ðŸ”¹ Step 4: Ensure numerical consistency with sniffed data
numeric_columns = ["duration", "src_bytes", "dst_bytes", "land", "wrong_fragment",
                   "urgent", "count", "srv_count", "serror_rate", "rerror_rate",
                   "same_srv_rate", "diff_srv_rate"]

# Convert to float for consistency
df_filtered[numeric_columns] = df_filtered[numeric_columns].astype(float)

### ðŸ”¹ Step 5: Save the processed dataset
processed_file_path = "processed_data.csv"
df_filtered.to_csv(processed_file_path, index=False)

print(f"âœ… Processed dataset saved to: {processed_file_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["protocol_type"] = df_filtered["protocol_type"].map(protocol_mapping)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered["protocol_type"].fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filt

âœ… Processed dataset saved to: processed_data.csv


In [2]:
df_filtered.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,count,srv_count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,6.980115,31.226469,6.979996,45566.74,19779.11,0.000198,0.022687,0.000111,84.107555,27.737888,0.284485,0.119958,0.660928,0.063053
std,2604.51531,3.884072,16.34647,2.689365,5870331.0,4021269.0,0.014086,0.25353,0.014366,114.508607,72.63584,0.446456,0.320436,0.439623,0.180314
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.0,20.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.09,0.0
50%,0.0,6.0,24.0,9.0,44.0,0.0,0.0,0.0,0.0,14.0,8.0,0.0,0.0,1.0,0.0
75%,0.0,6.0,49.0,9.0,276.0,516.0,0.0,0.0,0.0,143.0,18.0,1.0,0.0,1.0,0.06
max,42908.0,17.0,69.0,10.0,1379964000.0,1309937000.0,1.0,3.0,3.0,511.0,511.0,1.0,1.0,1.0,1.0


In [10]:


len(df_filtered)

112582

In [11]:
df_filtered.columns

Index(['duration', 'protocol_type', 'src_bytes', 'dst_bytes', 'land',
       'wrong_fragment', 'urgent', 'count', 'srv_count', 'serror_rate',
       'rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'attack',
       'service_http', 'service_other', 'flag_RSTR', 'flag_S0', 'flag_S1',
       'flag_SF'],
      dtype='object')

In [12]:
# Define the desired column order
column_order = ['duration', 'protocol_type', 'service_http', 'service_other',
                'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_SF',
                'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
                'count', 'srv_count', 'serror_rate', 'rerror_rate',
                'same_srv_rate', 'diff_srv_rate', 'attack']

# Reorder the DataFrame
df_filtered = df_filtered[column_order]

In [13]:
df_filtered.head()

Unnamed: 0,duration,protocol_type,service_http,service_other,flag_RSTR,flag_S0,flag_S1,flag_SF,src_bytes,dst_bytes,land,wrong_fragment,urgent,count,srv_count,serror_rate,rerror_rate,same_srv_rate,diff_srv_rate,attack
0,0.0,6,0,1,0,0,0,1,491.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,normal
1,0.0,17,0,1,0,0,0,1,146.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0,0.0,0.08,0.15,normal
2,0.0,6,0,1,0,1,0,0,0.0,0.0,0.0,0.0,0.0,123.0,6.0,1.0,0.0,0.05,0.07,malicious
3,0.0,6,1,0,0,0,0,1,232.0,8153.0,0.0,0.0,0.0,5.0,5.0,0.2,0.0,1.0,0.0,normal
4,0.0,6,1,0,0,0,0,1,199.0,420.0,0.0,0.0,0.0,30.0,32.0,0.0,0.0,1.0,0.0,normal


In [14]:
df_normal = df_filtered[df_filtered["attack"] == "normal"]
df_malicious = df_filtered[df_filtered["attack"] == "malicious"]

df_normal.to_csv("normal.csv", index=False)
df_malicious.to_csv("malicious.csv", index=False)


In [15]:
len(df_malicious)

48328

In [16]:
len(df_normal)

64254