In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the KDD Cup 1999 dataset
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
features = [
    "duration", "protocol_type", "service", "src_bytes", "dst_bytes", "land", 
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", 
    "num_compromised", "root_shell", "su_attempted", "num_root", 
    "num_file_creations", "num_shells", "num_access_files", 
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count", 
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", 
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", 
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", 
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", 
    "dst_host_serror_rate", "dst_host_srv_serror_rate", 
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "target", "flag"
]
data = pd.read_csv(url, names=features)

In [7]:
data.head()

Unnamed: 0,duration,protocol_type,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target,flag
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [8]:
duplicateRowsDF = data[data.duplicated()]

In [9]:
duplicateRowsDF.head(5)

Unnamed: 0,duration,protocol_type,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target,flag
2500,0,tcp,http,SF,307,60990,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2864,0,tcp,http,SF,351,11485,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2977,0,tcp,http,SF,506,2419,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3003,0,tcp,http,SF,319,1374,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3130,0,tcp,http,SF,234,1212,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.


In [10]:
data.drop_duplicates(subset=features, keep='first', inplace=True)
data.shape

(145586, 42)

In [12]:
data = data.dropna(axis='columns')

In [13]:
categorical_columns = []
continuous_columns = []

for col in data.columns:
    if data[col].dtype == 'object':
        categorical_columns.append(col)
    else:
        continuous_columns.append(col)

In [14]:
print("Continuous columns:")
print(continuous_columns)
print("\nCategorical columns:")
print(categorical_columns)

Continuous columns:
['duration', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'target']

Categorical columns:
['protocol_type', 'service', 'src_bytes', 'flag']


In [17]:
for col in categorical_columns:
    one_hot_encoded = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, one_hot_encoded], axis=1)
    data.drop(col, axis=1, inplace=True)

In [11]:
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

In [19]:
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(data)

In [20]:
df_preprocessed = pd.DataFrame(df_imputed, columns=data.columns)

In [21]:
print(df_preprocessed.head())

   duration  dst_bytes    land  wrong_fragment  urgent  hot  \
0       0.0      181.0  5450.0             0.0     0.0  0.0   
1       0.0      239.0   486.0             0.0     0.0  0.0   
2       0.0      235.0  1337.0             0.0     0.0  0.0   
3       0.0      219.0  1337.0             0.0     0.0  0.0   
4       0.0      217.0  2032.0             0.0     0.0  0.0   

   num_failed_logins  logged_in  num_compromised  root_shell  ...  flag_phf.  \
0                0.0        0.0              1.0         0.0  ...        0.0   
1                0.0        0.0              1.0         0.0  ...        0.0   
2                0.0        0.0              1.0         0.0  ...        0.0   
3                0.0        0.0              1.0         0.0  ...        0.0   
4                0.0        0.0              1.0         0.0  ...        0.0   

   flag_pod.  flag_portsweep.  flag_rootkit.  flag_satan.  flag_smurf.  \
0        0.0              0.0            0.0          0.0         

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [23]:
X = df_preprocessed.drop(columns=['target'])  # Features
y = df_preprocessed['target']  # Target variable

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
import matplotlib.pyplot as plt

In [26]:
# Assuming X_train and y_train are your training features and target
# Fit RandomForestClassifier model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to hold feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'][:39], feature_importance_df['Importance'][:39])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importances')
plt.gca().invert_yaxis()  # Invert y-axis to display highest importance at the top
plt.show()


ValueError: Unknown label type: 'continuous'

In [27]:
df.head()# Set a threshold for selecting important features
threshold = 0.005  # Adjust this threshold as needed

# Select features with importance scores above the threshold
selected_features = feature_importance_df[feature_importance_df['Importance'] > threshold]['Feature']

# Subset the training features (X_train) based on selected features
X_train_selected = X_train[selected_features]

# Use the original target variable (y_train) as-is
y_train_selected = y_train

# Now, you can use X_train_selected and y_train_selected for model training


NameError: name 'df' is not defined

In [39]:
# Subset the testing features (X_test) based on selected features
X_test_selected = X_test[selected_features]


In [40]:
model_selected_features = RandomForestClassifier()
model_selected_features.fit(X_train_selected, y_train)

# Evaluate the model on the testing set
accuracy = model_selected_features.score(X_test_selected, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9891604675876727


In [2]:
import joblib

In [3]:
joblib.dump(model_selected_features,'random_forest_mode1.sav')

NameError: name 'model_selected_features' is not defined