In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

In [2]:
# Load the dataset
file_path = "C:\\Users\\maldo\\Downloads\\rt-iot2022\\RT_IOT2022"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0.1,Unnamed: 0,id.orig_p,id.resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


In [3]:
# Identify columns with a single unique value
single_value_columns = data.nunique()
single_value_columns = single_value_columns[single_value_columns <= 1].index.tolist()

# Potentially irrelevant columns
potentially_irrelevant_columns = ['Unnamed: 0'] + single_value_columns  # 'Unnamed: 0' seems like an identifier

# Remove the identified irrelevant columns
cleaned_data = data.drop(columns=potentially_irrelevant_columns)

# Display the first few rows of the cleaned dataset to confirm removal
cleaned_data.head()

Unnamed: 0,id.orig_p,id.resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,0.281148,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,0.282277,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,0.280164,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,0.281593,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,0.282111,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


In [11]:
data_encoded = pd.get_dummies(cleaned_data, columns=['proto', 'service'])

# Assuming `data_encoded` now holds fully numerical dataset
X = data_encoded.drop('Attack_type', axis=1)  # Exclude the target variable for PCA
y = data_encoded['Attack_type']  # This assumes `Attack_type` has been properly converted to numerical format

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=0.95)  # Adjust based on desired variance retention
X_pca = pca.fit_transform(X_scaled)

# Output the results
print("Number of components kept: ", pca.n_components_)
print("Explained variance ratio: ", pca.explained_variance_ratio_.sum())

Number of components kept:  32
Explained variance ratio:  0.9549256505616013


In [12]:
# Convert the PCA-transformed data back into a DataFrame
columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
pca_transformed_df = pd.DataFrame(X_pca, columns=columns)

# Display the first few rows of the transformed dataset
print(pca_transformed_df.head())

        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0  9.181003 -6.828075  2.017667  3.493589 -2.778754  3.234117  0.615673   
1  9.187093 -6.837351  2.052823  3.462516 -2.813211  3.165257  0.414530   
2  9.204040 -6.850761  2.048676  3.493847 -2.792665  3.249311  0.604165   
3  9.214250 -6.857204  2.093186  3.447216 -2.827063  3.196384  0.406184   
4  9.196380 -6.839160  2.054188  3.461204 -2.806359  3.176410  0.453214   

        PC8       PC9      PC10  ...      PC23      PC24      PC25      PC26  \
0  0.126813  1.635588 -1.821072  ...  0.148239  0.120094  0.158559 -0.035362   
1  0.068961  1.752168 -1.800029  ...  0.158393  0.273758  0.146082 -0.113306   
2  0.079998  1.664626 -1.815278  ...  0.154272  0.208125  0.159944 -0.076916   
3 -0.009204  1.783648 -1.777180  ...  0.167873  0.422677  0.154546 -0.180699   
4  0.060387  1.727191 -1.786030  ...  0.157689  0.279946  0.154514 -0.115990   

       PC27      PC28      PC29      PC30      PC31      PC32  
0  0

In [13]:
# Get the PCA components (loadings) of each principal component
loadings = pd.DataFrame(pca.components_.T, columns=columns, index=data_encoded.drop('Attack_type', axis=1).columns)

# Display the loadings of the first few principal components
print(loadings.head())

# Optionally, examine the explained variance of each principal component
explained_variance = pd.DataFrame(pca.explained_variance_ratio_, index=columns, columns=['Explained Variance'])
print(explained_variance.head())

                    PC1       PC2       PC3       PC4       PC5       PC6  \
id.orig_p      0.042447 -0.014949  0.060254 -0.057154 -0.020129  0.042753   
id.resp_p     -0.005321 -0.011028  0.100577 -0.028184 -0.016064  0.051762   
flow_duration  0.071071  0.022230  0.080763  0.089717  0.320079  0.043140   
fwd_pkts_tot   0.107078  0.116300  0.074584  0.083850  0.243081  0.095024   
bwd_pkts_tot   0.105243  0.240189  0.059194  0.117260 -0.100214 -0.008594   

                    PC7       PC8       PC9      PC10  ...      PC23  \
id.orig_p     -0.043645 -0.147088  0.063071  0.051785  ...  0.018044   
id.resp_p     -0.046242 -0.039362  0.020200  0.145856  ... -0.001436   
flow_duration -0.120218 -0.013884 -0.049916 -0.072925  ... -0.007336   
fwd_pkts_tot   0.038011 -0.009283  0.037100 -0.039392  ...  0.004188   
bwd_pkts_tot  -0.005131  0.008112 -0.000818  0.017235  ...  0.007817   

                   PC24      PC25      PC26      PC27      PC28      PC29  \
id.orig_p      0.288436  0.

In [16]:
# Assuming X_pca is PCA-transformed features and y is the target variable
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

In [18]:
# Make predictions on the test set
y_pred = random_forest_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9971572449642625
                            precision    recall  f1-score   support

            ARP_poisioning       0.97      0.99      0.98      1578
            DDOS_Slowloris       0.99      1.00      1.00       100
             DOS_SYN_Hping       1.00      1.00      1.00     18897
              MQTT_Publish       1.00      1.00      1.00       871
Metasploit_Brute_Force_SSH       0.83      0.83      0.83         6
             NMAP_FIN_SCAN       1.00      0.67      0.80         3
         NMAP_OS_DETECTION       1.00      1.00      1.00       393
             NMAP_TCP_scan       1.00      1.00      1.00       220
             NMAP_UDP_SCAN       1.00      0.98      0.99       489
       NMAP_XMAS_TREE_SCAN       1.00      0.99      1.00       384
               Thing_Speak       0.99      0.98      0.98      1625
                Wipro_bulb       1.00      0.84      0.92        58

                  accuracy                           1.00     24624
                 

In [21]:
explained_variance.reset_index

<bound method DataFrame.reset_index of       Explained Variance
PC1             0.171079
PC2             0.111220
PC3             0.089963
PC4             0.084553
PC5             0.071052
PC6             0.044223
PC7             0.034369
PC8             0.033565
PC9             0.028635
PC10            0.023875
PC11            0.021491
PC12            0.019984
PC13            0.019341
PC14            0.017782
PC15            0.016662
PC16            0.013358
PC17            0.012821
PC18            0.012557
PC19            0.011775
PC20            0.011113
PC21            0.010786
PC22            0.010753
PC23            0.010730
PC24            0.010350
PC25            0.009382
PC26            0.009213
PC27            0.008654
PC28            0.008390
PC29            0.007392
PC30            0.007140
PC31            0.006634
PC32            0.006082>