In [None]:
# Install dependencies

!pip3 install pandas
!pip3 install scikit-learn mlflow

In [None]:
# Import library

import pandas as pd
import mlflow


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [4]:
# Read dataset
df = pd.read_csv('data/train.csv')

In [11]:
# Check dataset
df.head()
print("Dataset column")
print(df.columns)
print("Summary of dataset info")
print(df.info)

# for col in df.columns:
#   if df[col].dtype != 'object':  # Exclude non-numeric columns
#     min_val = df[col].min()
#     max_val = df[col].max()
#     print(f"Column: {col}")
#     print(f"Minimum: {min_val}")
#     print(f"Maximum: {max_val}")
#     print()

print(df.isnull().sum())


Dataset column
Index(['ID', 'flow_duration', 'Header_Length', 'Protocol type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Label'],
      dtype='object')
Summary of dataset info
<bound method DataFrame.info of               ID  flow_duration  Header_Length  Protocol type  Duration  \
0         769866       0.000000          54.00           6.00     64.00   
1        1859874       0.000892          54.58            NaN     64.00   
2         396092       0.000000           0.00           1.00     64.00   
3         17970

In [6]:
# Fill all null data as -1
print(df.isnull().sum())
data_n_null = df.fillna(-1, inplace=False)
print(data_n_null.isnull().sum())

data_n_null.head()
print(data_n_null.duplicated().sum())


ID                      0
flow_duration           0
Header_Length      195013
Protocol type      195013
Duration           195013
Rate               195013
Srate              195013
Drate              195013
fin_flag_number         0
syn_flag_number         0
rst_flag_number    195013
psh_flag_number    195013
ack_flag_number         0
ece_flag_number    195013
cwr_flag_number    195013
ack_count          195013
syn_count          195013
fin_count               0
urg_count               0
rst_count               0
HTTP               195013
HTTPS              195013
DNS                     0
Telnet             195013
SMTP               195013
SSH                195013
IRC                     0
TCP                195013
UDP                     0
DHCP                    0
ARP                195013
ICMP               195013
IPv                     0
LLC                     0
Tot sum            195013
Min                195013
Max                195013
AVG                     0
Std         

In [7]:
# Set mlflow as tracking server
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
# Random forest follow this: https://gist.github.com/pb111/88545fa33780928694388779af23bf58
data = data_n_null.drop(columns=['ID'])
data_sample = data.sample(frac=0.2, random_state=42)


numerical_columns = data_sample.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
data_sample[numerical_columns] = scaler.fit_transform(data_sample[numerical_columns])

X = data_sample.drop(columns=['Label'])  # Features
y = data_sample['Label']                # Target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

mlflow.set_experiment("Random Forest Experiment")
mlflow.sklearn.autolog()


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []


2024/11/29 23:39:09 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest Experiment' does not exist. Creating a new experiment.


In [9]:


for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), 1):
    # Split training data into train and validation sets for the current fold
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Train the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_fold, y_train_fold)
    
    # Predict on the validation fold
    y_val_pred = model.predict(X_val_fold)
    
    # Compute metrics
    cm = confusion_matrix(y_val_fold, y_val_pred)
    tn, fp, fn, tp = cm.ravel() if cm.size == 4 else (None, None, None, None)  # Handle binary or multiclass cases
    accuracy = accuracy_score(y_val_fold, y_val_pred)
    precision = precision_score(y_val_fold, y_val_pred, average='weighted')
    recall = recall_score(y_val_fold, y_val_pred, average='weighted')
    f1 = f1_score(y_val_fold, y_val_pred, average='weighted')
    
    # Store metrics for this fold
    fold_metrics.append({
        "Fold": fold,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "False Positives": fp,
        "False Negatives": fn,
    })
    print(f"Fold {fold} Results:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print(f"False Positives: {fp}, False Negatives: {fn}")
    print("-" * 40)
    

2024/11/29 23:39:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '40fb15ad629c450a8233e04d2451421f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run selective-wasp-353 at: http://localhost:5000/#/experiments/1/runs/40fb15ad629c450a8233e04d2451421f
🧪 View experiment at: http://localhost:5000/#/experiments/1


2024/11/29 23:41:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'baac1469d46c485e94734ac9bc7bdfcb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fold 1 Results:
Accuracy: 0.9236, Precision: 0.9255, Recall: 0.9236, F1 Score: 0.9212
False Positives: None, False Negatives: None
----------------------------------------
🏃 View run agreeable-tern-74 at: http://localhost:5000/#/experiments/1/runs/baac1469d46c485e94734ac9bc7bdfcb
🧪 View experiment at: http://localhost:5000/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 Results:
Accuracy: 0.9228, Precision: 0.9244, Recall: 0.9228, F1 Score: 0.9199
False Positives: None, False Negatives: None
----------------------------------------


2024/11/29 23:44:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1ccc0bb26ad5475b8b3e8b3b1f02b4b5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run melodic-ram-406 at: http://localhost:5000/#/experiments/1/runs/1ccc0bb26ad5475b8b3e8b3b1f02b4b5
🧪 View experiment at: http://localhost:5000/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3 Results:
Accuracy: 0.9234, Precision: 0.9248, Recall: 0.9234, F1 Score: 0.9206
False Positives: None, False Negatives: None
----------------------------------------


2024/11/29 23:46:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cb17a5077c2d4d7ab4a20f5bfea0cf1b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run inquisitive-bat-70 at: http://localhost:5000/#/experiments/1/runs/cb17a5077c2d4d7ab4a20f5bfea0cf1b
🧪 View experiment at: http://localhost:5000/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/11/29 23:48:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4871f34fd82341e897bd787055815157', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fold 4 Results:
Accuracy: 0.9231, Precision: 0.9247, Recall: 0.9231, F1 Score: 0.9203
False Positives: None, False Negatives: None
----------------------------------------
🏃 View run abundant-stag-218 at: http://localhost:5000/#/experiments/1/runs/4871f34fd82341e897bd787055815157
🧪 View experiment at: http://localhost:5000/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5 Results:
Accuracy: 0.9233, Precision: 0.9241, Recall: 0.9233, F1 Score: 0.9206
False Positives: None, False Negatives: None
----------------------------------------
