In [None]:
import pandas as pd

# Load Excel file
xlsx_path = 'modeldata.xlsx'  # Change to your actual filename
df = pd.read_excel(xlsx_path)

# Save as CSV
csv_path = xlsx_path.replace('.xlsx', '.csv')
df.to_csv(csv_path, index=False)

print(f"Converted and saved to: {csv_path}")


Converted and saved to: modeldata.csv


In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('classdata.csv')

# Add or update the 'Label' column based on 'Src IP'
df['Label'] = df['Src IP'].apply(lambda ip: 'IoT Device' if ip == '192.168.219.59' or ip == '192.168.195.59' or ip == '10.101.101.40' or ip == '192.168.190.59' else 'Non-IoT Device')

# Save changes to the same file (overwrite)
df.to_csv('classdata.csv', index=False)

print("Label column updated in 'bothdata.csv'")


Label column updated in 'bothdata.csv'


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# === UTILITIES ===
def clean_df(df):
    # Replace infinite values with NaN, then fill NaNs with 0 (or you could df.dropna())
    return df.replace([np.inf, -np.inf], np.nan).fillna(0)

# === TRAINING ===
train_df = pd.read_csv('classdata.csv')

# Drop IP and Port columns
drop_cols = ['Src IP', 'Dst IP', 'Src Port', 'Dst Port']
train_df = train_df.drop(columns=[c for c in drop_cols if c in train_df.columns])

# Features and labels
X = train_df.drop('Label', axis=1)
y = train_df['Label']

# One-hot encoding
X_encoded = pd.get_dummies(X)

# Clean any infinities / NaNs
X_encoded = clean_df(X_encoded)

# Label encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# XGBoost classifier
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate
val_preds = model.predict(X_val)
print("Validation Report:\n",
      classification_report(y_val, val_preds, target_names=le.classes_))

# === PREDICTION ===
test_path = 'malicious_only.csv'
test_df = pd.read_csv(test_path)

# Clean and align test features
test_df_clean = test_df.drop(columns=[c for c in drop_cols if c in test_df.columns], errors='ignore')
test_encoded = pd.get_dummies(test_df_clean)

# Ensure same columns as training
test_encoded = test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Clean infinities / NaNs in test set
test_encoded = clean_df(test_encoded)

# Predict classes and probabilities
pred_probs = model.predict_proba(test_encoded)
pred_classes = model.predict(test_encoded)

# Decode labels and attach confidence
test_df['Label'] = le.inverse_transform(pred_classes)
test_df['Confidence'] = pred_probs.max(axis=1)

# Filter: keep only high-confidence rows
filtered_df = test_df[test_df['Confidence'] >= 0.967].copy()

# Count IoT vs Non-IoT
counts = filtered_df['Label'].value_counts()
iot_count = counts.get('IoT', 0)
non_iot_count = counts.sum() - iot_count

print(f"Total high-confidence packets: {len(filtered_df)}")
print(f"IoT packets: {iot_count}")
print(f"Non-IoT packets: {non_iot_count}")

# Save filtered predictions
filtered_df.to_csv(test_path, index=False)
print(f"Filtered predictions (confidence ≥ 0.967) saved to: {test_path}")
filtered_df['Label'].value_counts()

Parameters: { "use_label_encoder" } are not used.



Validation Report:
                 precision    recall  f1-score   support

    IoT Device       1.00      1.00      1.00       307
Non-IoT Device       1.00      1.00      1.00       105

      accuracy                           1.00       412
     macro avg       1.00      1.00      1.00       412
  weighted avg       1.00      1.00      1.00       412

Total high-confidence packets: 1025
IoT packets: 0
Non-IoT packets: 1025
Filtered predictions (confidence ≥ 0.967) saved to: malicious_only.csv


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
IoT Device,1023
Non-IoT Device,2


In [None]:
!pip install joblib==1.4.2
!pip install numpy==1.24.4
!pip install pandas==2.0.3
!pip install scikit-learn==1.3.2
!pip install xgboost==2.1.4



In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import joblib
import warnings

# Suppress XGBoost warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Function to replace infinite values with NaN (replaces the lambda)
def replace_inf_with_nan(x):
    return np.where(np.isinf(x), np.nan, x)

# Load labeled training data
df = pd.read_csv('classdata.csv')

# Drop unwanted columns
X = df.drop(['Label', 'Flow ID', 'Timestamp', 'Src IP', 'Dst IP', 'Src Port', 'Dst Port'], axis=1, errors='ignore')
y_raw = df['Label']

# Split data
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y_raw, test_size=0.20, stratify=y_raw, random_state=42
)

# Encode string labels to numeric
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

# Set XGBoost objective
num_classes = len(label_encoder.classes_)
if num_classes == 2:
    objective = 'binary:logistic'
    eval_metric = 'logloss'
else:
    objective = 'multi:softprob'
    eval_metric = 'mlogloss'

# Define XGBoost parameters
xgb_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.1,
    'use_label_encoder': False,
    'eval_metric': eval_metric,
    'objective': objective,
    'random_state': 42,
    'n_jobs': -1
}
if num_classes > 2:
    xgb_params['num_class'] = num_classes

# Preprocessing pipeline
cat_cols = ['Protocol']
num_cols = [c for c in X_train_raw.columns if c not in cat_cols]
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('num', Pipeline([
        ('clip', FunctionTransformer(replace_inf_with_nan, validate=False)),
        ('impute', SimpleImputer(strategy='median'))
    ]), num_cols)
])

# Build training pipeline
train_pipeline = Pipeline([
    ('pre', preprocessor),
    ('xgb', XGBClassifier(**xgb_params))
])

# Train the model
train_pipeline.fit(X_train_raw, y_train)

# Save the trained model
joblib.dump(train_pipeline, 'classifier.pkl')
print("✅ Trained model saved to 'classifier.pkl'")

# ───────────────────────────────────────────────────────────
# Load model and predict on test.csv
# ───────────────────────────────────────────────────────────
print("\n🔄 Loading model for predictions...")
loaded_pipeline = joblib.load('classifier.pkl')

# Evaluate loaded model on hold-out test set
y_pred = loaded_pipeline.predict(X_test_raw)
print("\n📊 Evaluation on Test Set:")
print("Number of features used for training:", loaded_pipeline.named_steps['pre'].transform(X_train_raw).shape[1])
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Load test data
test_df = pd.read_csv('iotdata.csv')
test_features = test_df.drop(['Flow ID', 'Timestamp', 'Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Label'], axis=1, errors='ignore')

# Predict using loaded model
test_pred_numeric = loaded_pipeline.predict(test_features)
test_pred_label = label_encoder.inverse_transform(test_pred_numeric)
test_proba = loaded_pipeline.predict_proba(test_features).max(axis=1)

# Add predictions to DataFrame
test_df['Predicted_Label'] = test_pred_label
test_df['Confidence'] = test_proba

# Filter low-confidence predictions
filtered_df = test_df[test_df['Confidence'] >= 0.80]

# Summary
print("\n📈 Prediction Summary (Confidence ≥ 0.65):")
print(filtered_df['Predicted_Label'].value_counts())
print("\nPrediction Proportions (%):")
print((filtered_df['Predicted_Label'].value_counts(normalize=True) * 100).round(2))

# Save predictions
filtered_df.to_csv('iotdata.csv', index=False)
print(f"\n✅ Filtered predictions saved to 'iotdata.csv' (kept {len(filtered_df)} of {len(test_df)} rows)")

✅ Trained model saved to 'attack.pkl'

🔄 Loading model for predictions...

📊 Evaluation on Test Set:
Number of features used for training: 79
Classification Report:
                 precision    recall  f1-score   support

    IoT Device       1.00      1.00      1.00       682
Non-IoT Device       1.00      1.00      1.00       301

      accuracy                           1.00       983
     macro avg       1.00      1.00      1.00       983
  weighted avg       1.00      1.00      1.00       983

Confusion Matrix:
 [[681   1]
 [  1 300]]
Accuracy: 0.9979654120040692

📈 Prediction Summary (Confidence ≥ 0.65):
Predicted_Label
Non-IoT Device    579
IoT Device         48
Name: count, dtype: int64

Prediction Proportions (%):
Predicted_Label
Non-IoT Device    92.34
IoT Device         7.66
Name: proportion, dtype: float64

✅ Filtered predictions saved to 'other_data_labeled.csv' (kept 627 of 631 rows)


In [56]:
import pandas as pd
import numpy as np
import joblib

# ─── 1) Load your trained classifier pipeline ───────────────────
pipeline = joblib.load('classifier.pkl')

# ─── 2) Load & preprocess the new data ─────────────────────────
df = pd.read_csv('iotdata.csv')
drop_cols = ['Flow ID','Timestamp','Src IP','Dst IP','Src Port','Dst Port','Label']
X_new = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

# ─── 3) Run predictions ─────────────────────────────────────────
pred_numeric = pipeline.predict(X_new)          # numeric class codes
probs        = pipeline.predict_proba(X_new)    # probabilities

# ─── 4) Attach numeric labels & confidences ────────────────────
df['Predicted_Label'] = pred_numeric
df['Confidence']      = np.max(probs, axis=1)

# ─── 5) (Optional) filter low-confidence predictions ───────────
threshold = 0.80
filtered_df = df[df['Confidence'] >= threshold]

# ─── 6) Save the results ───────────────────────────────────────
output_file = 'noiotdata.csv'
filtered_df.to_csv(output_file, index=False)

print(f"✅ Saved {len(filtered_df)}/{len(df)} rows to {output_file}")
print("Label counts:\n", filtered_df['Predicted_Label'].value_counts())


✅ Saved 214/214 rows to noiotdata.csv
Label counts:
 Predicted_Label
0    198
1     16
Name: count, dtype: int64
