In [None]:
!pip install tensorflow


In [None]:
# Mount Google Drive at /content/drive/ so the dataset path used below resolves.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Show floats with three decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Default figure size (width, height) in inches.
plt.rcParams["figure.figsize"] = (10,6)

In [None]:
# KDDTrain+.txt is a plain comma-separated dump with no header line. Without
# header=None pandas would use the first record as column labels and silently
# drop it from the data; the real names are attached in a later cell.
df_0 = pd.read_csv("/content/drive/MyDrive/IDS_Train_Dataset/KDDTrain+.txt", header=None)
# Work on a copy so the frame as loaded stays available in df_0.
df = df_0.copy()
df.head()

In [None]:
# Names for the 43 fields of each record, in file order. The last two entries
# are the 'attack' label and the 'level' field.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'level',
]

# Rename the frame's columns positionally; len(columns) must equal df.shape[1].
df.columns = columns

In [None]:
# Preview the first five rows with the column names attached.
df.head(5)

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Exploration helper: per-column listing of distinct values and their frequencies
def unique_values(df, columns):
    """Print the distinct values and value counts of each requested column.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels to report on; each must exist in ``df``.

    Returns:
        None. The report is written to stdout.
    """
    thin_rule = '-' * 30
    thick_rule = '=' * 40

    for column_name in columns:
        series = df[column_name]
        distinct = series.unique()
        frequencies = series.value_counts()

        print(f"Column: {column_name}\n{thin_rule}")
        print(f"Unique Values ({len(distinct)}): {distinct}\n")
        print(f"Value Counts:\n{frequencies}\n{thick_rule}\n")

In [None]:
# Object-dtype columns are the string-valued (categorical) fields; summarise each one.
cat_features = df.select_dtypes(include='object').columns
unique_values(df, cat_features)

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Confirm the column names were assigned as expected
print(df.columns)

# Collapse every specific attack name into one 'attack' class so the target is
# binary: 'normal' stays 'normal', anything else becomes 'attack'.
attack_n = ["normal" if label == 'normal' else "attack" for label in df['attack']]
df['attack'] = attack_n

In [None]:
# Check column names in the DataFrame
print(df.columns)

# Same binarisation as the previous cell, vectorised. On a column that already
# holds only 'normal' / 'attack' this leaves every value unchanged.
attack_n = np.where(df['attack'] == 'normal', "normal", "attack").tolist()
df['attack'] = attack_n


In [None]:
# Distinct target values after the binarisation above.
df['attack'].unique()

In [None]:
# One histogram per numeric column; the trailing ';' suppresses the array-of-axes repr.
df.hist(bins=43,figsize=(20,30));

In [None]:
# Fixed colour per protocol so the bars read the same way in every figure
palette = {'tcp': 'skyblue', 'udp': 'orange', 'icmp': 'green'}

# Record counts per class, split by transport protocol
plt.figure(figsize=(16, 4))
sns.countplot(data=df, x='attack', hue='protocol_type', palette=palette)
plt.title('Attack Counts over Protocol Types', fontdict={'fontsize': 16})
plt.xticks(rotation=45)
plt.show()

In [None]:
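# Reading the plot above: most attacks arrive over TCP, followed by UDP, with ICMP contributing the fewest. (TODO: confirm the UDP vs. ICMP ordering against the per-class counts; the ordering by total traffic and the ordering by attack count need not agree.)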

In [None]:
# Share of records per protocol (fractions summing to 1).
df["protocol_type"].value_counts(normalize=True)

In [None]:
# Class counts split by the is_guest_login flag
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='is_guest_login', hue='attack', palette='Set2')
plt.title('Distribution of Attack Types by Guest Login')
plt.xlabel('Is Guest Login')
plt.ylabel('Count')
plt.legend(title='Attack Type')
plt.grid(True)
plt.show()

In [None]:
# Recompute the object-dtype columns; these are the ones that still need encoding.
cat_features = df.select_dtypes(include='object').columns
cat_features

In [None]:
from sklearn import preprocessing
# Integer-encode the string columns. A separate encoder is fitted and kept per
# column (in `encoders`) so each mapping can be inspected or inverted later;
# refitting one shared encoder would discard all but the last column's classes.
# LabelEncoder numbers classes in sorted order, so for the binary target
# 'attack' -> 0 and 'normal' -> 1.
clm = ['protocol_type', 'service', 'flag', 'attack']
encoders = {}
for x in clm:
    le = preprocessing.LabelEncoder()
    df[x] = le.fit_transform(df[x])
    encoders[x] = le
# After the loop `le` is the encoder fitted on 'attack' (the last column).

In [None]:
from sklearn.model_selection import train_test_split

# 'attack' is the target. 'level' is also excluded from the features: in NSL-KDD
# the trailing field is a per-record difficulty rating produced from classifier
# results on the labelled data, not a traffic measurement, so keeping it would
# leak label-derived information that is unavailable for live traffic.
X = df.drop(["attack", "level"], axis=1)
y = df["attack"]

# Hold out 10% of the records for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=43)

In [None]:
# Keep the feature names: X_train is later replaced by a bare array and loses them.
train_index = X_train.columns
train_index

**Feature Engineering**

In [None]:
from sklearn.feature_selection import chi2
import pandas as pd

# Inputs: X_train (DataFrame) and y_train from the split above.

# Chi-square statistic of every feature against the target
chi_stats, p_values = chi2(X_train, y_train)

# Label each score with its feature name (X_train must still carry column names)
chi_scores = pd.Series(chi_stats, index=X_train.columns)

# Strongest association first
chi_scores_sorted = chi_scores.sort_values(ascending=False)

# Show the ranking
print(chi_scores_sorted)


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the label, drawn as a ranked bar chart
mutual_info = mutual_info_classif(X_train, y_train)
mutual_info_series = pd.Series(data=mutual_info, index=X_train.columns)
(
    mutual_info_series
    .sort_values(ascending=False)
    .plot.bar(figsize=(20, 5), title='Feature Importance Scores based on Mutual Information')
)
plt.show()


In [None]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# chi2 needs non-negative inputs, so map every feature into [0, 1] first
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Chi-square statistic of each scaled feature, drawn in rank order
chi_scores, p_values = chi2(X_train_scaled, y_train)
chi_scores_series = pd.Series(data=chi_scores, index=X_train.columns)
(
    chi_scores_series
    .sort_values(ascending=False)
    .plot.bar(figsize=(20, 5), title='Feature Importance Scores based on Chi-Square Test')
)
plt.show()


In [None]:
# Ranked mutual-information scores
mutual_info_scores = mutual_info_classif(X_train, y_train)
mutual_info_series = (
    pd.Series(mutual_info_scores, index=X_train.columns)
    .sort_values(ascending=False)
)

# Ranked chi-square scores (reuses the `scaler` created in the previous cell)
X_train_scaled = scaler.fit_transform(X_train)
chi_scores, p_values = chi2(X_train_scaled, y_train)
chi_scores_series = (
    pd.Series(chi_scores, index=X_train.columns)
    .sort_values(ascending=False)
)

# Stack the two rankings for a side-by-side read
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
panel_specs = (
    (mutual_info_series, 'Mutual Information Feature Importance'),
    (chi_scores_series, 'Chi-Square Feature Importance'),
)
for axis, (ranking, panel_title) in zip(axes, panel_specs):
    ranking.plot.bar(ax=axis, title=panel_title)
plt.show()


In [None]:
from sklearn.feature_selection import mutual_info_classif, chi2, SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Feature subsets -------------------------------------------------------
# Ten best features by mutual information (raw training data)
mi_selector = SelectKBest(mutual_info_classif, k=10)
X_mi = mi_selector.fit_transform(X_train, y_train)

# Ten best features by chi-square (needs non-negative input, hence MinMax scaling)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
chi_selector = SelectKBest(chi2, k=10)
X_chi = chi_selector.fit_transform(X_train_scaled, y_train)

# --- One random forest per subset ------------------------------------------
rf_mi = RandomForestClassifier(random_state=42)
rf_mi.fit(X_mi, y_train)
y_pred_mi = rf_mi.predict(mi_selector.transform(X_test))

# The test split goes through the scaler fitted on the training split
X_test_scaled = scaler.transform(X_test)
rf_chi = RandomForestClassifier(random_state=42)
rf_chi.fit(X_chi, y_train)
y_pred_chi = rf_chi.predict(chi_selector.transform(X_test_scaled))

# --- Report ----------------------------------------------------------------
reports = (
    ("Mutual Information Features Model Performance:", y_pred_mi),
    ("\nChi-Square Features Model Performance:", y_pred_chi),
)
for heading, predictions in reports:
    print(heading)
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))


In [None]:
from sklearn.feature_selection import mutual_info_classif
# Mutual-information score per feature, labelled with the saved feature names
mutual_info = pd.Series(mutual_info_classif(X_train, y_train), index=train_index)
mutual_info.sort_values(ascending=False)



In [None]:
# Bar chart of the ranking above; the trailing ';' hides the Axes repr.
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 5));

In [None]:
from sklearn.feature_selection import SelectKBest
# Keep the 30 features with the highest mutual information and list their names
Select_features = SelectKBest(mutual_info_classif, k=30).fit(X_train, y_train)
train_index[Select_features.get_support()]

In [None]:
# Hard-coded subset of 15 feature names used by the models below.
# NOTE: this rebinds `columns`, which previously held all 43 dataset field names.
# NOTE(review): the list is typed in by hand rather than derived from the
# selector scores; confirm it really is the intended "top 15".
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'wrong_fragment', 'hot', 'logged_in', 'num_compromised',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
]

# We continue with the top 15 features, because the dataset is big enough
X_train, X_test = X_train[columns], X_test[columns]

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Learn mean/variance on the training split only, then apply that same
# transform to both splits, so no test-set statistics leak into training.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
!pip install xgboost

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Baseline estimators, each with a fixed seed so repeated fits are comparable
Logistic_model = LogisticRegression(random_state=42)
XGBoost_model = XGBClassifier(random_state=42)

In [None]:
# Fit the XGBoost baseline; the value returned by fit() is kept under the name XGBoost.
XGBoost = XGBoost_model.fit(X_train,y_train)

In [None]:
# Fit the logistic-regression baseline; the value returned by fit() is kept under the name Logistic.
Logistic = Logistic_model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [None]:
# Helper for spotting over- or under-fitting: the same report on both splits
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print confusion matrix and classification report for test and train data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        None. Output goes to stdout, test split first, then train split.
    """
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)

    def _report(title, truth, predicted):
        # One titled section: confusion matrix followed by per-class metrics
        print(title)
        print(confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

    _report("Test_Set", y_test, y_pred)
    print()
    _report("Train_Set", y_train, y_train_pred)

In [None]:
# Train vs. test report for the logistic-regression baseline.
eval_metric(Logistic_model, X_train, y_train, X_test, y_test)

In [None]:
# Train vs. test report for the XGBoost baseline.
eval_metric(XGBoost_model, X_train, y_train, X_test, y_test)

In [None]:
# Initialize the models
from sklearn.ensemble import RandomForestClassifier
# Two tree-based classifiers with the same seed, fitted on the same training split
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Reporting helper shared by the evaluations below
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print confusion matrix and classification report for both data splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        None. The test-set section is printed first, then the train-set section.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    sections = (
        ("Test Set Evaluation", y_test, test_predictions),
        ("Train Set Evaluation", y_train, train_predictions),
    )
    for heading, truth, predicted in sections:
        print(heading)
        print(confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))

# Same report for each fitted model, under its own banner
for banner, fitted_model in (
    ("Random Forest Evaluation:", rf_model),
    ("\nXGBoost Evaluation:", xgb_model),
):
    print(banner)
    evaluate_model(fitted_model, X_train, y_train, X_test, y_test)

In [None]:
# Search space for the XGBoost grid search.
# The learning rates are all strictly positive: a rate of 0 scales every tree's
# contribution to nothing, so candidates using it cannot learn and only add
# wasted fits to the search.
param_grid = {
    "n_estimators": [50, 64, 100, 128],
    "max_depth": [2, 3, 4, 5, 6],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "subsample": [0.5, 0.8],
    "colsample_bytree": [0.5, 0.8],
}

In [None]:
# Necessary imports
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Re-split the full feature matrix X (every column, not the 15-feature subset).
# NOTE: this rebinds X_train / X_test / y_train / y_test for all later cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (optional depending on model requirements)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model definitions
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Soft voting averages the two models' predicted class probabilities
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_clf),
    ('xgb', xgb_clf)],
    voting='soft')

# Train the model
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)

# Confusion matrix: rows are actual labels, columns predicted, in label order 0, 1
cm = confusion_matrix(y_test, y_pred)

# The target was label-encoded in sorted order ('attack' -> 0, 'normal' -> 1),
# so the display names must be listed as Attack first, Normal second.
class_names = ['Attack', 'Normal']

# Visualize Confusion Matrix using heatmap
cm_fig = plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Voting Classifier')

# Save before show(): once the inline backend has shown the figure it is closed,
# and a later plt.savefig would write an empty canvas.
cm_fig.savefig("confusion_matrix_voting_classifier.png")
plt.show()

# Detailed Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=class_names))


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# Feed-forward network: two ReLU hidden layers and one sigmoid output unit.
# The sigmoid output is the predicted probability of label 1.
model = Sequential([
    Dense(units=128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 10% of the training split is held back by Keras for per-epoch validation
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Threshold the predicted probabilities at 0.5 to obtain hard labels
y_pred_ann = (model.predict(X_test) > 0.5).astype(int)

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ann))

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_ann))

# Accuracy
ann_accuracy = accuracy_score(y_test, y_pred_ann)
print(f"ANN Accuracy: {ann_accuracy:.4f}")

# Learning curves: one figure for accuracy, one for loss
for history_key, pretty_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[history_key], label=f'Train {pretty_name}')
    plt.plot(history.history[f'val_{history_key}'], label=f'Validation {pretty_name}')
    plt.xlabel('Epochs')
    plt.ylabel(pretty_name)
    plt.title(f'Model {pretty_name}')
    plt.legend()
    plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are searched
XGB_model = XGBClassifier(random_state=42)

# Exhaustive search over param_grid, scored by F1 and parallelised over all cores.
# NOTE(review): scoring="f1" scores label 1; with the sorted label encoding used
# earlier that is the 'normal' class. Confirm this is the class to optimise for.
grid_search = GridSearchCV(
    estimator=XGB_model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
)
XGB_grid_model = grid_search.fit(X_train, y_train)

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with synthetic points; training split only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Report the resampled sizes
size_lines = (
    ("Training set size after SMOTE (X_train_res):", X_train_res),
    ("Training target size after SMOTE (y_train_res):", y_train_res),
)
for caption, resampled in size_lines:
    print(caption, resampled.shape)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance of the training labels before and after SMOTE, side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (ax[0], y_train, "Class Distribution Before SMOTE"),
    (ax[1], y_train_res, "Class Distribution After SMOTE"),
)
for axis, labels, panel_title in panels:
    sns.countplot(x=labels, ax=axis, palette="Set2")
    axis.set_title(panel_title)
    axis.set_xlabel("Classes")
    axis.set_ylabel("Count")

plt.tight_layout()
plt.show()


In [None]:
# Mean cross-validated score (the grid search's `scoring` metric) of the best candidate.
XGB_grid_model.best_score_

In [None]:
# Hyper-parameter combination that achieved the best score.
XGB_grid_model.best_params_

In [None]:
#final model

from xgboost import XGBClassifier

# Hyper-parameters of the final model, fixed by hand.
# NOTE(review): presumably copied from XGB_grid_model.best_params_; confirm
# they still match after the grid search is re-run.
final_params = dict(
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=128,
    subsample=0.8,
)
XGB_model = XGBClassifier(**final_params)

# Fit the classifier to the current training split
XGB_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import f1_score, recall_score, roc_auc_score

# Predictions from the final XGBoost model (XGB_model). The `*_rf` variable
# names are kept from an earlier draft; they hold XGBoost results, not Random Forest.
y_pred_rf = XGB_model.predict(X_test)
y_pred_proba_rf = XGB_model.predict_proba(X_test)

# f1_score and recall_score report on label 1 by default; with the sorted label
# encoding used earlier ('attack' -> 0, 'normal' -> 1) that is the 'normal' class.
rf_f1 = f1_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
# AUC is computed from the predicted probability of label 1 (second column)
rf_auc = roc_auc_score(y_test, y_pred_proba_rf[:, 1])

# Print the results under the name of the model that produced them
print(f"XGBoost F1 Score: {rf_f1}")
print(f"XGBoost Recall: {rf_recall}")
print(f"XGBoost AUC: {rf_auc}")


In [None]:
# Train vs. test report for the final XGBoost model.
eval_metric(XGB_model, X_train, y_train, X_test, y_test)

In [None]:
model = XGB_model
importances = model.feature_importances_

# The importance vector has one entry per feature the model was fitted on, and
# that depends on which cell last defined X_train (the 15-feature subset or the
# full matrix X). Pick the matching names instead of always assuming the subset,
# which raises a length-mismatch ValueError when the model saw every column.
if len(importances) == len(columns):
    importance_index = list(columns)
elif hasattr(X, "columns") and len(importances) == len(X.columns):
    importance_index = list(X.columns)
else:
    # Names are unknown (e.g. X is already a bare array): fall back to positions
    importance_index = [f"feature_{i}" for i in range(len(importances))]

feats = pd.DataFrame(index=importance_index, data=importances, columns=['XGB_importance'])
ada_imp_feats = feats.sort_values("XGB_importance", ascending=False)
ada_imp_feats

In [None]:
## Ensemble learning

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Base learners for the ensembles below, all seeded for repeatability.
# SVC needs probability=True so it can contribute to soft voting.
svm_model = SVC(probability=True, random_state=42)
xgboost_model = XGBClassifier(random_state=42)
logistic_model = LogisticRegression(random_state=42)


In [None]:
# Logistic Regression + XGBoost, averaging predicted probabilities
ensemble_log_xgb = VotingClassifier(
    voting='soft',
    estimators=[('logistic', logistic_model), ('xgboost', xgboost_model)],
)

# SVM + XGBoost, averaging predicted probabilities
ensemble_svm_xgb = VotingClassifier(
    voting='soft',
    estimators=[('svm', svm_model), ('xgboost', xgboost_model)],
)


In [None]:
# Fit both soft-voting ensembles on the training split
for ensemble in (ensemble_log_xgb, ensemble_svm_xgb):
    ensemble.fit(X_train, y_train)

# Hard class predictions on the held-out split
y_pred_log_xgb = ensemble_log_xgb.predict(X_test)
y_pred_svm_xgb = ensemble_svm_xgb.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix

for heading, predictions in (
    ("Logistic Regression + XGBoost Ensemble", y_pred_log_xgb),
    ("SVM + XGBoost Ensemble", y_pred_svm_xgb),
):
    print(heading)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))


In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define individual models
logistic_model = LogisticRegression(random_state=42)
xgboost_model = XGBClassifier(random_state=42)
svm_model = SVC(probability=True, random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)

# Logistic Regression + XGBoost Ensemble
ensemble_log_xgb = VotingClassifier(estimators=[
    ('logistic', logistic_model),
    ('xgboost', xgboost_model)
], voting='soft')

# SVM + XGBoost Ensemble
ensemble_svm_xgb = VotingClassifier(estimators=[
    ('svm', svm_model),
    ('xgboost', xgboost_model)
], voting='soft')

# Random Forest + XGBoost Ensemble
ensemble_rf_xgb = VotingClassifier(estimators=[
    ('random_forest', random_forest_model),
    ('xgboost', xgboost_model)
], voting='soft')

# Train ensembles
ensemble_log_xgb.fit(X_train, y_train)
ensemble_svm_xgb.fit(X_train, y_train)
ensemble_rf_xgb.fit(X_train, y_train)

# Hard class predictions (used for the confusion matrix and the report)
y_pred_log_xgb = ensemble_log_xgb.predict(X_test)
y_pred_svm_xgb = ensemble_svm_xgb.predict(X_test)
y_pred_rf_xgb = ensemble_rf_xgb.predict(X_test)

# Predicted probability of label 1 (used for ROC AUC)
y_score_log_xgb = ensemble_log_xgb.predict_proba(X_test)[:, 1]
y_score_svm_xgb = ensemble_svm_xgb.predict_proba(X_test)[:, 1]
y_score_rf_xgb = ensemble_rf_xgb.predict_proba(X_test)[:, 1]

# Evaluation
# NOTE: this definition replaces the earlier five-argument
# evaluate_model(model, X_train, y_train, X_test, y_test) for the rest of the session.
def evaluate_model(model_name, y_test, y_pred, y_score=None):
    """Print the standard classification metrics for one ensemble.

    Args:
        model_name: Label printed as the section heading.
        y_test: True labels of the evaluation split.
        y_pred: Predicted hard labels for the same rows.
        y_score: Optional continuous scores for label 1 (e.g. predicted
            probabilities). When given, ROC AUC is computed from them. When
            omitted, ROC AUC falls back to ``y_pred``, which only reflects a
            single operating point of the classifier.

    Returns:
        None. All output goes to stdout.
    """
    print(f"{model_name} Ensemble")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    # ROC AUC ranks samples by score, so it needs continuous scores to be meaningful
    auc_input = y_pred if y_score is None else y_score
    print(f"ROC AUC Score: {roc_auc_score(y_test, auc_input):.4f}")
    print("\n")

evaluate_model("Logistic Regression + XGBoost", y_test, y_pred_log_xgb, y_score_log_xgb)
evaluate_model("SVM + XGBoost", y_test, y_pred_svm_xgb, y_score_svm_xgb)
evaluate_model("Random Forest + XGBoost", y_test, y_pred_rf_xgb, y_score_rf_xgb)


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt

# Base learners for the four-way ensemble
logistic_model = LogisticRegression(random_state=42)
svm_model = SVC(probability=True, random_state=42)
decision_tree_model = DecisionTreeClassifier(random_state=42)
naive_bayes_model = GaussianNB()

# Soft-voting ensemble over all four, fitted on the training split
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[
        ('logistic', logistic_model),
        ('svm', svm_model),
        ('decision_tree', decision_tree_model),
        ('naive_bayes', naive_bayes_model),
    ],
)
ensemble_model.fit(X_train, y_train)

# Hard predictions and the text report
y_pred_ensemble = ensemble_model.predict(X_test)

print("Ensemble Model")
print(confusion_matrix(y_test, y_pred_ensemble))
print(classification_report(y_test, y_pred_ensemble))

# ROC curve from the predicted probability of label 1
y_pred_prob_ensemble = ensemble_model.predict_proba(X_test)[:, 1]
fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, y_pred_prob_ensemble)
roc_auc_ensemble = auc(fpr_ensemble, tpr_ensemble)

roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
roc_ax.plot(fpr_ensemble, tpr_ensemble, color='blue', lw=2, label='Ensemble (AUC = %0.2f)' % roc_auc_ensemble)
# Diagonal = performance of a random classifier
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve for Ensemble Model')
roc_ax.legend(loc="lower right")
plt.show()

# Precision-recall curve from the same probabilities
precision_ensemble, recall_ensemble, _ = precision_recall_curve(y_test, y_pred_prob_ensemble)

pr_fig, pr_ax = plt.subplots(figsize=(10, 8))
pr_ax.plot(recall_ensemble, precision_ensemble, color='blue', lw=2, label='Ensemble')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve for Ensemble Model')
pr_ax.legend(loc="lower left")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Uses the fitted `ensemble_model` and the current X_train / X_test / y_train / y_test.

# Hard predictions on both splits
y_train_pred = ensemble_model.predict(X_train)
y_test_pred = ensemble_model.predict(X_test)

# Confusion matrices
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

# Training and testing matrices side by side
plt.figure(figsize=(12, 6))
heatmap_panels = (
    (conf_matrix_train, 'Confusion Matrix - Training Data'),
    (conf_matrix_test, 'Confusion Matrix - Testing Data'),
)
for position, (matrix, panel_title) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(panel_title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

plt.tight_layout()
plt.show()

# Text reports, training split first
for heading, truth, predicted in (
    ("Classification Report - Training Data", y_train, y_train_pred),
    ("Classification Report - Testing Data", y_test, y_test_pred),
):
    print(heading)
    print(classification_report(truth, predicted))

# ROC curve from the predicted probability of label 1
y_pred_prob_ensemble = ensemble_model.predict_proba(X_test)[:, 1]
fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, y_pred_prob_ensemble)
roc_auc_ensemble = auc(fpr_ensemble, tpr_ensemble)

plt.figure(figsize=(10, 8))
plt.plot(fpr_ensemble, tpr_ensemble, color='blue', lw=2, label='Ensemble (AUC = %0.2f)' % roc_auc_ensemble)
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic (ROC) Curve for Ensemble Model')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

# Precision-recall curve from the same probabilities
precision_ensemble, recall_ensemble, _ = precision_recall_curve(y_test, y_pred_prob_ensemble)

plt.figure(figsize=(10, 8))
plt.plot(recall_ensemble, precision_ensemble, color='blue', lw=2, label='Ensemble')
plt.title('Precision-Recall Curve for Ensemble Model')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="lower left")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded scores, one value per model and metric series.
# NOTE(review): these numbers are typed in rather than computed in this
# notebook, and the legend only says 'Metric 1/2/3'; confirm which metric each
# series represents and where the values come from.
models = ['SVM', 'SVM +\nXGBoost', 'Decision Tree +\nXGBoost', 'XGBoost', 'Random Forest +\nXGBoost']
metric1 = [0.9844, 0.9965, 0.9946, 0.9834, 0.9986]
metric2 = [0.9835, 0.9953, 0.9929, 0.9848, 0.9986]
metric3 = [0.9876, 0.9982, 0.9970, 0.9868, 0.9986]

width = 0.25                 # bar width
x = np.arange(len(models))   # centre position of each model's group

fig, ax = plt.subplots(figsize=(12, 6))

# Three bars per model: left, centre, right of the group position
rects1, rects2, rects3 = (
    ax.bar(x + offset, scores, width, label=series_label, color=series_color)
    for offset, scores, series_label, series_color in (
        (-width, metric1, 'Metric 1', '#4e79a7'),
        (0, metric2, 'Metric 2', '#f28e2b'),
        (width, metric3, 'Metric 3', '#e15759'),
    )
)

# Axis text, ticks and legend
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Performance Score')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

# Zoom the y-axis into the range where the scores differ
ax.set_ylim(0.98, 1.001)

# Value labels above the bars
def autolabel(rects):
    """Annotate each bar with its height, formatted to four decimals.

    The text is drawn on the module-level ``ax``, centred over the bar,
    three points above its top edge and rotated by 90 degrees.

    Args:
        rects: Iterable of bar patches, as returned by ``ax.bar``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.4f}',
            xy=(x_center, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
            rotation=90,
            fontsize=8,
        )

# Label every bar of every series
for bar_group in (rects1, rects2, rects3):
    autolabel(bar_group)

# Tighten the layout and render
fig.tight_layout()
plt.show()

# Uncomment the following line to save the figure
# plt.savefig('model_performance_comparison.png', dpi=300, bbox_inches='tight')

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# X and y come from the earlier preprocessing cells. Work on an array view of
# the features but leave X itself untouched, so that re-running this cell still
# finds the real column names instead of falling back to generic ones.
if isinstance(X, pd.DataFrame):
    feature_names = list(X.columns)
    X_values = X.values
else:
    X_values = np.asarray(X)
    feature_names = [f"feature_{i}" for i in range(X_values.shape[1])]

X_train, X_test, y_train, y_test = train_test_split(X_values, y, test_size=0.2, random_state=42)

# Feature selection: keep the 30 features with the highest mutual information
select_features = SelectKBest(mutual_info_classif, k=30)
X_train_selected = select_features.fit_transform(X_train, y_train)
X_test_selected = select_features.transform(X_test)

# Column indices (into X) of the retained features, in ascending column order
selected_indices = select_features.get_support(indices=True)

# Narrow down to the 15 best-scoring of those. The selector returns columns in
# their original order, so slicing the first 15 would pick by position rather
# than by score; rank by score explicitly instead.
N_TOP_FEATURES = 15
retained_scores = select_features.scores_[selected_indices]
top_positions = np.sort(np.argsort(retained_scores)[::-1][:N_TOP_FEATURES])

# Names and data of the top features, kept in original column order
selected_features = [feature_names[i] for i in selected_indices[top_positions]]
X_train_selected = X_train_selected[:, top_positions]
X_test_selected = X_test_selected[:, top_positions]

# Scaling: fit on the training split only, then apply to both
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Define models
models = {
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

print("\nSelected features:")
print(selected_features)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# X and y come from the earlier preprocessing cells.
# Keep the column names if X is still a DataFrame, then continue with a bare array.
if not isinstance(X, pd.DataFrame):
    feature_names = [f"feature_{i}" for i in range(X.shape[1])]
else:
    feature_names = X.columns
    X = X.values

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Positional shortcut: keep the first 15 columns (not a ranked selection)
X_train, X_test = X_train[:, :15], X_test[:, :15]

# Standardise with statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, keyed by the display name used in the results table
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# One result record per model; precision/recall/F1 are support-weighted averages
results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    results.append({
        'Model': name,
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test, y_pred),
    })

# Scores for further models, entered by hand (no accuracy available for these).
# NOTE(review): in four of the five rows 'F1 Score' is higher than both
# 'Precision' and 'Recall', which cannot happen for an F1 score; the three
# columns are probably mislabelled. Confirm the source of these numbers.
# NOTE(review): 'XGBoost' is also a key of the freshly computed results, so the
# final table holds two 'XGBoost' rows that a grouped bar plot will merge.
hand_entered_scores = (
    ('SVM', 0.9844, 0.9835, 0.9876),
    ('SVM + XGBoost', 0.9965, 0.9953, 0.9982),
    ('Decision Tree + XGBoost', 0.9946, 0.9929, 0.9970),
    ('XGBoost', 0.9834, 0.9848, 0.9868),
    ('Random Forest + XGBoost', 0.9986, 0.9986, 0.9986),
)
additional_results = [
    {'Model': model_label, 'Precision': p, 'Recall': r, 'F1 Score': f, 'Accuracy': None}
    for model_label, p, r, f in hand_entered_scores
]

# Merge them into the computed results and tabulate everything
results.extend(additional_results)
results_df = pd.DataFrame(results)

# Grouped bar chart of precision, recall and F1 per model
plt.figure(figsize=(12, 6))
sns.set_style("whitegrid")

# Long format: one row per (model, metric) pair, as seaborn expects for `hue`
melted_df = results_df.melt(
    id_vars=['Model'],
    value_vars=['Precision', 'Recall', 'F1 Score'],
    var_name='Metric',
    value_name='Score',
)

sns.barplot(data=melted_df, x='Model', y='Score', hue='Metric')

# Zoom the y-axis to 0.85 to 1.0, where the models differ
plt.ylim([0.85, 1.0])

plt.title('Zoomed Model Comparison: Precision, Recall, and F1 Score', fontsize=16)
plt.ylabel('Score', fontsize=12)
plt.xlabel('Model', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Metric', title_fontsize=12, fontsize=10)
plt.tight_layout()

# Show the plot
plt.show()

# Print the accuracy of every model that has one. The hand-entered rows were
# given Accuracy=None, but pandas stores that as NaN in a float column, so an
# `is not None` test never filters them out; drop the missing values instead.
print("\nAccuracy for each model:")
for index, row in results_df.dropna(subset=['Accuracy']).iterrows():
    print(f"{row['Model']}: {row['Accuracy']:.4f}")
