In [None]:
import pandas as pd

In [None]:
# Load the dataset from a CSV file in the working directory and display it.
# NOTE(review): confirm the file name 'neww_data.csv' (double "w") is intentional.
df = pd.read_csv('neww_data.csv')
df

In [None]:
# Remove every space character from the column names (this is what allows
# attribute-style access such as `df.Inbound`), then show the cleaned names.
df.columns = df.columns.str.replace(' ','')
df.columns

In [None]:
# Default `describe()` summary of the dataset.
print(df.describe())

# How often each distinct value occurs in the `Inbound` column.
print(df['Inbound'].value_counts())

In [None]:
# Independent copy of the loaded frame; later cells write scaled values and
# derived columns into `df1` instead of `df`.
df1 = df.copy()

In [None]:
# Preview the first five rows of the unscaled frame.
df.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Select numerical columns (only the float64 and int64 dtypes are matched)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply min-max scaling (not standard scaling): MinMaxScaler rescales each
# selected column to the [0, 1] range by default. The scaler is fitted on the
# unscaled `df` and the result is written into the copy `df1`.
scaler = MinMaxScaler()
df1[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
# Column dtypes and non-null counts of the scaled frame.
df1.info()

In [None]:
# Preview the first ten rows of the scaled frame.
df1.head(10)

In [None]:
# Importing libraries
import numpy as np
from sklearn.cluster import KMeans

# Cluster on the feature columns only. Dropping any previously added
# 'cluster'/'label' columns keeps this cell idempotent: without it, re-running
# the cell would feed the old assignments back into KMeans as features.
cluster_features = df1.drop(columns=['cluster', 'label'], errors='ignore')

# Applying KMeans clustering. The fixed seed makes the cluster ids (and hence
# the derived labels and everything downstream) reproducible across restarts.
kmeans = KMeans(n_clusters=3, n_init=15, random_state=42)
clusters = kmeans.fit_predict(cluster_features)

# Add the cluster id as a new column of the scaled DataFrame `df1`
df1['cluster'] = clusters

# Assuming the smallest cluster is an anomaly, assign a binary label:
# 1 for rows in the least populated cluster, 0 for all other rows.
smallest_cluster = df1['cluster'].value_counts().idxmin()
df1['label'] = np.where(df1['cluster'] == smallest_cluster, 1, 0)

In [None]:
# Distinct cluster ids and distinct binary label values present in `df1`.
df1.cluster.unique(), df1.label.unique()

In [None]:
# Number of rows per label value (shows how imbalanced the derived label is).
df1.label.value_counts()

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information between every feature and the binary label.
# The derived 'label' and 'cluster' columns are excluded from the features.
X = df1.drop(['label','cluster'], axis=1)
y = df1['label']

# mutual_info_classif adds small random noise to continuous features, so the
# scores (and the resulting ranking) vary between runs unless it is seeded.
mi = mutual_info_classif(X, y, random_state=42)

# Rank features from most to least informative and show the top ten.
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series.head(10))

In [None]:
# Show the fifteen highest mutual-information scores.
print(mi_series.head(15))

In [None]:
# Total number of features that were scored.
len(mi_series)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination with cross-validation: one feature is removed
# per step (step=1) and each candidate subset is scored with 4-fold CV.
# The random forest is seeded so that its feature importances, and therefore
# the selected feature set, are reproducible between runs.
estimator = RandomForestClassifier(random_state=42)
selector = RFECV(estimator, step=1, cv=4)
selector = selector.fit(X, y)
print(selector.support_)  # Boolean mask over X.columns: True = feature kept

In [None]:
# Feature names chosen by each method: the ten best MI scores versus the
# columns kept by the RFECV support mask.
selected_features_mi = mi_series.index[:10]
selected_features_rfecv = X.columns[selector.support_]

# Print the features both methods agree on.
print(set(selected_features_mi) & set(selected_features_rfecv))

In [None]:
# Union: every feature picked by at least one of the two methods.
mi_rfecv_combined = set(selected_features_mi) | set(selected_features_rfecv)
mi_rfecv_combined

In [None]:
# Intersection: only the features picked by both methods.
mi_rfecv_intersection = set(selected_features_mi) & set(selected_features_rfecv)
mi_rfecv_intersection

In [None]:
# Names of the MI-selected features and the shape of that index.
selected_features_mi, selected_features_mi.shape

In [None]:
# Names of the RFECV-selected features and the shape of that index.
selected_features_rfecv, selected_features_rfecv.shape

In [None]:
# Re-display the number of rows per label value.
df1.label.value_counts()

In [None]:
# Final feature list: the features selected by BOTH MI and RFECV (despite the
# "combined" name, this is the intersection, not the union).
# A set has no guaranteed iteration order, so sort the names to keep the
# column order of the feature matrix stable across kernel restarts.
combined_features_list = sorted(mi_rfecv_intersection)
combined_features_list

In [None]:
# Number of features retained for modelling.
len(combined_features_list)

In [None]:
# Restrict the scaled frame to the retained feature columns and preview it.
combined_features_df = df1.loc[:, combined_features_list]
combined_features_df.head()

In [None]:
# Attach the label Series `y` to the selected features; `join` aligns the two
# on their index.
combined_features_df_with_label = combined_features_df.join(y)
combined_features_df_with_label

In [None]:
# Label counts after the join.
combined_features_df_with_label.label.value_counts()

In [None]:
# Feature matrix for modelling: everything except the target column.
X = combined_features_df_with_label.drop(columns=['label'])
X

In [None]:
# Target vector for modelling: the binary anomaly label.
y = combined_features_df_with_label['label']
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# - stratify=y keeps the label ratio the same in both splits. The positive
#   class is the smallest KMeans cluster, so an unstratified shuffle can leave
#   the test set with very few positives and make the metrics unstable.
# - random_state pins the shuffle so every run evaluates on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, shuffle=True, stratify=y, random_state=42
)

In [None]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

In [None]:
import inspect

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

# scikit-learn renamed AdaBoostClassifier's `base_estimator` argument to
# `estimator` (deprecated in 1.2, removed in 1.4). Detect which keyword the
# installed version accepts so this cell runs on both old and new releases.
ada_estimator_arg = (
    'estimator'
    if 'estimator' in inspect.signature(AdaBoostClassifier).parameters
    else 'base_estimator'
)

# Define the model: AdaBoost boosting an SVC. The SVC kernel is left at its
# default here; only C is tuned below. AdaBoost is seeded for reproducibility.
svm = SVC(probability=True)
adaboost = AdaBoostClassifier(random_state=42, **{ada_estimator_arg: svm})

# Define the hyperparameter grid. Parameters of the wrapped SVC are addressed
# as '<estimator keyword>__<param>', so the prefix must match the keyword above.
param_grid = {
    f'{ada_estimator_arg}__C': [0.1, 1, 10],
    'n_estimators': [50, 100, 150],
}

# Grid search with 5-fold cross-validation on the training split only; the
# test split is not touched here.
grid_search = GridSearchCV(estimator=adaboost, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Extracting the best parameters and their mean cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
# Best hyperparameter combination found and its mean cross-validated score.
best_params, best_score

In [None]:
from sklearn.svm import SVC

# Initialize and train the SVM model (linear kernel, C=0.1).
# probability=True fits an extra, internally cross-validated calibration step
# whose data shuffling is random; seeding it keeps predict_proba (used later
# for the ROC curve) identical between runs.
svm_model = SVC(kernel='linear', C=0.1, probability=True, random_state=42)
svm_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict labels for the held-out test rows with the standalone SVM.
y_pred_svm = svm_model.predict(X_test)

# Compute the scores first, then report them.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("SVM Model Performance:")
print("Accuracy:", svm_accuracy)
print("\nClassification Report:\n", svm_report)

In [None]:
import inspect

from sklearn.ensemble import AdaBoostClassifier

# scikit-learn renamed AdaBoostClassifier's `base_estimator` argument to
# `estimator` (deprecated in 1.2, removed in 1.4). Use whichever keyword the
# installed version accepts so this cell runs on both old and new releases.
ada_estimator_arg = (
    'estimator'
    if 'estimator' in inspect.signature(AdaBoostClassifier).parameters
    else 'base_estimator'
)

# Initialize and train the AdaBoost model with SVM as the base estimator.
# The base learner uses the same settings as the standalone SVM (linear
# kernel, C=0.1).
adaboost_svm_model = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
    **{ada_estimator_arg: SVC(kernel='linear', C=0.1, probability=True)},
)
adaboost_svm_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the boosted SVM ensemble.
y_pred_adaboost = adaboost_svm_model.predict(X_test)

# Compute the scores first, then report them.
adaboost_accuracy = accuracy_score(y_test, y_pred_adaboost)
adaboost_report = classification_report(y_test, y_pred_adaboost)

print("AdaBoost-SVM Model Performance:")
print("Accuracy:", adaboost_accuracy)
print("\nClassification Report:\n", adaboost_report)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Side-by-side test accuracy of the two models, printed in a fixed order.
predictions_by_caption = {
    "AdaBoost - Accuracy:": y_pred_adaboost,
    "SVM - Accuracy:": y_pred_svm,
}
for caption, predictions in predictions_by_caption.items():
    print(caption, accuracy_score(y_test, predictions))

In [None]:
# Precision, recall and F1 of the standalone SVM on the test split.
precision, recall, f1 = (
    metric(y_test, y_pred_svm)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

In [None]:
# Precision, recall and F1 of the AdaBoost-SVM ensemble on the test split.
precision, recall, f1 = (
    metric(y_test, y_pred_adaboost)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the standalone SVM on the test split.
cm = confusion_matrix(y_test, y_pred_svm)

# Draw it as an annotated heatmap with integer cell counts and no colour bar.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix For Support Vector Classifier')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the AdaBoost-SVM ensemble on the test split.
cm = confusion_matrix(y_test, y_pred_adaboost)

# Draw it as an annotated heatmap with integer cell counts and no colour bar.
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix For SVM-AdaBoost')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Positive-class probabilities (column 1 of predict_proba) for both models.
svm_scores = svm_model.predict_proba(X_test)[:, 1]
ada_scores = adaboost_svm_model.predict_proba(X_test)[:, 1]

# ROC points and area under the curve for the SVM
fpr_svc, tpr_svc, _ = roc_curve(y_test, svm_scores)
roc_auc_svc = auc(fpr_svc, tpr_svc)

# ROC points and area under the curve for AdaBoost
fpr_ada, tpr_ada, _ = roc_curve(y_test, ada_scores)
roc_auc_ada = auc(fpr_ada, tpr_ada)

# Draw both curves on shared axes together with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr_svc, tpr_svc, label='SVM ROC curve (area = %0.2f)' % roc_auc_svc)
ax.plot(fpr_ada, tpr_ada, label='AdaBoost ROC curve (area = %0.2f)' % roc_auc_ada)
ax.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal line

# Plot settings
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc='lower right')
plt.show()