In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn import set_config

set_config(transform_output="pandas")

In [3]:
df = pd.read_csv("./dataset/swarm_training.csv")
unknown_behaviour = pd.read_csv("./dataset/unknown_swarm_behaviour.csv")
df.head()

Unnamed: 0,yC187,yC148,yC7,yC111,yC32,xC54,xC148,yC193,xC61,yC81,...,yS14,xS2,yS6,yC65,xC18,nS64,yC166,nS55,yS122,Swarm_Behaviour
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,24.75,50.0,100.0,0.0,0.0,18.0,1.0,0.0,...,0.0,0.0,-0.79,12.5,-9.0,1.0,12.5,0.0,-1.19,1.0
4,0.0,50.0,24.75,50.0,100.0,0.0,-1.2,17.5,1.0,99.0,...,0.0,0.0,0.0,12.5,-2.25,1.0,12.5,0.0,-1.41,1.0


In [4]:
y = df["Swarm_Behaviour"]
X = df.drop(columns=["Swarm_Behaviour"])

In [5]:
# corr_matrix = X.corr()
# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
# plt.show()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
preprocessing_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

In [8]:
logistic_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing_pipeline),
        ("classifier", LogisticRegression(max_iter=10000, random_state=42))
    ]
)

cross_val_score(logistic_pipeline, X_train, y_train, cv=5, scoring="precision_macro").mean()

0.8757697932701923

In [9]:
# Enhance model performance using Cross Validation
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__penalty": ["l1", "l2"],
    "classifier__class_weight": ["balanced", None],
    "preprocessing__imputer__strategy": ["mean", "median"],
    "classifier__solver": ["liblinear"]
}

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(logistic_pipeline, param_distributions=param_grid, n_iter=50, cv=5, scoring="precision")
random_search.fit(X_train, y_train)



In [10]:
print("Best parameters found: ", random_search.best_params_)
print("Best precision score: ", random_search.best_score_)

Best parameters found:  {'preprocessing__imputer__strategy': 'mean', 'classifier__solver': 'liblinear', 'classifier__penalty': 'l1', 'classifier__class_weight': None, 'classifier__C': 0.01}
Best precision score:  0.8919779339348857


In [11]:
y_pred = random_search.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print((f1, accuracy, precision, recall))

(0.8747465107956578, 0.8971428571428571, 0.8601036269430051, 0.7867298578199052)


In [27]:
y_pred = random_search.predict(X_test)
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))

0.8856731152466505
0.8657575669467624


In [29]:
y_pred_new_samples = random_search.predict(unknown_behaviour)
output_df = pd.DataFrame(y_pred_new_samples, columns=["Is_Swarm"])
print(output_df)

      Is_Swarm
0          1.0
1          0.0
2          0.0
3          1.0
4          0.0
...        ...
2326       0.0
2327       1.0
2328       1.0
2329       0.0
2330       0.0

[2331 rows x 1 columns]


In [51]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


pipeline = Pipeline(steps=[ ("preprocessing", preprocessing_pipeline), ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])

param_grid = {
    'xgb__max_depth': [3, 4, 5, 10],
    'xgb__learning_rate': [0.01, 0.1, 0.3, 0.5],
    'xgb__n_estimators': [10, 30, 50, 100]
}

random_search = GridSearchCV(pipeline, param_grid, cv=10, scoring="f1")
random_search.fit(X_train, y_train)    
print("Best score:", random_search.best_score_)
print("Best parameters:", random_search.best_params_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best score: 0.8648496596789232
Best parameters: {'xgb__learning_rate': 0.3, 'xgb__max_depth': 3, 'xgb__n_estimators': 50}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [53]:
y_pred = random_search.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print((f1, accuracy, precision, recall))

(0.8830627726897528, 0.9028571428571428, 0.8557213930348259, 0.8151658767772512)


In [55]:
y_pred = random_search.predict(X_test)
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))

0.8887825402047878
0.8779305866503843
