In [5]:
import pandas as pd

df = pd.read_csv('dataset/logsdataset.csv')

## DATA

In [6]:
df

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status
...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error


In [7]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [8]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

## DBSCAN

The user wants to cluster log messages from a DataFrame named `df` using the DBSCAN algorithm. To achieve this, I will:
1.  Install and import the `sentence-transformers` library to generate vector embeddings for the text.
2.  Use a pre-trained model (e.g., `'all-MiniLM-L6-v2'`) to transform the `log_message` column into embeddings.
3.  Apply the `DBSCAN` algorithm from `scikit-learn` to cluster these embeddings.
4.  Store the resulting cluster labels back into the DataFrame.



In [9]:
!pip install sentence-transformers



In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the log messages
embeddings = model.encode(df['log_message'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/76 [00:00<?, ?it/s]

In [11]:
# Initialize and fit DBSCAN
# eps and min_samples may need tuning based on the specific dataset density
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels to the dataframe
df['cluster'] = clusters

# Display cluster distribution (Note: -1 represents noise)
print(df['cluster'].value_counts())

cluster
 0    1194
 1     902
 3     197
 4      58
 2      53
-1       6
Name: count, dtype: int64


In [12]:
df.head()


Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [13]:
embeddings[:2]

array([[-1.02939673e-01,  3.35459784e-02, -2.20260806e-02,
         1.55102601e-03, -9.86915827e-03, -1.78956300e-01,
        -6.34410828e-02, -6.01761751e-02,  2.81108413e-02,
         5.99619783e-02, -1.72618497e-02,  1.43372838e-03,
        -1.49560034e-01,  3.15285241e-03, -5.66031151e-02,
         2.71685794e-02, -1.49890231e-02, -3.54038030e-02,
        -3.62936929e-02, -1.45410579e-02, -5.61500154e-03,
         8.75538066e-02,  4.55120578e-02,  2.50964463e-02,
         1.00187277e-02,  1.24266697e-02, -1.39923573e-01,
         7.68695846e-02,  3.14095095e-02, -4.15249076e-03,
         4.36902270e-02,  1.71250384e-02, -8.00950825e-02,
         5.74005730e-02,  1.89091824e-02,  8.55261683e-02,
         3.96399498e-02, -1.34371817e-01, -1.44371425e-03,
         3.06706317e-03,  1.76854134e-01,  4.44883108e-03,
        -1.69274695e-02,  2.24266555e-02, -4.35050204e-02,
         6.09027082e-03, -9.98169463e-03, -6.23972826e-02,
         1.07371965e-02, -6.04894804e-03, -7.14660361e-0

In [14]:
df[df.cluster == 1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,1
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
11,6/15/2025 11:44,ModernHR,Critical system unit error: unit ID Component55,Critical Error,1
...,...,...,...,...,...
2398,3/31/2025 4:11,ModernHR,Potential vulnerability exploit detected from ...,Security Alert,1
2402,3/13/2025 9:44,BillingSystem,Replication error occurred for shard 10,Error,1
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,1
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


Ab idhar kya hora ki bhai cluster 1 hi tha mera but usme achese ek tarah ke cluster nhi dikhre the merko to fir let's change eps and min_smaples value to tight out the boundary and see if we get a bit better clustering results.

In [15]:
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)
df['cluster'] = clusters
print(df['cluster'].value_counts())

cluster
0      1017
5       147
11      100
13       86
7        60
       ... 
102       1
103       1
105       1
106       1
135       1
Name: count, Length: 136, dtype: int64


In [16]:
df[df.cluster == 1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,1


In [17]:
# ab jaake thode similar aaye , better h pehle se!

I will now summarize the task: I'll identify the clusters with more than 10 records, sort them by size in descending order, and then print 5 sample log messages for each of these significant clusters.



<llm-snippet-file>training.ipynb</llm-snippet-file>


In [18]:
# Identify clusters with more than 10 records and sort them by size
cluster_counts = df['cluster'].value_counts()
large_clusters = cluster_counts[cluster_counts > 10].index

# Iterate through sorted large clusters and print 5 sample log messages
for cluster_id in large_clusters:
    print(f"--- Cluster {cluster_id} (Size: {cluster_counts[cluster_id]}) ---")
    samples = df[df['cluster'] == cluster_id]['log_message'].head(5)
    for msg in samples:
        print(f"- {msg}")
    print("\n")


--- Cluster 0 (Size: 1017) ---
- nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
- nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
- nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
- nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

## REGEX

now here i see patterns in some of the clusters with dynamic values. so i can create regex expression for their sorting now.

Based on the samples observed in the clusters, here is a function that uses regular expressions to classify those log messages.



In [19]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None

In [20]:
classify_with_regex("USer User123 logged in.")

'User Action'

In [21]:
classify_with_regex("Hey you, chill bro")

In [22]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df[df.regex_label.isna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,
...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


In [23]:
df[df.regex_label.notna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action
...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,13,System Notification


In [24]:
df_non_regex = df[df['regex_label'].isna()].copy()
df_non_regex.shape

(1910, 6)

## to dekho abhi apan ne dekha ki regex ki madad se apan ne 500 to classify krliye h with help of regex
## ab baaki bache hue ka we will go with BERT OR LLM

We will use BERT if we have enough y values for that x. O/W we will go with llm bcs we dont have enough training data to teach our model so it's precison and all would be underfitted or overfitted.

In [25]:
# So now we will know which target value has less dataset/rows for it!

In [27]:
df_non_regex['target_label'].value_counts()

target_label
HTTP Status            1017
Security Alert          371
Error                   177
Resource Usage          177
Critical Error          161
Workflow Error            4
Name: count, dtype: int64

In [34]:
to_be_removed = df_non_regex[(df_non_regex['target_label'] == 'Workflow Error') |
                            (df_non_regex['target_label'] == 'Deprecation Warning')]
to_be_removed

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,133,


In [41]:
df_bert = df_non_regex.drop(to_be_removed.index)
df_bert.drop(columns=['regex_label'], inplace=True)
df_bert

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


In [42]:
df_bert[df_bert['source'] == 'LegacyCRM']

Unnamed: 0,timestamp,source,log_message,target_label,cluster


***To ab is dataset pe apan model ko train krenge!*** <br />
***why bert? bcs it captures context aware embedings! not just text aware embeddings (where only similarity of text is seen) ehre similar docs/sentences are grouped ccordingly!***

In [43]:
#1) EMBEDDING
bert_embeddings = model.encode(df_bert['log_message'].tolist(), show_progress_bar=True)
bert_embeddings[:2]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

array([[-1.02939673e-01,  3.35459784e-02, -2.20260806e-02,
         1.55102601e-03, -9.86915827e-03, -1.78956300e-01,
        -6.34410828e-02, -6.01761751e-02,  2.81108413e-02,
         5.99619783e-02, -1.72618497e-02,  1.43372838e-03,
        -1.49560034e-01,  3.15285241e-03, -5.66031151e-02,
         2.71685794e-02, -1.49890231e-02, -3.54038030e-02,
        -3.62936929e-02, -1.45410579e-02, -5.61500154e-03,
         8.75538066e-02,  4.55120578e-02,  2.50964463e-02,
         1.00187277e-02,  1.24266697e-02, -1.39923573e-01,
         7.68695846e-02,  3.14095095e-02, -4.15249076e-03,
         4.36902270e-02,  1.71250384e-02, -8.00950825e-02,
         5.74005730e-02,  1.89091824e-02,  8.55261683e-02,
         3.96399498e-02, -1.34371817e-01, -1.44371425e-03,
         3.06706317e-03,  1.76854134e-01,  4.44883108e-03,
        -1.69274695e-02,  2.24266555e-02, -4.35050204e-02,
         6.09027082e-03, -9.98169463e-03, -6.23972826e-02,
         1.07371965e-02, -6.04894804e-03, -7.14660361e-0

In [45]:
X = bert_embeddings
y = df_bert['target_label']

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

In [46]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.27,random_state=42)

## LOGISTIC REGRESSION

In [47]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train,y_train)

In [50]:
y_pred = clf.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.9883268482490273


In [53]:
confusion_matrix(y_pred,y_test)

array([[ 44,   5,   0,   0,   0],
       [  0,  36,   0,   0,   1],
       [  0,   0, 274,   0,   0],
       [  0,   0,   0,  45,   0],
       [  0,   0,   0,   0, 109]])

In [55]:
print(classification_report(y_pred,y_test))

                precision    recall  f1-score   support

Critical Error       1.00      0.90      0.95        49
         Error       0.88      0.97      0.92        37
   HTTP Status       1.00      1.00      1.00       274
Resource Usage       1.00      1.00      1.00        45
Security Alert       0.99      1.00      1.00       109

      accuracy                           0.99       514
     macro avg       0.97      0.97      0.97       514
  weighted avg       0.99      0.99      0.99       514



## KNN

In [57]:
knn_model = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn_model.fit(X_train,y_train)

In [59]:
y_pred_knn = knn_model.predict(X_test)

In [68]:
print(accuracy_score(y_pred_knn,y_test))
confusion_matrix(y_pred_knn,y_test)

0.9922178988326849


array([[ 43,   3,   0,   0,   0],
       [  1,  38,   0,   0,   0],
       [  0,   0, 274,   0,   0],
       [  0,   0,   0,  45,   0],
       [  0,   0,   0,   0, 110]])

In [62]:
print(classification_report(y_pred_knn,y_test))

                precision    recall  f1-score   support

Critical Error       0.98      0.93      0.96        46
         Error       0.93      0.97      0.95        39
   HTTP Status       1.00      1.00      1.00       274
Resource Usage       1.00      1.00      1.00        45
Security Alert       1.00      1.00      1.00       110

      accuracy                           0.99       514
     macro avg       0.98      0.98      0.98       514
  weighted avg       0.99      0.99      0.99       514



## SVM

In [63]:
svc_model = SVC(kernel='linear', C=1.0, class_weight='balanced', probability=True)

svc_model.fit(X_train, y_train)

In [65]:
y_pred_svm = svc_model.predict(X_test)
accuracy_score(y_pred_svm,y_test)

0.9922178988326849

In [66]:
print(confusion_matrix(y_pred_svm,y_test))

[[ 44   4   0   0   0]
 [  0  37   0   0   0]
 [  0   0 274   0   0]
 [  0   0   0  45   0]
 [  0   0   0   0 110]]


## CHOOSE FINAL MODEL

In [70]:
from sklearn.metrics import f1_score

print('LR : ' , f1_score(y_test, y_pred, average='macro'))
print('KNN : ' , f1_score(y_test, y_pred_knn, average='macro'))
print('SVM : ' , f1_score(y_test, y_pred_svm, average='macro'))

LR :  0.9729494544342092
KNN :  0.981111111111111
SVM :  0.9810479375696767


In [73]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression # Import Logistic Regression

# Assuming X and y are your full dataset (features) and labels (6 categories)
# Ensure X has been preprocessed (e.g., TF-IDF on logs, time features extracted/encoded)

k_folds = 5 # Standard number of folds

# --- Define your models within Pipelines ---

# 1. SVM Model Pipeline (using parameters from your screenshot)
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='linear', C=1.0, class_weight='balanced', probability=False))
])

# 2. KNN Model Pipeline (using parameters from previous discussion)
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=-1))
])

# 3. Logistic Regression Pipeline (multiclass setup with balanced weights)
logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        solver='saga',          # 'saga' solver handles multiclass and L1/L2 penalties well
        multi_class='multinomial', # Explicitly set for 6 categories (softmax)
        class_weight='balanced',   # Handles imbalance like other models
        max_iter=1000,          # Increase iterations if it fails to converge
        n_jobs=-1
    ))
])

print("Running K-Fold Cross-Validation (k=5) for all three models...")

# --- Calculate scores for each model ---

# We use 'f1_macro' scoring to get a balanced F1 score across all 6 categories
svm_scores = cross_val_score(svm_pipeline, X, y, cv=k_folds, scoring='f1_macro')
knn_scores = cross_val_score(knn_pipeline, X, y, cv=k_folds, scoring='f1_macro')
log_scores = cross_val_score(logistic_pipeline, X, y, cv=k_folds, scoring='f1_macro') # Calculate Logistic scores

print(f"\nSVM F1-Macro Scores per fold:      {svm_scores}")
print(f"KNN F1-Macro Scores per fold:      {knn_scores}")
print(f"Logistic Reg F1-Macro Scores per fold: {log_scores}")


# --- Final Results Comparison (Average F1-Macro Score) ---

print("\n--- Final Results Comparison (Average F1-Macro Score) ---")
print(f"SVM Average F1-Score:           {np.mean(svm_scores):.4f} (+/- {np.std(svm_scores) * 2:.4f} range)")
print(f"KNN Average F1-Score:           {np.mean(knn_scores):.4f} (+/- {np.std(knn_scores) * 2:.4f} range)")
print(f"Logistic Reg Average F1-Score:  {np.mean(log_scores):.4f} (+/- {np.std(log_scores) * 2:.4f} range)")


Running K-Fold Cross-Validation (k=5) for all three models...





SVM F1-Macro Scores per fold:      [0.98814298 1.         0.99410618 1.         0.99548311]
KNN F1-Macro Scores per fold:      [0.98819444 0.9940085  0.97645022 1.         1.        ]
Logistic Reg F1-Macro Scores per fold: [0.98819444 1.         0.99410618 1.         0.99548311]

--- Final Results Comparison (Average F1-Macro Score) ---
SVM Average F1-Score:           0.9955 (+/- 0.0088 range)
KNN Average F1-Score:           0.9917 (+/- 0.0176 range)
Logistic Reg Average F1-Score:  0.9956 (+/- 0.0088 range)


***Interpretation:***
<br />
1) Logistic Regression wins technically, but the SVM is a virtually identical tie. Their average scores are almost the same (0.9956 vs 0.9955), and their stability (range) is identical.
<br />
2) KNN performs slightly worse on average and is less stable (a wider range), meaning it's slightly more prone to overfitting on specific folds of data.

## SAVE MODEL

In [74]:
import warnings
warnings.filterwarnings('ignore')

In [75]:
import joblib
joblib.dump(clf, '../models/LR_model.pkl')

['LR_model.pkl']

In [76]:
joblib.dump(svc_model, '../models/SVM_model.pkl')

['models/SVM_model.pkl']

In [77]:
joblib.dump(clf, '../models/LR_model.joblib')

['../models/LR_model.joblib']