In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix


In [23]:
# Maps each dataset's short name to the CSV file it is read from
file_paths = {
    dataset: f"{dataset}.csv"
    for dataset in ("first", "second", "third", "fourth")
}


In [24]:
def extract_X_Y(df):
    """Split a dataframe into a feature array and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "x", "y" and "result"; any other
        columns are ignored.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` holds the "x" and "y" columns (in that
        order) as a 2-D array and ``Y`` holds the "result" column as a
        1-D array.
    """
    feature_columns = ["x", "y"]
    features = df[feature_columns]
    target = df["result"]
    return features.values, target.values


In [25]:
def retrain_model(df, classifier_type='knn', test_size=0.4, n_neighbors=3, random_state=42):
    """Train a classifier on a stratified split of ``df`` and print its test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset handed to ``extract_X_Y`` to obtain features and target.
    classifier_type : str, default 'knn'
        'knn' for ``KNeighborsClassifier`` or 'dummy' for a
        ``DummyClassifier`` that always predicts the most frequent class.
    test_size : float, default 0.4
        Passed to ``train_test_split`` as the held-out share of the data.
    n_neighbors : int, default 3
        Number of neighbours used by the KNN classifier; ignored for 'dummy'.
    random_state : int, default 42
        Seed for the train/test split so repeated runs give the same split.

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``classifier_type`` is neither 'knn' nor 'dummy'.
    """
    # Validate the requested classifier first so a typo fails before any work is done
    if classifier_type == 'knn':
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif classifier_type == 'dummy':
        model = DummyClassifier(strategy='most_frequent')
    else:
        raise ValueError("Classifier type must be 'knn' or 'dummy'")

    X, Y = extract_X_Y(df)

    # Stratified split: train and test keep the class proportions of Y
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state
    )

    # Train and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 scores a class that is never predicted (always the case
    # for the dummy baseline) as 0 without emitting UndefinedMetricWarning;
    # the printed values are the same as with the default setting.
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)

    # Evaluation results
    print(f"\n Classifier: {classifier_type.upper()} | Dataset: {len(X)} samples")
    print("Accuracy :", round(accuracy, 3))
    print("Recall   :", round(recall, 3))
    print("Precision:", round(precision, 3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return model


In [26]:
# For every dataset, fit and score KNN first and then the Dummy baseline
for name, path in file_paths.items():
    print(f"\n=== Retraining on {name}.csv ===")
    df = pd.read_csv(path)
    for classifier_type in ('knn', 'dummy'):
        retrain_model(df, classifier_type=classifier_type)



=== Retraining on first.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.885
Recall   : 0.885
Precision: 0.885
Confusion Matrix:
 [[174  23]
 [ 23 180]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.507
Recall   : 0.5
Precision: 0.254
Confusion Matrix:
 [[  0 197]
 [  0 203]]

=== Retraining on second.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.9
Recall   : 0.9
Precision: 0.9
Confusion Matrix:
 [[181  20]
 [ 20 179]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.502
Recall   : 0.5
Precision: 0.251
Confusion Matrix:
 [[201   0]
 [199   0]]

=== Retraining on third.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.963
Recall   : 0.925
Precision: 0.956
Confusion Matrix:
 [[316   4]
 [ 11  69]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.8
Recall   : 0.5
Precision: 0.4
Confusion Matrix:
 [[320   0]
 [ 80   0]]

=== Retraining on fourth.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.955
Recall 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Same evaluation pass as the preceding cell: each dataset is loaded once,
# then scored with KNN followed by the Dummy baseline
for name, path in file_paths.items():
    print(f"\n=== Retraining on {name}.csv ===")
    df = pd.read_csv(path)
    for classifier_type in ('knn', 'dummy'):
        retrain_model(df, classifier_type=classifier_type)



=== Retraining on first.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.885
Recall   : 0.885
Precision: 0.885
Confusion Matrix:
 [[174  23]
 [ 23 180]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.507
Recall   : 0.5
Precision: 0.254
Confusion Matrix:
 [[  0 197]
 [  0 203]]

=== Retraining on second.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.9
Recall   : 0.9
Precision: 0.9
Confusion Matrix:
 [[181  20]
 [ 20 179]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.502
Recall   : 0.5
Precision: 0.251
Confusion Matrix:
 [[201   0]
 [199   0]]

=== Retraining on third.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.963
Recall   : 0.925
Precision: 0.956
Confusion Matrix:
 [[316   4]
 [ 11  69]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.8
Recall   : 0.5
Precision: 0.4
Confusion Matrix:
 [[320   0]
 [ 80   0]]

=== Retraining on fourth.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.955
Recall 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




This section implements a `retrain_model()` function to support user-requested retraining using either a KNN classifier or a Dummy classifier.

Key Features:
- Supports `'knn'` and `'dummy'` classifiers.
- Performs stratified train-test splitting so that the training and test sets keep the same class proportions as the full dataset.
- Automatically loops through all 4 datasets: `first.csv`, `second.csv`, `third.csv`, `fourth.csv`.
- Evaluates performance using:
  - Accuracy
  - Recall (macro-averaged)
  - Precision (macro-averaged)
  - Confusion Matrix

## User-Controlled Online Retraining

This section lets the user decide, through the `user_input` variable, whether retraining should proceed.  
If `user_input` is `"yes"`, the system loops through all datasets and retrains using both the KNN and Dummy classifiers with stratified splits.  
Otherwise, it skips retraining and prints a short notice.



In [29]:
# Set to "no" to skip retraining. Assigned here so the cell also runs on a
# fresh kernel instead of depending on a variable left over from another cell.
user_input = "yes"

# Ignore surrounding whitespace and letter case when reading the answer
if user_input.strip().lower() == "yes":
    for name, path in file_paths.items():
        print(f"\n=== Retraining on {name}.csv ===")

        # Loading and training are guarded separately so that the printed
        # message names the step that actually failed.
        try:
            df = pd.read_csv(path)
        except Exception as e:
            print(f" Error loading {name}.csv:", e)
            continue

        try:
            retrain_model(df, classifier_type='knn')
            retrain_model(df, classifier_type='dummy')
        except Exception as e:
            print(f" Error retraining on {name}.csv:", e)
else:
    print(" Retraining skipped by user request.")



=== Retraining on first.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.885
Recall   : 0.885
Precision: 0.885
Confusion Matrix:
 [[174  23]
 [ 23 180]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.507
Recall   : 0.5
Precision: 0.254
Confusion Matrix:
 [[  0 197]
 [  0 203]]

=== Retraining on second.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.9
Recall   : 0.9
Precision: 0.9
Confusion Matrix:
 [[181  20]
 [ 20 179]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.502
Recall   : 0.5
Precision: 0.251
Confusion Matrix:
 [[201   0]
 [199   0]]

=== Retraining on third.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.963
Recall   : 0.925
Precision: 0.956
Confusion Matrix:
 [[316   4]
 [ 11  69]]

 Classifier: DUMMY | Dataset: 1000 samples
Accuracy : 0.8
Recall   : 0.5
Precision: 0.4
Confusion Matrix:
 [[320   0]
 [ 80   0]]

=== Retraining on fourth.csv ===

 Classifier: KNN | Dataset: 1000 samples
Accuracy : 0.955
Recall 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
