In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [17]:
label = [
    "M", "UM", "UM", "UM", "M", "M", "UM", "M", "M", "UM",
    "M", "M", "M", "M", "UM", "UM", "UM", "UM", "UM", "M",
    "UM", "M", "UM", "M", "M", "M", "UM", "UM", "M", "M",
    None, None, None, None, None, None, None, None, None, None, None,
    "UM", "UM", "UM", "UM", "M", "M", "M", "M", "UM", "M"]

In [19]:
cluster_proportion_cll = pd.read_csv("../../data/clusters_proportions.csv", index_col = 0)

In [21]:
cluster_proportion_cll["label"] = label

In [29]:
df = cluster_proportion_cll[cluster_proportion_cll['label'].isin(["UM", "M"])]

In [41]:
# Assuming df is your DataFrame
# Separate features (numeric columns) and target (categorical label)
X = df.drop(columns=['label'])  # Drop the label column to get only features
y = df['label']  # Target variable

# Encode the categorical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Convert labels to numeric form

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Normalize feature data for better KNN performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)  # You can tune 'n_neighbors' for better performance
knn.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.6250
              precision    recall  f1-score   support

           M       0.67      0.50      0.57         4
          UM       0.60      0.75      0.67         4

    accuracy                           0.62         8
   macro avg       0.63      0.62      0.62         8
weighted avg       0.63      0.62      0.62         8



In [45]:
# Assuming df is your DataFrame
# Separate features (numeric columns) and target (categorical label)
X = df.drop(columns=['label'])  # Drop the label column to get only features
y = df['label']  # Target variable

# Encode the categorical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Convert labels to numeric form

# Normalize feature data for better KNN performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train KNN classifier on the entire dataset
knn = KNeighborsClassifier(n_neighbors=3)  # You can tune 'n_neighbors' for better performance
knn.fit(X_scaled, y_encoded)

# Predict on the same dataset
y_pred = knn.predict(X_scaled)

# Evaluate the model
accuracy = accuracy_score(y_encoded, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.7500
              precision    recall  f1-score   support

           M       0.76      0.76      0.76        21
          UM       0.74      0.74      0.74        19

    accuracy                           0.75        40
   macro avg       0.75      0.75      0.75        40
weighted avg       0.75      0.75      0.75        40

