In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Load dataset
df = pd.read_csv("mutagenicity_kNN.csv")

# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Fill missing values only in numeric columns
df.fillna(df.select_dtypes(include=[np.number]).mean(numeric_only=True), inplace=True)

# Encode categorical columns if present
for col in df.select_dtypes(include=['object']).columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Ensure target column is binary
target_col = df.columns[-1]
print(f"Unique target values: {df[target_col].unique()}")

if len(df[target_col].unique()) > 2:
    print("Target variable is not binary. Switching to 'f1_macro' for scoring.")
    scoring_metric = 'f1_macro'
else:
    scoring_metric = 'f1'

# Assume the last column is the target variable (mutagenicity label)
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]    # Target

# Convert target variable to integer type
y = y.astype(int)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Transform training data
X_test = scaler.transform(X_test)  # Transform test data

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': range(1, 21)}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, scoring=scoring_metric, cv=5)

# Ensure no missing values in training set
print("Missing values in X_train:", np.isnan(X_train).sum())
print("Missing values in y_train:", np.isnan(y_train).sum())

# Fit model
grid_search.fit(X_train, y_train)

# Best k value
best_k = grid_search.best_params_['n_neighbors']
print(f"Best k: {best_k}")

# Train final model
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)

# Predictions
y_pred = knn_best.predict(X_test)

# Evaluate model
f1 = f1_score(y_test, y_pred, average='macro')  # Use 'macro' for multi-class
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

print(f"F1-score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Missing values per column:
 Unnamed: 0             0
Id                     0
CAS                    0
SMILES                 0
Status                 0
Experimental value     0
Predicted value        0
NumValenceElectrons    0
qed                    0
TPSA                   0
MolMR                  0
BalabanJ               0
BertzCT                0
MolWt                  0
MolLogP                0
dtype: int64
Unique target values: [2.2482  1.177   1.3004  ... 1.3658  3.84768 4.61982]
Target variable is not binary. Switching to 'f1_macro' for scoring.
Missing values in X_train: 0
Missing values in y_train: 0




Best k: 1
F1-score: 0.2594
Accuracy: 0.5265
Precision: 0.2697
Recall: 0.2557


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
