In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.stats import randint as sp_randint

# Load dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Encode categorical variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'].fillna('S', inplace=True)
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Impute missing age values
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])

# Prepare data
X = df[features]
y = df[target]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define model
knn = KNeighborsClassifier()

# Define hyperparameter space
param_dist = {
    'n_neighbors': sp_randint(3, 30),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # 1: Manhattan, 2: Euclidean
    'leaf_size': sp_randint(10, 50)
}

# Random search with 5-fold CV
random_search = RandomizedSearchCV(knn, param_distributions=param_dist,
                                   n_iter=20, cv=5, scoring='accuracy',
                                   random_state=42, n_jobs=-1)

# Fit model
random_search.fit(X_train, y_train)

# Best model
best_knn = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Cross-validation score of best model
cv_scores = cross_val_score(best_knn, X_train, y_train, cv=5)
print("Cross-validation Accuracy: %.4f ± %.4f" % (cv_scores.mean(), cv_scores.std()))

# Final evaluation on test set
y_pred = best_knn.predict(X_test)
print("\nTest Set Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)


Best Parameters: {'leaf_size': 31, 'n_neighbors': 14, 'p': 1, 'weights': 'uniform'}
Cross-validation Accuracy: 0.8216 ± 0.0208

Test Set Performance:
Accuracy: 0.8100558659217877
Confusion Matrix:
 [[95 10]
 [24 50]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.90      0.85       105
           1       0.83      0.68      0.75        74

    accuracy                           0.81       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

