In [4]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
import time
import warnings

# Suppress the FutureWarning related to 'n_init' in KMeans
warnings.filterwarnings("ignore", category=FutureWarning)

print("1. Required libraries imported.")

# Load data
print("2. Loading data...")
data = pd.read_csv("train.csv")
print("Data loaded successfully.")

# Split features and target variables
print("3. Splitting features and target variables...")
X = data.drop(columns=['id', 'target'])
y = data['target']
print("Features and target variables split successfully.")

# Split the data into training and testing sets
print("4. Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets successfully.")

# Feature scaling
print("5. Scaling features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Feature scaling completed.")

# K-means clustering
print("6. Performing K-means clustering...")
# Explicitly set 'n_init' to suppress the FutureWarning
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
start_time = time.time()
kmeans.fit(X_train_scaled)
end_time = time.time()
training_time_kmeans = end_time - start_time
print("K-means clustering completed.")

# Predict cluster labels
print("7. Predicting cluster labels...")
train_clusters = kmeans.predict(X_train_scaled)
test_clusters = kmeans.predict(X_test_scaled)
print("Cluster labels predicted.")

# Calculate AUC scores using cluster labels
print("8. Calculating AUC scores...")
auc_kmeans_train = roc_auc_score(y_train, train_clusters)
auc_kmeans_test = roc_auc_score(y_test, test_clusters)
print(f"Train AUC Score: {auc_kmeans_train:.4f}")
print(f"Test AUC Score: {auc_kmeans_test:.4f}")
print(f"Training Time (seconds): {training_time_kmeans:.4f}")

1. Required libraries imported.
2. Loading data...
Data loaded successfully.
3. Splitting features and target variables...
Features and target variables split successfully.
4. Splitting the data into training and testing sets...
Data split into training and testing sets successfully.
5. Scaling features...
Feature scaling completed.
6. Performing K-means clustering...
K-means clustering completed.
7. Predicting cluster labels...
Cluster labels predicted.
8. Calculating AUC scores...
Train AUC Score: 0.5835
Test AUC Score: 0.5893
Training Time (seconds): 5.0377
