In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA

# Load dataset
file_path = 'data (2).csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
data = data.drop(columns=['id', 'Unnamed: 32'])

# Map target values to binary (M = 1, B = 0)
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Split dataset into features and target
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Models to evaluate
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB()
}

# Dictionary to store the results
results = {}

# Train, predict and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the test set

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Store the results
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }

# Print the results
for model, metrics in results.items():
    print(f"Results for {model}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Re-train models with PCA-applied data
print("\nResults after PCA:")
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred_pca)
    precision = precision_score(y_test, y_pred_pca)
    recall = recall_score(y_test, y_pred_pca)
    f1 = f1_score(y_test, y_pred_pca)

    # Print the results after PCA
    print(f"\nResults for {name} after PCA:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Results for Decision Tree:
  Accuracy: 0.9240
  Precision: 0.8788
  Recall: 0.9206
  F1-Score: 0.8992
Results for KNN:
  Accuracy: 0.9591
  Precision: 0.9828
  Recall: 0.9048
  F1-Score: 0.9421
Results for Random Forest:
  Accuracy: 0.9708
  Precision: 0.9833
  Recall: 0.9365
  F1-Score: 0.9593
Results for Naive Bayes:
  Accuracy: 0.9415
  Precision: 0.9344
  Recall: 0.9048
  F1-Score: 0.9194

Results after PCA:

Results for Decision Tree after PCA:
  Accuracy: 0.9240
  Precision: 0.8676
  Recall: 0.9365
  F1-Score: 0.9008

Results for KNN after PCA:
  Accuracy: 0.9591
  Precision: 0.9516
  Recall: 0.9365
  F1-Score: 0.9440

Results for Random Forest after PCA:
  Accuracy: 0.9532
  Precision: 0.9365
  Recall: 0.9365
  F1-Score: 0.9365

Results for Naive Bayes after PCA:
  Accuracy: 0.9181
  Precision: 1.0000
  Recall: 0.7778
  F1-Score: 0.8750
