In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
import warnings

# To ignore all warnings, set the filter to "ignore"
warnings.filterwarnings("ignore")

In [3]:
data_file = "./breast+cancer+wisconsin+original/breast-cancer-wisconsin.data"


In [4]:
column_names = ["Sample Code", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]

In [5]:
data = pd.read_csv(data_file, names=column_names)

In [6]:
data.head()

Unnamed: 0,Sample Code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [7]:
data = data.drop(columns=[column_names[0]])

In [8]:
null_counts = data.isin(['?']).sum()

# Print the counts of missing values for each column
print("Missing Value Counts:")
print(null_counts)

Missing Value Counts:
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64


In [9]:
# Replace '?' with NaN in the entire DataFrame
data = data.replace('?', np.nan)

In [10]:
# Remove rows containing NaN values (including rows with '?')
data_cleaned = data.dropna()

In [11]:
data.shape

(699, 10)

In [12]:
data_cleaned.shape

(683, 10)

In [13]:
X = data_cleaned.drop("Class", axis=1)
y = data_cleaned["Class"]

In [14]:
X.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [15]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [16]:
# Split the data into a 60:40 ratio for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=13, stratify=y)


## KNN

In [17]:
# Create a k-NN classifier with a specified value of k (e.g., k=3)
k = 3
knn_classifier = KNeighborsClassifier(n_neighbors=k)


In [18]:
# Perform k-fold cross-validation ( k=5)

n_splits = 5
num_folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)
cv_scores = cross_val_score(knn_classifier, X, y, cv=num_folds, scoring='accuracy')

# Print cross-validation results
meanAcc = np.mean(cv_scores)
maxAcc = 0
for i, score in enumerate(cv_scores):
    if maxAcc < score:
        maxAcc = score
        
percent = (meanAcc-maxAcc) * 100

print(f"Cross-Validation Scores (Accuracy): {cv_scores}")
print(f"Mean Accuracy: {meanAcc}")
print(f"Standard Deviation: {np.std(cv_scores)}")

Cross-Validation Scores (Accuracy): [0.97080292 0.97810219 0.97080292 0.98529412 0.97058824]
Mean Accuracy: 0.9751180764276514
Standard Deviation: 0.005834678450946536


In [19]:
knn_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9708029197080292
Classification Report:
              precision    recall  f1-score   support

           2       0.98      0.97      0.98       178
           4       0.95      0.97      0.96        96

    accuracy                           0.97       274
   macro avg       0.97      0.97      0.97       274
weighted avg       0.97      0.97      0.97       274



## LogisticRegression

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the Breast Cancer Wisconsin dataset or your own dataset
# Replace 'data_file.csv' with your dataset file path
data = data_cleaned

# Separate data into features (X) and target labels (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=13, stratify=y)

# Create a Logistic Regression classifier
logistic_classifier = LogisticRegression( max_iter=500)

# Define a dictionary of hyperparameters and their possible values
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga','newton-cholesky','sag'],
    'class_weight': [None, 'balanced']
}


In [21]:
# Create a GridSearchCV object with the logistic classifier and hyperparameter grid
grid_search = GridSearchCV(logistic_classifier, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)

Best Hyperparameters: {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'saga'}
Best Cross-Validated Accuracy: 0.9681722372779283


In [22]:

# Create a Logistic Regression classifier
logistic_classifier = LogisticRegression(C= 0.01, class_weight= None, penalty= "l2", solver= 'saga')

# Perform k-fold cross-validation ( k=5)
n_splits = 5
num_folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)
cv_scores = cross_val_score(logistic_classifier, X, y, cv=num_folds, scoring='accuracy')

# Print cross-validation results
meanAcc = np.mean(cv_scores)
maxAcc = 0
for i, score in enumerate(cv_scores):
    if maxAcc < score:
        maxAcc = score
        
percent = (meanAcc-maxAcc) * 100

print(f"Cross-Validation Scores (Accuracy): {cv_scores}")
print(f"Mean Accuracy: {meanAcc}")
print(f"Standard Deviation: {np.std(cv_scores)}")

Cross-Validation Scores (Accuracy): [0.96350365 0.95620438 0.97810219 0.96323529 0.95588235]
Mean Accuracy: 0.963385573207385
Standard Deviation: 0.008055844906130632


In [23]:
# Fit the classifier to the training data
logistic_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = logistic_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9708029197080292
Classification Report:
              precision    recall  f1-score   support

           2       0.98      0.98      0.98       178
           4       0.96      0.96      0.96        96

    accuracy                           0.97       274
   macro avg       0.97      0.97      0.97       274
weighted avg       0.97      0.97      0.97       274



## SVC

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset or your own dataset
# Replace 'data_file.csv' with your dataset file path
data = data_cleaned

# Separate data into features (X) and target labels (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=13, stratify=y)

# Create an SVM classifier
svc_classifier = SVC()  # You can choose different kernels such as 'linear', 'rbf', or 'poly'

# Define a dictionary of hyperparameters and their possible values
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01, 0.1, 1, 'scale', 'auto'],
    'class_weight': ['balanced', None],
    'probability': [True, False]
}

In [25]:
# Create a GridSearchCV object with the SVC classifier and hyperparameter grid
grid_search = GridSearchCV(svc_classifier, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)

Best Hyperparameters: {'C': 0.01, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'linear', 'probability': True}
Best Cross-Validated Accuracy: 0.970581150255947


In [26]:
svc_classifier = SVC(C=0.01, class_weight = 'balanced', gamma = 0.01, kernel='linear', probability= True)
# Perform 5-fold cross-validation on the training data
num_folds = 5
cv_scores = cross_val_score(svc_classifier, X, y, cv=num_folds)

# Print cross-validation results
meanAcc = np.mean(cv_scores)
maxAcc = 0
for i, score in enumerate(cv_scores):
    if maxAcc < score:
        maxAcc = score
        
percent = (meanAcc-maxAcc) * 100

print(f"Cross-Validation Scores (Accuracy): {cv_scores}")
print(f"Mean Accuracy: {meanAcc}")
print(f"Standard Deviation: {np.std(cv_scores)}")


Cross-Validation Scores (Accuracy): [0.95620438 0.94890511 0.98540146 0.98529412 0.97794118]
Mean Accuracy: 0.9707492486045514
Standard Deviation: 0.015275365563899886


 class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)

In [27]:
# Fit the classifier to the training data
svc_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svc_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9817518248175182
Classification Report:
              precision    recall  f1-score   support

           2       1.00      0.97      0.99       178
           4       0.95      1.00      0.97        96

    accuracy                           0.98       274
   macro avg       0.98      0.99      0.98       274
weighted avg       0.98      0.98      0.98       274



In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
import numpy as np

# Create an instance of the Logistic Regression Classifier
logistic_model = LogisticRegression()

# Specify the number of cross-validation folds (K) - e.g., 5-fold cross-validation
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)

# Define the hyperparameter grid for RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga'],  # Algorithm to use in the optimization problem
    'max_iter': [100, 200, 300],  # Maximum number of iterations for the solver
    'random_state': [13]  # Random seed for reproducibility
}

# Create RandomizedSearchCV with the Logistic Regression model and parameter grid
random_search = RandomizedSearchCV(
    logistic_model,
    param_distributions=param_dist,
    n_iter=500,  # Number of random combinations to try
    cv=cv,
    scoring='accuracy',
    random_state=13,
    n_jobs=-1  # Use all available CPU cores for parallel processing
)

# Perform Randomized Search to find the best hyperparameters
random_search.fit(X, y)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the Logistic Regression model with the best hyperparameters
best_logistic_model = LogisticRegression(**best_params)

# Perform K-fold cross-validation and get the classification scores
scores = cross_val_score(best_logistic_model, X, y, cv=cv, scoring='accuracy')

# Print the accuracy for each fold
print("Cross-Validation Accuracy Scores:")
for i, score in enumerate(scores):
    print(f"Fold {i+1}: {score:.6f}")

# Calculate and print the mean accuracy and standard deviation
mean_accuracy = np.mean(scores)
std_accuracy = np.std(scores)
print(f"Mean Accuracy: {mean_accuracy:.6f}")
print(f"Standard Deviation: {std_accuracy:.6f}")


In [None]:
from sklearn.model_selection import cross_val_predict


# Initialize lists to store precision and recall values for each fold
precision_per_fold = []
recall_per_fold = []

# Perform K-fold cross-validation and get the predicted labels for each fold
predicted_labels = cross_val_predict(best_logistic_model, X, y, cv=cv)

# Calculate and print accuracy, precision, and recall for each fold and class
print("Cross-Validation Metrics:")
for i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    accuracy = accuracy_score(y[test_idx], predicted_labels[test_idx])
    precision = precision_score(y[test_idx], predicted_labels[test_idx], average=None)
    recall = recall_score(y[test_idx], predicted_labels[test_idx], average=None)
    
    print(f"Fold {i + 1}:")
    print(f"  Accuracy: {accuracy:.6f}")
    
    for class_idx, (prec, rec) in enumerate(zip(precision, recall)):
        print(f"  Class {class_idx + 1}:")
        print(f"    Precision: {prec:.6f}")
        print(f"    Recall: {rec:.6f}")
        print()
    
    # Append precision and recall values for this fold to the lists
    precision_per_fold.append(precision)
    recall_per_fold.append(recall)

# Calculate and print the mean accuracy, precision, and recall across all folds
mean_accuracy = np.mean(scores)

# Calculate mean precision and recall for each class
mean_precision_per_class = np.mean(precision_per_fold, axis=0)
mean_recall_per_class = np.mean(recall_per_fold, axis=0)

# Calculate standard deviation of accuracy, precision, and recall across folds
std_accuracy = np.std(scores)
std_precision_per_class = np.std(precision_per_fold, axis=0)
std_recall_per_class = np.std(recall_per_fold, axis=0)

print("Mean Metrics:")
print(f"Mean Accuracy: {mean_accuracy:.6f}")

for class_idx, (mean_prec, mean_rec) in enumerate(zip(mean_precision_per_class, mean_recall_per_class)):
    print(f"Class {class_idx + 1}:")
    print(f"  Mean Precision: {mean_prec:.6f}")
    print(f"  Mean Recall: {mean_rec:.6f}")
    print()

print("Standard Deviation Metrics:")
print(f"Standard Deviation Accuracy: {std_accuracy:.6f}")

for class_idx, (std_prec, std_rec) in enumerate(zip(std_precision_per_class, std_recall_per_class)):
    print(f"Class {class_idx + 1}:")
    print(f"  Standard Deviation Precision: {std_prec:.6f}")
    print(f"  Standard Deviation Recall: {std_rec:.6f}")


In [None]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
import numpy as np

# Create an instance of the SVC Classifier
svc_model = SVC()

# Specify the number of cross-validation folds (K) - e.g., 5-fold cross-validation
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)

# Define the hyperparameter grid for RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (only for 'poly' kernel)
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1, 10],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels
    'shrinking': [True, False],  # Whether to use the shrinking heuristic
    'class_weight': [None, 'balanced'],  # Weights associated with classes
    'decision_function_shape': ['ovr', 'ovo'],  # Decision function shape
    'random_state': [42]  # Random seed for reproducibility
}

# Create RandomizedSearchCV with the SVC model and parameter grid
random_search = RandomizedSearchCV(
    svc_model,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to try
    cv=cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Use all available CPU cores for parallel processing
)

# Perform Randomized Search to find the best hyperparameters
random_search.fit(X, y)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the SVC model with the best hyperparameters
best_svc_model = SVC(**best_params)

# Perform K-fold cross-validation and get the classification scores
scores = cross_val_score(best_svc_model, X, y, cv=cv, scoring='accuracy')

# Print the accuracy for each fold
print("Cross-Validation Accuracy Scores:")
for i, score in enumerate(scores):
    print(f"Fold {i+1}: {score:.6f}")

# Calculate and print the mean accuracy and standard deviation
mean_accuracy = np.mean(scores)
std_accuracy = np.std(scores)
print(f"Mean Accuracy: {mean_accuracy:.6f}")
print(f"Standard Deviation: {std_accuracy:.6f}")
