In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

In [2]:
data_file = "breast-cancer-wisconsin.data"


In [3]:
column_names = ["Sample Code", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]

In [4]:
data = pd.read_csv(data_file, names=column_names)

In [5]:
data.head()

Unnamed: 0,Sample Code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
data = data.drop(columns=[column_names[0]])

In [7]:
null_counts = data.isin(['?']).sum()

# Print the counts of missing values for each column
print("Missing Value Counts:")
print(null_counts)

Missing Value Counts:
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64


In [8]:
# Replace '?' with NaN in the entire DataFrame
data = data.replace('?', np.nan)

In [9]:
# Remove rows containing NaN values (including rows with '?')
data_cleaned = data.dropna()

In [10]:
data.shape

(699, 10)

In [11]:
data_cleaned.shape

(683, 10)

In [12]:
X = data_cleaned.drop("Class", axis=1)
y = data_cleaned["Class"]

In [13]:
# Split the data into a 60:40 ratio for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)


In [14]:
# Create a k-NN classifier with a specified value of k (e.g., k=3)
k = 3
knn_classifier = KNeighborsClassifier(n_neighbors=k)


In [23]:
# Perform k-fold cross-validation ( k=5)

n_splits = 5
num_folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)
cv_scores = cross_val_score(knn_classifier, X, y, cv=num_folds, scoring='accuracy')


In [37]:
# Print cross-validation results
meanAcc = np.mean(cv_scores)
maxAcc = 0
for i, score in enumerate(cv_scores):
    if maxAcc < score:
        maxAcc = score
        
percent = (meanAcc-maxAcc) * 100

print(f"Cross-Validation Scores (Accuracy): {cv_scores}")
print(f"Mean Accuracy: {meanAcc}")
print(f"Max Accuracy: {maxAcc}")
print(f"Standard Deviation: {np.std(cv_scores)}")
print(f"Accracy Percentage: {meanAcc*100} {percent }")


Cross-Validation Scores (Accuracy): [0.97080292 0.97810219 0.97080292 0.98529412 0.97058824]
Mean Accuracy: 0.9751180764276514
Max Accuracy: 0.9852941176470589
Standard Deviation: 0.005834678450946536
Accracy Percentage: 97.51180764276513 -1.0176041219407517


In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset or your own dataset
# Replace 'data_file.csv' with your dataset file path
data = data_cleaned

# Separate data into features (X) and target labels (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, )

# Create a Logistic Regression classifier
logistic_classifier = LogisticRegression()

# Fit the classifier to the training data
logistic_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = logistic_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Perform k-fold cross-validation ( k=5)
n_splits = 5
num_folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)
cv_scores = cross_val_score(knn_classifier, X, y, cv=num_folds, scoring='accuracy')


print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9562043795620438
Classification Report:
              precision    recall  f1-score   support

           2       0.94      0.99      0.96        79
           4       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137



## SVC

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset or your own dataset
# Replace 'data_file.csv' with your dataset file path
data = data_cleaned

# Separate data into features (X) and target labels (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create an SVM classifier
svm_classifier = SVC(kernel='rbf',degree=4)  # You can choose different kernels such as 'linear', 'rbf', or 'poly'

# Perform 5-fold cross-validation on the training data
num_folds = 5
cv_scores = cross_val_score(svm_classifier, X, y, cv=num_folds)

# Print cross-validation results
print(f"Cross-Validation Scores (Accuracy): {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores)}")
print(f"Standard Deviation: {np.std(cv_scores)}")

# Fit the classifier to the entire training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("\nTest Set Performance:")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)


Cross-Validation Scores (Accuracy): [0.91970803 0.94890511 0.97810219 0.98529412 0.98529412]
Mean Accuracy: 0.9634607127522543
Standard Deviation: 0.025666830317824022

Test Set Performance:
Accuracy: 0.9635036496350365
Classification Report:
              precision    recall  f1-score   support

           2       0.96      0.98      0.97       174
           4       0.97      0.93      0.95       100

    accuracy                           0.96       274
   macro avg       0.96      0.96      0.96       274
weighted avg       0.96      0.96      0.96       274

