### BREAST CANCER CLASSIFICATION

In [14]:
#importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [18]:
import pickle 

In [None]:
# Read the dataset from the CSV file in the working directory and preview it
csv_path = 'Breast_cancer_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [3]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(569, 6)

In [4]:
# Count the missing values in each column
data.isna().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [5]:
# Target is the 'diagnosis' column; every other column is used as a feature
Y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Define a list of models to evaluate.
# The tree-based estimators use randomness while training, so they are seeded
# here; without a seed their scores (and therefore the "best model" picked
# later) can change from one run of the notebook to the next.
MODEL_SEED = 42
models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=MODEL_SEED),
    RandomForestClassifier(random_state=MODEL_SEED),
    GaussianNB()
]

In [17]:
# Fit every candidate on the training split and record its test-set accuracy,
# keyed by the estimator's class name
results = {}
for model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[model.__class__.__name__] = accuracy

# Report one line per model
for model_name, accuracy in results.items():
    print(f"{model_name}: Accuracy = {accuracy:.4f}")

# Pick the highest-scoring entry (the first one wins on a tie)
# NOTE(review): the winner is chosen on the same test split it is scored on,
# so the reported best accuracy is likely optimistic.
best_model_name, best_accuracy = max(results.items(), key=lambda entry: entry[1])
print(f"\nBest Model: {best_model_name}, Accuracy = {best_accuracy:.4f}")

LogisticRegression: Accuracy = 0.9298
SVC: Accuracy = 0.9211
KNeighborsClassifier: Accuracy = 0.9386
DecisionTreeClassifier: Accuracy = 0.8947
RandomForestClassifier: Accuracy = 0.9649
GaussianNB: Accuracy = 0.9386

Best Model: RandomForestClassifier, Accuracy = 0.9649


In [20]:

print(f"\nBest Model: {best_model_name}, Accuracy = {best_accuracy:.4f}")

# Find the estimator object in 'models' whose class name matches the winner
for model in models:
    if type(model).__name__ == best_model_name:
        best_model = model
        break
else:
    # Fail loudly instead of silently reusing a stale 'best_model' that may
    # still be sitting in the kernel from an earlier run
    raise ValueError(f"No model named {best_model_name!r} found in 'models'")

# Refit the chosen model on the entire dataset before saving it.
# This refits the same object that is stored in 'models', and the accuracy
# printed above was measured before this refit.
best_model.fit(X, Y)

# Save the best model using pickle; the 'with' block guarantees the file is
# flushed and closed even if pickling raises
filename = 'Breast_Cancer_Classifier.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)


Best Model: RandomForestClassifier, Accuracy = 0.9649
