In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline


In [None]:
# Read the breast-cancer CSV directly from its GitHub raw URL into a DataFrame
# and show it as the cell output.
url = "https://raw.githubusercontent.com/jarif87/DataSets/main/uci_breast_cancer.csv"
dataset = pd.read_csv(filepath_or_buffer=url)
dataset

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
dataset.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
dataset.describe()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# (n_rows, n_columns) of the raw frame.
dataset.shape

In [None]:
dataset.drop(["id","Unnamed: 32"],axis=1,inplace=True)

In [None]:
dataset.head()

In [None]:
# Pairwise correlation of the numeric features, drawn as an annotated heat map.
# `diagnosis` is only label-encoded in a later cell, so it is excluded here:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently skipped them, so the plotted matrix is the same).
correlation = dataset.select_dtypes(include="number").corr()
plt.figure(figsize=(20,15))
sns.heatmap(correlation,annot=True,fmt=".2f",cmap="jet")
plt.suptitle("THE CORRELATION HEAT MAP OF THE BREAST CANCER DATA")
plt.show()


In [None]:
# Bar chart of the number of samples per diagnosis class; each bar is
# annotated with its count just above the top edge.
fig, axes = plt.subplots(figsize=(15, 10))
sns.countplot(x='diagnosis', data=dataset, palette='Dark2', ax=axes)
for bar in axes.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    axes.annotate(f'{bar_top:.0f}', (bar_centre, bar_top), ha='center', va='bottom', fontsize=12)
fig.suptitle('DIAGNOSIS PLOT')
axes.set_ylabel('Count')
axes.set_xlabel('Diagnosis')
plt.show()

In [None]:
# Replace the diagnosis labels with integer codes, overwriting the column.
# NOTE(review): LabelEncoder assigns codes in sorted class order, so this is
# presumably B -> 0, M -> 1 -- confirm with lableencoder.classes_.
lableencoder=LabelEncoder()
dataset['diagnosis'] = lableencoder.fit_transform(dataset['diagnosis'])
dataset.head()

In [None]:
# Feature matrix: every column except the target.
X=dataset.drop("diagnosis",axis=1)
# Target vector: the encoded diagnosis column.
y=dataset["diagnosis"]

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions the same in both splits, so the
# test-set metrics are not skewed by an unlucky draw on this small dataset.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)


In [None]:
# Training features.
X_train

In [None]:
# Training labels.
y_train

In [None]:
# Held-out test features.
X_test

In [None]:
# Held-out test labels.
y_test

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
classification_rep = classification_report(y_test, y_pred)
classification_rep

In [None]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
X, y = make_blobs(n_samples=100, centers=2, n_features=100, cluster_std=20)
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
model = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 1)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
classification_rep = classification_report(y_test, y_pred)
classification_rep

In [None]:
model = KNeighborsClassifier(n_neighbors = 100, metric = 'minkowski', p = 1)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm=print(cm)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# Hyperparameter search for KNN on the breast-cancer training split. Fitting
# on X_train / y_train (rather than on generated data assigned to X, y) makes
# the scores describe this dataset and leaves the real X, y intact.
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): 'minkowski' with the default p=2 is the same distance as
# 'euclidean', so those two grid entries should score identically.
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
# 10-fold stratified CV repeated 3 times; fixed seed so folds are reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
model = SVC(kernel = 'linear', random_state = 0)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Hyperparameter search for the SVC on the breast-cancer training split.
# The data is named explicitly (X_train / y_train) so the search does not
# depend on whatever an earlier cell last assigned to X, y.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
grid = dict(kernel=kernel,C=C,gamma=gamma)
# 10-fold stratified CV repeated 3 times; fixed seed so folds are reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
model = SVC(kernel = 'rbf', random_state = 0)
model.fit(X_train, y_train)


In [None]:
y_pred=model.predict(X_test)


In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
model = SVC(kernel = 'sigmoid', random_state = 0)
model.fit(X_train, y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_pred,y_test)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
model.fit(X_train, y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_pred,y_test)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
model = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
y_pred

In [None]:
cm=confusion_matrix(y_pred,y_test)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.05,
                                 random_state=100,
                                 max_features=5 )
gbc.fit(X_train,y_train)

In [None]:
y_pred=gbc.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_pred,y_test)
cm

In [None]:
accuracy_score(y_pred,y_test)

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with library-default hyperparameters, fitted on the
# training split.
model= XGBClassifier()
model.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
# Confusion matrix called as (y_true, y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Fraction of test samples classified correctly.
accuracy_score(y_test, y_pred)

In [None]:
model=AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
accuracy_score(y_pred,y_test)

THE HIGHEST ACCURACY IS ACHIEVED BY THE SVM MODEL WITH HYPERPARAMETER TUNING (WITH N_SAMPLES SET TO 100), SO THE SVM MODEL IS THE MOST SUITABLE FOR PREDICTION ON THIS BREAST CANCER DATASET.