<a href="https://colab.research.google.com/github/AI-fanatic24/Breast-Cancer-Classification/blob/Diagnostic-model/Diagnostic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder

In [None]:
pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id=17 through the ucimlrepo client
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Features and targets (as pandas dataframes); they are joined into a single frame in the next cell
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

# Dataset-level metadata
print(breast_cancer_wisconsin_diagnostic.metadata)

# Per-variable information
print(breast_cancer_wisconsin_diagnostic.variables)


In [None]:
# Place features and target side by side; the target ends up as the last column.
df = pd.concat((X, y), axis="columns")
df

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

**CHECKING PRESENCE OF NULL VALUES**

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Visual check for missing values: each cell of the frame is drawn by its null flag.
sns.set(rc={"figure.figsize": (15, 10)})
sns.heatmap(df.isna())
plt.title(
    "HEATMAP OF NULL VALUES",
    fontdict={"family": "serif", "fontsize": 15, "color": "black", "fontweight": "bold"},
)
plt.show()

**VISUALIZING OUTLIERS**

In [None]:
# One box plot per feature on a 6x5 grid; the last column (the target) is skipped.
sns.set(rc={"figure.figsize": (20, 20)})
for slot, feature_name in enumerate(df.columns[:-1], start=1):
    plt.subplot(6, 5, slot)
    sns.boxplot(x=df[feature_name])

plt.tight_layout()

plt.show()

In [None]:
# One 20-bin histogram with a KDE overlay per feature on a 6x5 grid; the target column is skipped.
sns.set(rc={"figure.figsize": (20, 20)})
for slot, feature_name in enumerate(df.columns[:-1], start=1):
    plt.subplot(6, 5, slot)
    sns.histplot(x=df[feature_name], bins=20, kde=True)

plt.tight_layout()

plt.show()

In [None]:
columns = df.columns
print("----MEDIAN OF FEATURES----")
# One "name : median" line per feature; the last column (the target) is excluded.
for feature_name in columns[:-1]:
    print(feature_name, ":", df[feature_name].median())

**LABEL ENCODING THE TARGET VARIABLE**

In [None]:
#Label Encoding as:
# 1 = Malignant
# 0 = Benign
# LabelEncoder assigns integer codes in sorted order of the distinct labels, so the
# mapping above relies on the benign label sorting before the malignant one
# (presumably 'B' < 'M' -- confirm with le.classes_).

le = LabelEncoder()
# Overwrites the raw labels in df with their integer codes.
df['Diagnosis'] = le.fit_transform(df['Diagnosis'])
df

In [None]:
# Number of rows per encoded class.
df['Diagnosis'].value_counts()

In [None]:
# Title styling shared by the count plots in this notebook.
fontdict = {'family':'serif','fontsize':10,'color':'black','fontweight':'bold'}

In [None]:
sns.set(rc={'figure.figsize':(5,4)})
# Bar chart of the class counts over the whole dataset.
sns.countplot(df,x='Diagnosis')
plt.title("COUNT OF MALIGNANT (CLASS 1) AND BENIGN (CLASS 0) TUMORS",fontdict=fontdict)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
# (features plus the encoded target).
sns.set(rc={"figure.figsize": (30, 30)})
corr = df.corr()
sns.heatmap(corr, cmap="coolwarm", annot=True)
plt.title(
    "CORRELATION MAP",
    fontdict={"family": "serif", "fontsize": 40, "color": "black", "fontweight": "bold"},
)
plt.show()

In [None]:
# Column names; the pair plots below select their feature groups from these.
df.columns

**VISUALIZING TRENDS OF DIAGNOSIS ACROSS VARIOUS FEATURES**

In [None]:
sns.set(rc={"figure.figsize": (40, 40)})
# Scatter matrix of the features suffixed "1", coloured by diagnosis class.
sns.pairplot(
    df[[
        'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
        'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
        'fractal_dimension1',
        'Diagnosis',
    ]],
    hue='Diagnosis',
)
plt.show()

In [None]:
sns.set(rc={"figure.figsize": (40, 40)})
# Scatter matrix of the features suffixed "2", coloured by diagnosis class.
sns.pairplot(
    df[[
        'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2',
        'compactness2', 'concavity2', 'concave_points2', 'symmetry2',
        'fractal_dimension2',
        'Diagnosis',
    ]],
    hue='Diagnosis',
)
plt.show()

In [None]:
sns.set(rc={"figure.figsize": (40, 40)})
# Scatter matrix of the features suffixed "3", coloured by diagnosis class.
sns.pairplot(
    df[[
        'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3',
        'compactness3', 'concavity3', 'concave_points3', 'symmetry3',
        'fractal_dimension3',
        'Diagnosis',
    ]],
    hue='Diagnosis',
)
plt.show()

In [None]:
# Persist the frame (features plus integer-encoded Diagnosis) without the index column.
df.to_csv('breast_cancer_preprocessed.csv',index=False)

In [None]:
# Every column except the last is treated as a feature.
features = df.columns[:-1]
features

In [None]:
# The last column is the prediction target.
target = df.columns[-1]
target

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

**SPLITTING THE DATA INTO 80% TRAINING DATA AND 20% TESTING DATA**

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target so the
# train and test partitions keep the class proportions of the full dataset (the classes
# are imbalanced); random_state pins the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.20,
    random_state=23,
    stratify=df[target],
)
print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

In [None]:
# Re-attach the training labels to the training features for inspection.
train = pd.concat((X_train, y_train), axis="columns")
train

**CHECKING IMBALANCE IN TRAINING DATA**

In [None]:
# Rows per class in the training partition, before any resampling.
train['Diagnosis'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(5,4)})
# Bar chart of the training-partition class counts.
sns.countplot(train,x='Diagnosis')
plt.title("COUNT OF MALIGNANT (CLASS 1) AND BENIGN (CLASS 0) TUMORS IN TRAINING DATA",fontdict=fontdict)
plt.show()

**HANDLING IMBALANCE IN TRAINING DATA THROUGH SMOTE**

In [None]:
# Seed the sampler itself so the synthetic samples are reproducible regardless of the
# state of NumPy's global random generator at the time the resampling cell runs.
smote = SMOTE(random_state=123)

In [None]:
# Seed NumPy's global random generator before resampling.
np.random.seed(123)
# Resample the training partition; X_train / y_train are replaced by the resampled data,
# the test partition is left untouched.
# NOTE(review): resampling runs on unscaled features and before the cross-validation folds
# used later are formed, so synthetic points can land in validation folds. Consider an
# imblearn Pipeline (scaler -> SMOTE -> model) so both steps are fitted inside each fold.
X_train,y_train = smote.fit_resample(X_train,y_train)

In [None]:
# Shapes of the training features / labels after resampling.
print(X_train.shape)
print(y_train.shape)

In [None]:
X_train

In [None]:
# Labels wrapped in a DataFrame so they can be passed to seaborn below.
y_train_df = pd.DataFrame(y_train)
y_train_df

In [None]:
# np.unique returns the distinct labels in sorted order with their counts,
# so values[0] / values[1] are the counts of the smaller / larger label.
labels,values = np.unique(y_train,return_counts=True)
print(labels)
print(values)

In [None]:
sns.set(rc={'figure.figsize':(5,4)})
# Bar chart of the class counts after resampling, followed by the raw counts.
sns.countplot(y_train_df,x='Diagnosis')
plt.title("COUNT OF MALIGNANT (CLASS 1) AND BENIGN (CLASS 0) TUMORS IN TRAINING DATA AFTER SMOTE",fontdict=fontdict)
plt.show()
print()
print("CLASS 0:",values[0])
print("CLASS 1:",values[1])

**STANDARDIZING THE DATA (ZERO MEAN, UNIT VARIANCE)**

In [None]:
# Fit the scaler on the (resampled) training features and transform them in one step.
# The same fitted scaler is applied to the test features further down.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

In [None]:
X_train.shape

In [None]:
# Wrap the scaled training matrix in a DataFrame, restoring the feature names so that
# the displayed frame and the exported CSV carry real headers instead of 0..n-1.
train_data = pd.DataFrame(X_train_scaled, columns=features)
train_data

In [None]:
# Save the scaled training features (no index column).
train_data.to_csv('X_train.csv',index=False)

In [None]:
# Training labels as a single-column DataFrame.
y_train_df = pd.DataFrame(y_train)
y_train_df

In [None]:
# Save the training labels (no index column); rows are in the same order as X_train.csv.
y_train_df.to_csv('y_train.csv',index=False)

In [None]:
# Apply the scaler fitted on the training features to the test features (transform only, no refit).
X_test_scaled = scaler.transform(X_test)
X_test_scaled

In [None]:
X_test_scaled.shape

In [None]:
# Wrap the scaled test matrix in a DataFrame, restoring the feature names (so the exported
# CSV has real headers instead of 0..n-1) and the original row labels of X_test (so the
# frame stays aligned with y_test when inspected side by side).
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)
X_test_scaled_df

In [None]:
# Save the scaled test features (no index column).
X_test_scaled_df.to_csv('X_test.csv',index=False)

In [None]:
# Test labels as a single-column DataFrame.
y_test_df = pd.DataFrame(y_test)

In [None]:
# Save the test labels (no index column); rows are in the same order as X_test.csv.
y_test_df.to_csv('y_test.csv',index=False)

In [None]:
# Rows per class in the held-out test partition (never resampled).
y_test_df.value_counts()

In [None]:
# Bar chart of the test-partition class counts.
sns.countplot(y_test_df,x='Diagnosis')
plt.title("COUNT OF MALIGNANT (CLASS 1) AND BENIGN (CLASS 0) TUMORS IN TESTING DATA",fontdict=fontdict)
plt.show()

**LOGISTIC REGRESSION MODEL**

In [None]:
# Baseline estimator; also used as the template that the grid search clones.
model = LogisticRegression(random_state=12)

# Inverse regularisation strengths to search.
lr_c_values = [0.001,0.01,0.1,1,10,20,40,60,80,100]

# The grid is a list of sub-grids so that only solver/penalty pairs accepted by
# LogisticRegression are tried. A single cross-product of all penalties and solvers
# contains many unsupported pairs (e.g. 'l1' with 'lbfgs'); each of those fits raises,
# is scored as NaN by the search and only adds run time and warnings.
param_grid = [
    # L2 is supported by every solver.
    {'penalty':['l2'],'C':lr_c_values,
     'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']},
    # L1 is supported by liblinear and saga only.
    {'penalty':['l1'],'C':lr_c_values,'solver':['liblinear','saga']},
    # Elastic-net is supported by saga only and requires an explicit l1_ratio.
    {'penalty':['elasticnet'],'C':lr_c_values,'solver':['saga'],'l1_ratio':[0.25,0.5,0.75]},
    # Without a penalty C has no effect, so one value is enough; liblinear does not support it.
    {'penalty':[None],'C':[1],'solver':['lbfgs','newton-cg','newton-cholesky','sag','saga']},
]

In [None]:
model

In [None]:
# Baseline: fit with the settings given at construction, then predict on the data it was trained on.
model.fit(X_train_scaled,y_train)
prediction = model.predict(X_train_scaled)
prediction

In [None]:
# Training accuracy of the baseline.
print(accuracy_score(y_train,prediction))

In [None]:
# Training confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_train,prediction))

In [None]:
# `prediction` is reused: from here on it holds the predictions for the held-out test set.
prediction = model.predict(X_test_scaled)
prediction

In [None]:
# Test accuracy of the baseline.
print(accuracy_score(y_test,prediction))

In [None]:
# Test confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,prediction))

In [None]:
# Stratified 5-fold CV. Shuffling is seeded so the folds, and therefore the grid-search
# scores and the selected hyper-parameters, are reproducible between runs.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
# Exhaustive search over param_grid, scored by accuracy on the folds produced by `cv`.
# refit=True retrains the best configuration on the full training set afterwards.
grid = GridSearchCV(estimator=model,param_grid=param_grid,cv=cv,refit=True,verbose=3,scoring='accuracy')
grid.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameters of the best-scoring candidate.
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
grid.best_score_

In [None]:
# The refitted best estimator; `best_model` is reassigned in each model section of this notebook.
best_model = grid.best_estimator_
best_model

In [None]:
# Tuned logistic regression evaluated on the (resampled, scaled) training set.
print("Training Accuracy")
print(accuracy_score(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Confusion Matrix")
print(confusion_matrix(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Classification Report")
print(classification_report(y_train,best_model.predict(X_train_scaled)))

In [None]:
# Tuned logistic regression evaluated on the held-out test set.
prediction = best_model.predict(X_test_scaled)
prediction

In [None]:
print("Testing Accuracy")
print(accuracy_score(y_test,prediction))

In [None]:
print("Testing Confusion Matrix")
print(confusion_matrix(y_test,prediction))

In [None]:
print("Testing Classification Report")
print(classification_report(y_test,prediction))

SAVING THE LOGISTIC REGRESSION MODEL

In [None]:
import joblib

In [None]:
# Serialize the tuned logistic-regression estimator to disk.
# NOTE(review): the model was trained on standardized features, but the fitted `scaler`
# is not saved in this cell; whoever loads the model needs the same scaler.
joblib.dump(best_model, 'logistic_regression_model.joblib')

In [None]:
# Read the saved artifact back into a separate variable.
loaded_model_logistic = joblib.load('logistic_regression_model.joblib')

**RANDOM FOREST**

In [None]:
# Baseline estimator; also used as the template that the grid search clones.
classifier = RandomForestClassifier(random_state=14)
param_grid = {
    'n_estimators':[20,40,60,80,100],
    # 'log_loss' selects the same split criterion as 'entropy' for classification forests,
    # so listing both would only fit every configuration twice.
    'criterion':['gini','entropy'],
    'max_depth':[1,2,3,4,None],
    # An integer min_samples_split must be at least 2; a value of 1 is rejected by
    # scikit-learn and every candidate using it would fail to fit.
    'min_samples_split':[2,3],
    'min_samples_leaf':[1,2,3],
    'max_features':['sqrt','log2',None],
}

In [None]:
# Baseline: fit with the settings given at construction, then predict on the data it was trained on.
classifier.fit(X_train_scaled,y_train)
prediction = classifier.predict(X_train_scaled)
prediction

In [None]:
# Training accuracy of the baseline.
print(accuracy_score(y_train,prediction))

In [None]:
# Training confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_train,prediction))

In [None]:
# `prediction` is reused: from here on it holds the predictions for the held-out test set.
prediction = classifier.predict(X_test_scaled)
prediction

In [None]:
# Test accuracy of the baseline.
print(accuracy_score(y_test,prediction))

In [None]:
# Test confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,prediction))

In [None]:
# Stratified 5-fold CV. Shuffling is seeded so the folds, and therefore the grid-search
# scores and the selected hyper-parameters, are reproducible between runs.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
# Exhaustive search over param_grid, scored by accuracy on the folds produced by `cv`.
# refit=True retrains the best configuration on the full training set afterwards.
grid = GridSearchCV(estimator=classifier,param_grid=param_grid,cv=cv,refit=True,verbose=3,scoring='accuracy')
grid.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameters of the best-scoring candidate.
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
grid.best_score_

In [None]:
# The refitted best estimator; this replaces the `best_model` of the previous section.
best_model = grid.best_estimator_
best_model

In [None]:
# Tuned random forest evaluated on the (resampled, scaled) training set.
print("Training Accuracy")
print(accuracy_score(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Confusion Matrix")
print(confusion_matrix(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Classification Report")
print(classification_report(y_train,best_model.predict(X_train_scaled)))

In [None]:
# Tuned random forest evaluated on the held-out test set.
prediction = best_model.predict(X_test_scaled)
prediction

In [None]:
print("Testing Accuracy")
print(accuracy_score(y_test,prediction))

In [None]:
print("Testing Confusion Matrix")
print(confusion_matrix(y_test,prediction))

In [None]:
print("Testing Classification Report")
print(classification_report(y_test,prediction))

SAVING THE RANDOM FOREST MODEL

In [None]:
# Serialize the tuned random-forest estimator to disk.
joblib.dump(best_model, 'random_forest_model.joblib')

In [None]:
# Read the saved artifact back into a separate variable.
loaded_model_random_forest = joblib.load('random_forest_model.joblib')

**XGBOOST**

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline XGBoost classifier fitted with the settings given at construction.
xgb = XGBClassifier(random_state=14)
xgb.fit(X_train_scaled,y_train)

In [None]:
# Predictions on the data the baseline was trained on.
prediction = xgb.predict(X_train_scaled)
prediction

In [None]:
# Training accuracy of the baseline.
print(accuracy_score(y_train,prediction))

In [None]:
# Training confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_train,prediction))

In [None]:
# `prediction` is reused: from here on it holds the predictions for the held-out test set.
prediction = xgb.predict(X_test_scaled)
prediction

In [None]:
# Test accuracy of the baseline.
print(accuracy_score(y_test,prediction))

In [None]:
# Test confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,prediction))

In [None]:
# Stratified 5-fold CV. Shuffling is seeded so the folds, and therefore the grid-search
# scores and the selected hyper-parameters, are reproducible between runs.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
# Search space for the XGBoost classifier.
param_grid = {
    'learning_rate':[0.2,0.3,0.4,0.5],
    'gamma':[0,1,4],
    'max_depth':[2,4,6,8],
    'min_child_weight':[0,2,4],
    'subsample':[0.5,1],
    # L2 regularisation weight. The scikit-learn wrapper names this parameter `reg_lambda`;
    # using the wrapper's own name keeps it a regular estimator parameter instead of an
    # extra keyword forwarded to the booster.
    'reg_lambda':[0.2,0.5,1],
}

In [None]:
# Exhaustive search over param_grid, scored by accuracy on the folds produced by `cv`.
# refit=True retrains the best configuration on the full training set afterwards.
grid = GridSearchCV(estimator=xgb,param_grid=param_grid,cv=cv,refit=True,verbose=3,scoring='accuracy')
grid.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameters of the best-scoring candidate.
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
grid.best_score_

In [None]:
# The refitted best estimator; this replaces the `best_model` of the previous section.
best_model = grid.best_estimator_
best_model

In [None]:
# Tuned XGBoost model evaluated on the (resampled, scaled) training set.
print("Training Accuracy")
print(accuracy_score(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Confusion Matrix")
print(confusion_matrix(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Classification Report")
print(classification_report(y_train,best_model.predict(X_train_scaled)))

In [None]:
# Tuned XGBoost model evaluated on the held-out test set.
prediction = best_model.predict(X_test_scaled)
prediction

In [None]:
print("Testing Accuracy")
print(accuracy_score(y_test,prediction))

In [None]:
print("Testing Confusion Matrix")
print(confusion_matrix(y_test,prediction))

In [None]:
print("Testing Classification Report")
print(classification_report(y_test,prediction))

SAVING THE EXTREME GRADIENT BOOSTING (XGB) MODEL

In [None]:
# Serialize the tuned XGBoost estimator to disk.
joblib.dump(best_model, 'xgb_model.joblib')

In [None]:
# Read the saved artifact back into a separate variable.
loaded_model_xgb = joblib.load('xgb_model.joblib')

**SVM**

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline support vector classifier with the library's default settings.
svm = SVC()
svm.fit(X_train_scaled,y_train)

In [None]:
# Predictions on the data the baseline was trained on.
prediction = svm.predict(X_train_scaled)
prediction

In [None]:
# Training accuracy of the baseline.
print(accuracy_score(y_train,prediction))

In [None]:
# Training confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_train,prediction))

In [None]:
# `prediction` is reused: from here on it holds the predictions for the held-out test set.
prediction = svm.predict(X_test_scaled)
prediction

In [None]:
# Test accuracy of the baseline.
print(accuracy_score(y_test,prediction))

In [None]:
# Test confusion matrix (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,prediction))

In [None]:
# Values shared by the per-kernel sub-grids below.
svm_c_values = [0.5,1,1.5,2,3]
svm_gamma_values = [0.5,1,2,'scale','auto']

# One sub-grid per kernel family, listing only the parameters that kernel actually uses.
# `degree` is ignored by every kernel except 'poly' and `gamma` is ignored by 'linear', so a
# single cross-product of all four lists refits many identical models (400 candidates versus
# 155 here) without exploring anything new.
param_grid = [
    {'kernel':['linear'],'C':svm_c_values},
    {'kernel':['rbf','sigmoid'],'C':svm_c_values,'gamma':svm_gamma_values},
    {'kernel':['poly'],'C':svm_c_values,'gamma':svm_gamma_values,'degree':[1,2,3,4]},
]

In [None]:
# Stratified 5-fold CV. Shuffling is seeded so the folds, and therefore the grid-search
# scores and the selected hyper-parameters, are reproducible between runs.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
# Exhaustive search over param_grid, scored by accuracy on the folds produced by `cv`.
# refit=True retrains the best configuration on the full training set afterwards.
grid = GridSearchCV(estimator=svm,param_grid=param_grid,cv=cv,refit=True,verbose=3,scoring='accuracy')
grid.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameters of the best-scoring candidate.
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that candidate.
grid.best_score_

In [None]:
# The refitted best estimator; this replaces the `best_model` of the previous section.
best_model = grid.best_estimator_
best_model

In [None]:
# Tuned SVM evaluated on the (resampled, scaled) training set.
print("Training Accuracy")
print(accuracy_score(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Confusion Matrix")
print(confusion_matrix(y_train,best_model.predict(X_train_scaled)))

In [None]:
print("Training Classification Report")
print(classification_report(y_train,best_model.predict(X_train_scaled)))

In [None]:
# Tuned SVM evaluated on the held-out test set.
prediction = best_model.predict(X_test_scaled)
prediction

In [None]:
print("Testing Accuracy")
print(accuracy_score(y_test,prediction))

In [None]:
print("Testing Confusion Matrix")
print(confusion_matrix(y_test,prediction))

In [None]:
print("Testing Classification Report")
print(classification_report(y_test,prediction))

SAVING THE SUPPORT VECTOR MACHINE (SVM) MODEL

In [None]:
# Serialize the tuned SVM estimator to disk.
joblib.dump(best_model, 'svm_model.joblib')

In [None]:
# Read the saved artifact back into a separate variable.
loaded_model_svm = joblib.load('svm_model.joblib')

**OUTCOME:**
**The Extreme Gradient Boosting (XGB) model gives the best performance.**