Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


**Collecting the Data**

In [None]:
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast cancer dataset as a dict-like Bunch.
breast_cancer_data=load_breast_cancer()
# Show only the available keys: echoing the whole Bunch prints every array and
# the full description text, which buries the rest of the notebook.
breast_cancer_data.keys()

**Separating the breast cancer data from the file**

In [None]:
# Pull the raw feature matrix out of the loaded dataset and echo it.
bc_data=breast_cancer_data['data']
bc_data

**Separating the feature names of the breast cancer data from the file**

In [None]:
# Feature names converted from a NumPy array to a plain Python list.
features_name=breast_cancer_data['feature_names'].tolist()
features_name

**Target Variable (class names)**

In [None]:
# Class labels of the target as stored under 'target_names'
# (the label names, not the per-sample target values).
target=breast_cancer_data.target_names
target

Converting the whole data into a DataFrame

In [None]:
# Build one frame holding the features plus a trailing 'target' column:
# np.c_ glues the target vector onto the feature matrix column-wise, and
# np.append extends the feature names with the extra column label.
df=pd.DataFrame(
    data=np.c_[breast_cancer_data['data'],breast_cancer_data['target']],
    columns=np.append(breast_cancer_data['feature_names'],['target']),
)

In [None]:
# Preview the first rows of the assembled frame (features + target).
df.head()

In [None]:
# Preview the last rows of the assembled frame.
df.tail()

In [None]:
# Column dtypes and non-null counts -- a quick check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
df.describe()

**Exploratory data visualization with matplotlib and seaborn**

In [None]:
# Heatmap of the pairwise correlations between all columns (features and target).
# The raw frame has one row per sample, so annotating it cell by cell is
# unreadable and slow; the correlation matrix is the informative view.
fig,ax=plt.subplots(figsize=(20,16))
sns.heatmap(df.corr(),annot=True,fmt='.1f',cmap='coolwarm',ax=ax)
ax.set_title('Correlation between features and target')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing.
plt.show()

In [None]:
# lmplot needs explicit x and y columns: called with only `data` it has nothing
# to regress. The first two feature columns are used here as an example pair
# (swap in any two column names); points are coloured by the target class.
sns.lmplot(data=df,x=df.columns[0],y=df.columns[1],hue='target')
plt.show()

In [None]:

# catplot is a figure-level function: it always creates its own figure, so a
# separate plt.figure(figsize=...) only produced an extra empty plot. The 9x9
# size is requested through height/aspect instead.
grid=sns.catplot(data=df,height=9,aspect=1)
# One tick per column -- rotate the labels so the feature names do not overlap.
grid.set_xticklabels(rotation=90)
plt.show()

**Preprocessing steps**

In [None]:
# Feature matrix: every column except the label.
x=df.drop(columns=['target'])
x

In [None]:
# Label vector: the 'target' column on its own.
y=df.loc[:,'target']
y

**Scikit Learn Library**

In [None]:
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# 70/30 hold-out split; the fixed seed keeps the partition identical between runs.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=50,
)

**Feature Scaling**

In [None]:
# Learn the scaling statistics (mean / std) from the training split only and
# reuse them for the test split. Calling fit_transform on the test data would
# re-fit the scaler on it, leaking test information and putting the two splits
# on different scales.
sc=StandardScaler()
x_train_std=sc.fit_transform(x_train)
x_test_std=sc.transform(x_test)

**Model Training on RandomForest model**

In [None]:
# Random forest trained on the unscaled features.
# random_state pins the forest's internal randomness so the accuracy below --
# and the model that is pickled at the end of the notebook -- is reproducible.
r_f=RandomForestClassifier(random_state=50)
r_f.fit(x_train,y_train)
y_predict=r_f.predict(x_test)
accuracy_score(y_test,y_predict)


**Support Vector model**

In [None]:
# Baseline support vector classifier fitted on the raw (unscaled) features.
# The prediction variable keeps its existing name even though no scaling is
# involved in this cell.
r_r=SVC().fit(x_train,y_train)
y_predict_std=r_r.predict(x_test)
accuracy_score(y_true=y_test,y_pred=y_predict_std)

Support Vector model on scaled data

In [None]:
# Same classifier, this time trained and evaluated on the standardized features.
r_r=SVC().fit(x_train_std,y_train)
y_predict_std=r_r.predict(x_test_std)
accuracy_score(y_true=y_test,y_pred=y_predict_std)

Training the Logistic Regression model

In [None]:
# Logistic regression fitted on the standardized training features and scored
# on the standardized test features.
l_r=LogisticRegression().fit(x_train_std,y_train)
l_r_y_predict=l_r.predict(x_test_std)
accuracy_score(y_true=y_test,y_pred=l_r_y_predict)

**Training the K-Nearest Neighbors model**

In [None]:
# k-nearest-neighbours with default settings, trained on the unscaled features.
# It can also be trained on the scaled data (x_train_std / x_test_std).
k_n=KNeighborsClassifier().fit(x_train,y_train)
k_n_y_predict=k_n.predict(x_test)
accuracy_score(y_true=y_test,y_pred=k_n_y_predict)

**Training the Naive Bayes model**

In [None]:
# Gaussian naive Bayes fitted on the scaled data.
# To try the unscaled variant, fit on x_train and predict on x_test instead.
g_c=GaussianNB().fit(x_train_std,y_train)
g_c_y_predict=g_c.predict(x_test_std)
accuracy_score(y_true=y_test,y_pred=g_c_y_predict)

Training the Decision Tree model

In [None]:
# Decision tree trained on the unscaled features.
# random_state fixes the tree's internal randomness so the fitted tree and its
# accuracy are the same on every run.
d_t_c=DecisionTreeClassifier(random_state=50)
d_t_c.fit(x_train,y_train)
#d_t_c.fit(x_train_std,y_train) #training on scaled data
d_t_c_y_predict=d_t_c.predict(x_test)
accuracy_score(y_test,d_t_c_y_predict)

**XGboost Classifier model training**

In [None]:
# Gradient-boosted trees (XGBoost defaults) trained on the scaled data.
# To try the unscaled variant, fit on x_train and predict on x_test instead.
x_g_c=XGBClassifier()
x_g_c.fit(X=x_train_std,y=y_train)
x_g_c_y_predict=x_g_c.predict(x_test_std)
accuracy_score(y_true=y_test,y_pred=x_g_c_y_predict)

**Parameter tuning of the highest-accuracy model**

In [None]:
# Candidate values for the XGBoost hyper-parameters explored by the search.
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}
# Randomized search samples a subset of the grid above and scores each
# candidate by cross-validated ROC AUC. random_state fixes which combinations
# are sampled; without it best_params_ changes on every run.
params_tung=RandomizedSearchCV(
    x_g_c,
    param_distributions=params,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=3,
    random_state=50,
)
params_tung.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination that scored best during the search.
params_tung.best_params_

In [None]:
# Predict the held-out split with the fitted search object and report accuracy.
tung_predct=params_tung.predict(x_test)
accuracy_score(y_true=y_test,y_pred=tung_predct)

In [None]:
# The estimator configured with the best parameters found by the search.
params_tung.best_estimator_


**Confusion Matrix**

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cm=confusion_matrix(y_test,l_r_y_predict)
# fmt='d' prints the counts as integers; the default format would render
# counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm,annot=True,fmt='d',linewidths=4)
# confusion_matrix puts the true classes on rows and the predictions on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing.
plt.show()

**Classification Report**

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
# The report is a formatted string, so it is printed rather than echoed.
cr=classification_report(y_true=y_test,y_pred=l_r_y_predict)
print(cr)

**Cross-validation score (checking for underfitting or overfitting)**

In [None]:
# 10-fold cross-validation of the XGBoost classifier on the training split.
# The estimator passed in must be the XGBoost model (x_g_c) to match what the
# printed labels report; previously the SVC (r_r) was evaluated here.
# cross_val_score fits a fresh clone per fold, so the earlier fit on x_g_c does
# not influence these scores.
cross_validation = cross_val_score(estimator = x_g_c, X = x_train, y = y_train, cv = 10)
print("Cross validation of XGBoost model = ",cross_validation)
print("Cross validation of XGBoost model (in mean) = ",cross_validation.mean())


In [None]:
# (rows, columns) of the training feature matrix.
x_train.shape

**Saving the model**

In [None]:
import pickle
#saving the model
# The context manager guarantees the file is flushed and closed. pickle.dump
# returns None, so its result is not assigned to anything.
with open('breast_cancer_model.pickle','wb') as model_file:
    pickle.dump(r_f,model_file)
#loading the model
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('breast_cancer_model.pickle','rb') as model_file:
    load_model=pickle.load(model_file)
#predicting the value (the pickled model is the random forest r_f)
r_r_y_predict1=load_model.predict(x_test)
#confusion matrix
print(confusion_matrix(y_test,r_r_y_predict1))
#accuracy score
print('accuracy score is :',accuracy_score(y_test,r_r_y_predict1))