In [183]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [184]:
# Load the raw breast-cancer table from the local CSV export.
# NOTE(review): the path is absolute and machine-specific; confirm the
# intended project layout before making it relative.
df = pd.read_csv(
    filepath_or_buffer=r"D:\Breast_cancer project\notebook\data\breast_cancer.csv"
)

In [185]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Diagnosis
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [186]:
# Remove the leftover index column that came in with the CSV.
# Rebinding `df` (instead of mutating in place) together with
# errors="ignore" keeps this cell safe to re-run: once the column is gone a
# second execution is a no-op rather than a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [187]:
# Re-check the head now that the "Unnamed: 0" column has been dropped.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [188]:
# (rows, columns) of the frame after the drop.
df.shape

(569, 31)

In [189]:
# Separate the label column from the predictors.
y = df["Diagnosis"]
X = df.drop(columns=["Diagnosis"])

In [190]:
from sklearn.pipeline import Pipeline


In [191]:
# Partition the feature columns by dtype: object-typed columns count as
# categorical, every other dtype as numerical.
# NOTE(review): the visible cells below build the preprocessor from
# `numerical_features` / `categorical_features`, not from these two names.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [192]:
# Column groups fed to the ColumnTransformer: object columns are encoded,
# int64/float64 columns are imputed and scaled.
categorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns

In [193]:
# Numeric branch: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [194]:
# Display the numeric pipeline (imputer -> scaler) for inspection.
numerical_pipeline

In [195]:
# Display the categorical pipeline (imputer -> one-hot) for inspection.
categorical_pipeline

In [196]:
# Route each column group through its own pipeline; output columns are
# prefixed with the branch name ('num' / 'cat').
preprocessor = ColumnTransformer(
    [
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)


In [197]:
# Display the assembled ColumnTransformer for inspection.
preprocessor

In [198]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [199]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transform to the test rows so no test statistics leak into the scaler.
# NOTE: both names are overwritten with the transformed output, so running
# this cell a second time (without re-running the split) would feed the
# already-transformed data back into the preprocessor.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [200]:
# Output column names produced by the fitted preprocessor, as a plain list.
feature_names = preprocessor.get_feature_names_out().tolist()

In [201]:
# Wrap the transformed training matrix in a DataFrame so the columns carry
# the preprocessor's feature names, then preview it.
x_train = pd.DataFrame(data=X_train, columns=feature_names)
x_train.head()

Unnamed: 0,num__mean radius,num__mean texture,num__mean perimeter,num__mean area,num__mean smoothness,num__mean compactness,num__mean concavity,num__mean concave points,num__mean symmetry,num__mean fractal dimension,...,num__worst radius,num__worst texture,num__worst perimeter,num__worst area,num__worst smoothness,num__worst compactness,num__worst concavity,num__worst concave points,num__worst symmetry,num__worst fractal dimension
0,-1.440753,-0.435319,-1.362085,-1.139118,0.780573,0.718921,2.823135,-0.11915,1.092662,2.458173,...,-1.232861,-0.476309,-1.24792,-0.973968,0.722894,1.186732,4.672828,0.932012,2.097242,1.88645
1,1.974096,1.733026,2.091672,1.851973,1.319843,3.426275,2.013112,2.665032,2.127004,1.558396,...,2.173314,1.311279,2.081617,2.137405,0.761928,3.265601,1.928621,2.698947,1.891161,2.497838
2,-1.399982,-1.249622,-1.345209,-1.109785,-1.332645,-0.307355,-0.365558,-0.696502,1.930333,0.954379,...,-1.295284,-1.040811,-1.24522,-0.999715,-1.438693,-0.548564,-0.644911,-0.970239,0.597602,0.057894
3,-0.981797,1.416222,-0.982587,-0.866944,0.05939,-0.596788,-0.820203,-0.845115,0.313264,0.074041,...,-0.829197,1.59353,-0.873572,-0.742947,0.796624,-0.729392,-0.77495,-0.809483,0.798928,-0.134497
4,-1.1177,-1.010259,-1.125002,-0.965942,1.269511,-0.439002,-0.983341,-0.9306,3.394436,0.950213,...,-1.085129,-1.334616,-1.117138,-0.896549,-0.174876,-0.995079,-1.209146,-1.354582,1.033544,-0.205732


In [202]:
# Same call as the earlier feature_names cell; recomputed here before the
# test frame is built.
feature_names = list(preprocessor.get_feature_names_out())

In [203]:
# Wrap the transformed test matrix in a DataFrame with the same named columns.
x_test = pd.DataFrame(data=X_test, columns=feature_names)

In [204]:
# Preview the transformed test frame.
x_test.head()

Unnamed: 0,num__mean radius,num__mean texture,num__mean perimeter,num__mean area,num__mean smoothness,num__mean compactness,num__mean concavity,num__mean concave points,num__mean symmetry,num__mean fractal dimension,...,num__worst radius,num__worst texture,num__worst perimeter,num__worst area,num__worst smoothness,num__worst compactness,num__worst concavity,num__worst concave points,num__worst symmetry,num__worst fractal dimension
0,-0.466497,-0.137289,-0.444211,-0.486465,0.28085,0.041606,-0.111465,-0.264869,0.415241,0.135137,...,-0.263235,-0.147842,-0.331548,-0.351093,0.480019,-0.096496,-0.03583,-0.194351,0.172757,0.20373
1,1.365363,0.498665,1.305511,1.341471,-0.406539,-0.013724,0.240637,0.821449,-0.833981,-1.131215,...,1.794619,0.172372,1.763661,1.744141,-0.530514,-0.12362,-0.028181,0.991779,-0.561211,-1.008389
2,0.380066,0.06922,0.404101,0.266596,0.96752,0.356414,0.726902,0.857221,0.437094,-0.666053,...,0.629403,0.076638,0.533832,0.492044,1.000466,-0.086163,0.499625,0.57035,-0.107831,-0.206293
3,-0.486317,-0.353185,-0.42857,-0.526233,0.69429,0.533852,-0.144722,-0.533686,4.8e-05,1.147386,...,-0.698111,-0.433394,-0.524721,-0.636959,0.584109,0.065603,-0.163957,-0.620377,-0.553285,0.545322
4,-0.72981,-1.113514,-0.709283,-0.709281,0.294512,0.159898,-0.271202,-0.587608,0.025542,0.703052,...,-0.827117,-0.966535,-0.849575,-0.739243,0.128718,-0.264407,-0.453677,-0.689644,-0.913135,-0.141789


In [205]:
# Candidate classifiers to compare, keyed by display name.
# The forest and the tree are randomised learners; they are seeded with the
# same value used for the train/test split so the comparison is reproducible
# from a fresh kernel.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNeighbors Classifier': KNeighborsClassifier(n_neighbors=5, algorithm="auto"),
}

In [211]:
def evaluate_model(x_train,y_train,x_test,y_test,models):
    """Fit every candidate model and score it on the evaluation split.

    Parameters
    ----------
    x_train, y_train
        Training features and labels handed to each model's ``fit``.
    x_test, y_test
        Evaluation features and labels; predictions for ``x_test`` are
        compared against ``y_test``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{model name: accuracy on the evaluation split}``, in the iteration
        order of ``models``.
    """
    report = {}
    for name, model in models.items():
        # Train and score on the arguments, not on the notebook-level
        # X_train / X_test globals, so the function evaluates whatever data
        # the caller actually passes in.
        model.fit(x_train, y_train)
        y_test_pred = model.predict(x_test)
        report[name] = accuracy_score(y_test, y_test_pred)
    return report

In [212]:
# Compare all candidates in `models`; the result maps model name -> test accuracy.
evaluate_model(x_train,y_train,x_test,y_test,models)

{'Random Forest': 0.9649122807017544,
 'Logistic Regression': 0.9736842105263158,
 'Decision Tree': 0.9385964912280702,
 'KNeighbors Classifier': 0.9473684210526315}

In [213]:
# Base estimator for the hyper-parameter search below. Seeded so that each
# candidate configuration is scored on the same forest from run to run.
# (The name `classfier` is kept as-is because the search cell refers to it.)
classfier = RandomForestClassifier(random_state=42)

In [214]:
from sklearn.model_selection import RandomizedSearchCV

In [216]:
# Search space for the random-forest tuning: tree depth cap, forest size and
# split-quality criterion.
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [217]:
# Randomised search over `params`, scored by 5-fold cross-validated accuracy
# on the training split only.
# random_state pins which candidate combinations get sampled; without it the
# sampled set (and therefore best_params_) can change on every execution.
cv = RandomizedSearchCV(
    classfier,
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.978 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.945 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.978 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.945 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.956 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.934 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.967 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.934 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.934 total time=   0.5s
[CV 

In [218]:
# Parameter combination selected by the search above.
cv.best_params_

{'n_estimators': 300, 'max_depth': 10, 'criterion': 'entropy'}

In [219]:
# Build the final forest from whatever the search selected instead of
# retyping the values by hand: the hand-copied version left out the selected
# max_depth and would silently go stale whenever the search result changes.
model = RandomForestClassifier(**cv.best_params_, random_state=42)

In [220]:
# Fit the final model on the training DataFrame (the one with named columns).
model.fit(x_train,y_train)

In [221]:
# Score the final model on the held-out split.
# The metric functions are called through the `metrics` module because this
# cell stores its results under the names `classification_report` and
# `confusion_matrix` (the later print cell reads them). After the first run
# those names hold results rather than the imported functions, so calling the
# bare names again on a re-run would raise a TypeError.
y_prediction = model.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_prediction)
classification_report = metrics.classification_report(y_test, y_prediction)
confusion_matrix = metrics.confusion_matrix(y_test, y_prediction)

In [224]:
# Preview the transformed test frame used for the predictions above.
x_test.head()

Unnamed: 0,num__mean radius,num__mean texture,num__mean perimeter,num__mean area,num__mean smoothness,num__mean compactness,num__mean concavity,num__mean concave points,num__mean symmetry,num__mean fractal dimension,...,num__worst radius,num__worst texture,num__worst perimeter,num__worst area,num__worst smoothness,num__worst compactness,num__worst concavity,num__worst concave points,num__worst symmetry,num__worst fractal dimension
0,-0.466497,-0.137289,-0.444211,-0.486465,0.28085,0.041606,-0.111465,-0.264869,0.415241,0.135137,...,-0.263235,-0.147842,-0.331548,-0.351093,0.480019,-0.096496,-0.03583,-0.194351,0.172757,0.20373
1,1.365363,0.498665,1.305511,1.341471,-0.406539,-0.013724,0.240637,0.821449,-0.833981,-1.131215,...,1.794619,0.172372,1.763661,1.744141,-0.530514,-0.12362,-0.028181,0.991779,-0.561211,-1.008389
2,0.380066,0.06922,0.404101,0.266596,0.96752,0.356414,0.726902,0.857221,0.437094,-0.666053,...,0.629403,0.076638,0.533832,0.492044,1.000466,-0.086163,0.499625,0.57035,-0.107831,-0.206293
3,-0.486317,-0.353185,-0.42857,-0.526233,0.69429,0.533852,-0.144722,-0.533686,4.8e-05,1.147386,...,-0.698111,-0.433394,-0.524721,-0.636959,0.584109,0.065603,-0.163957,-0.620377,-0.553285,0.545322
4,-0.72981,-1.113514,-0.709283,-0.709281,0.294512,0.159898,-0.271202,-0.587608,0.025542,0.703052,...,-0.827117,-0.966535,-0.849575,-0.739243,0.128718,-0.264407,-0.453677,-0.689644,-0.913135,-0.141789


In [250]:
# First three test rows; the same slice is used for the spot-check prediction below.
x_test[:3]

Unnamed: 0,num__mean radius,num__mean texture,num__mean perimeter,num__mean area,num__mean smoothness,num__mean compactness,num__mean concavity,num__mean concave points,num__mean symmetry,num__mean fractal dimension,...,num__worst radius,num__worst texture,num__worst perimeter,num__worst area,num__worst smoothness,num__worst compactness,num__worst concavity,num__worst concave points,num__worst symmetry,num__worst fractal dimension
0,-0.466497,-0.137289,-0.444211,-0.486465,0.28085,0.041606,-0.111465,-0.264869,0.415241,0.135137,...,-0.263235,-0.147842,-0.331548,-0.351093,0.480019,-0.096496,-0.03583,-0.194351,0.172757,0.20373
1,1.365363,0.498665,1.305511,1.341471,-0.406539,-0.013724,0.240637,0.821449,-0.833981,-1.131215,...,1.794619,0.172372,1.763661,1.744141,-0.530514,-0.12362,-0.028181,0.991779,-0.561211,-1.008389
2,0.380066,0.06922,0.404101,0.266596,0.96752,0.356414,0.726902,0.857221,0.437094,-0.666053,...,0.629403,0.076638,0.533832,0.492044,1.000466,-0.086163,0.499625,0.57035,-0.107831,-0.206293


In [None]:
# Report the held-out metrics. Each label ends with ':' and the multi-line
# report / matrix starts on its own line, so the first row is no longer
# pushed out of alignment by a leading ': '.
print(f"Accuracy: {accuracy}")
print(f"classification report:\n{classification_report}")
print(f"confusion matrix:\n{confusion_matrix}")

Accuracy: 0.9649122807017544
classification report
:               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

confusion matrix
: [[40  3]
 [ 1 70]]


In [None]:
import pickle

# Raw strings keep the Windows backslashes literal instead of relying on
# sequences such as "\B", "\m" and "\p" not being recognised escapes.
MODEL_PATH = r'D:\Breast_cancer project\models\model.pkl'
PREPROCESSOR_PATH = r'D:\Breast_cancer project\models\preprocessor.pkl'

# One handle per artifact, each closed by its own `with` block. Previously
# the second open() rebound the same variable, so model.pkl was left empty
# (and its handle never closed) while both objects were written into
# preprocessor.pkl.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(PREPROCESSOR_PATH, 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [255]:
# True labels for the first three test rows (positional slice).
actual_value = y_test.iloc[:3]

In [256]:
# Model predictions for the same three test rows.
predict_val = model.predict(x_test.iloc[:3])

In [258]:
# Show the true labels for the spot-check rows.
print(actual_value)


204    1
70     0
131    0
Name: Diagnosis, dtype: int64


In [259]:
# Show the predicted labels for the same rows, for side-by-side comparison.
print(predict_val)

[1 0 0]
