In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv(r'E:\BreastCancerDetection\Dataset\breast cancer kaggle.csv')

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
dataset.select_dtypes(include= 'object').columns

In [None]:
len(dataset.select_dtypes(include= 'object').columns)

In [None]:
dataset.select_dtypes(include= ['float64','int64']).columns

In [None]:
#statistical summary:

dataset.describe()

In [None]:
# Dealing with missing values:

dataset.isnull().values.any()

In [None]:
dataset.isnull().values.sum()

In [None]:
dataset.columns[dataset.isnull().any()]

In [None]:
len(dataset.columns[dataset.isnull().any()])

In [None]:
dataset['Unnamed: 32'].count()

In [15]:
dataset = dataset.drop(columns='Unnamed: 32')

In [None]:
dataset.shape

In [None]:
dataset.isnull().values.any()

In [None]:
# Dealing with categorical values:

dataset.select_dtypes(include= 'object').columns

In [None]:
dataset['diagnosis'].unique()

In [20]:
#One hot Encoding:

dataset = pd.get_dummies(data = dataset, drop_first= True)

In [None]:
dataset.head()

In [None]:
# Count plot of the encoded target column (0 = benign, 1 = malignant).
# The column is passed by keyword: recent seaborn releases take `data` as the
# first positional parameter and make `x`/`y` keyword-only, so handing the
# Series over positionally is deprecated / no longer interpreted as `x`.
ax = sns.countplot(x='diagnosis_M', data=dataset)
ax.set_ylabel('Count')
plt.show()

In [None]:
# B(0) Values

(dataset.diagnosis_M == 0).sum()

In [None]:
# M(1) Values

(dataset.diagnosis_M == 1).sum()

In [25]:
# Correlation and Heatmap

dataset_2 = dataset.drop(columns= 'diagnosis_M')

In [None]:
dataset_2.head(5)

In [None]:
dataset_2.corrwith(dataset['diagnosis_M']).plot.bar(
    figsize=(20, 10), title='Correlation between daignosis_M', rot = 45, grid = True
)


In [28]:
# Correlation Matrix.

corr = dataset.corr()

In [None]:
corr

In [None]:
# HeatMap

plt.figure(figsize = (20,10))
sns.heatmap(corr, annot = True)

In [None]:
# Sliting the Dataset into Training and Testing:

dataset.head(3)

In [32]:
#matrix pf features / independent variation

x = dataset.iloc[:,1:-1].values

In [None]:
x.shape

In [34]:
y = dataset.iloc[:, -1].values

In [None]:
y.shape

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state= 0)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [40]:
# Feature Scaling:

from sklearn.preprocessing import StandardScaler

In [41]:
sc = StandardScaler()

In [42]:
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train

In [None]:
x_test

In [45]:
# Model Building:

In [46]:
# 1) Logistic Regression:

from sklearn.linear_model import LogisticRegression

In [47]:
classifier_lr = LogisticRegression(random_state = 0)

In [None]:
classifier_lr.fit(x_train, y_train)

In [49]:
y_pred = classifier_lr.predict(x_test)

In [50]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score

In [51]:
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [52]:
results = pd.DataFrame([['Logistic Regression', acc,f1,prec,recall]],
                       columns=['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

In [None]:
results

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [55]:
# Cross Validation:

from sklearn.model_selection import cross_val_score

In [56]:
accuracies = cross_val_score(estimator=classifier_lr, X=x_train, y=y_train, cv=10)

In [None]:
print("Accuracy is: {:.2f} %". format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %". format(accuracies.std()*100))

In [58]:
# 2) Random Forest:

from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier_rf = RandomForestClassifier(random_state= 0)
classifier_rf.fit(x_train, y_train)

In [60]:
y_pred = classifier_rf.predict(x_test)

In [61]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score

In [62]:
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [63]:
model_results = pd.DataFrame([['Random Forest', acc,f1,prec,recall]],
                       columns=['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

In [64]:
results = pd.concat([results, model_results], ignore_index=True)

In [None]:
results

In [None]:
# confusion matrix 

cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
accuracies = cross_val_score(estimator=classifier_rf, X=x_train, y=y_train, cv=10)

print("Accuracy is: {:.2f} %". format(accuracies.mean()*100))
print("Standard Deviation is {:.2f} %". format(accuracies.std()*100))

In [68]:
# Randomized Search to find the Best Parameters(Logistic Regression)

from sklearn.model_selection import RandomizedSearchCV

In [74]:
parameters = {'penalty':['l1','l2','elasticnet','none'],
              'C':[0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0],
              'solver': ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
}

In [None]:
parameters

In [76]:
random_search = RandomizedSearchCV(estimator= classifier_lr, param_distributions= parameters, n_iter= 10, scoring='roc_auc', n_jobs=-1, cv = 10, verbose= 3)

In [None]:
random_search.fit(x_train, y_train)

In [None]:
random_search.best_estimator_

In [None]:
random_search.best_score_

In [None]:
random_search.best_params_

In [None]:
# Final Model: (Logistic Regression)

from sklearn.linear_model import LogisticRegression


# Only the settings that matter are passed. The previous call spelled out every
# constructor argument, including `multi_class`, which scikit-learn deprecated
# in 1.5 and which breaks the call once it is removed. The omitted values
# (penalty='l2', tol, fit_intercept, ...) are the library defaults in current
# releases, so the fitted model is the same.
classifier = LogisticRegression(C=0.5, solver='lbfgs', max_iter=100, random_state=0)
classifier.fit(x_train, y_train)

In [None]:
# Score the final model on the held-out test split.
y_pred = classifier.predict(x_test)

# Evaluation metrics, computed in the order they are unpacked.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# One-row frame describing this model.
final_row = {
    'Model': 'Final Logistic Regression',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
model_results = pd.DataFrame({column: [value] for column, value in final_row.items()})

# The running 'results' table is expected to exist already; create an empty
# one with the same columns if it does not.
if 'results' not in locals():
    results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Append this model's row and show the updated table.
results = pd.concat([results, model_results], ignore_index=True)

print(results)

In [None]:
# Predicting single observation.

dataset.head()

In [97]:
single_obs = [[17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871, 1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193, 25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189]]

In [None]:
single_obs

In [None]:
classifier.predict(sc.transform(single_obs))

In [None]:
# Classify the sample and translate the encoded label back to a diagnosis.
# The target column is diagnosis_M (one-hot of 'diagnosis' with the first level
# dropped), so 1 means malignant and 0 means benign. The previous mapping was
# inverted and reported malignant predictions as "Benign".
prediction = classifier.predict(sc.transform(single_obs))
label_map = {0: "Benign", 1: "Malignant"}
# int() makes the lookup work whether the model returns 0/1 or False/True labels.
print(label_map[int(prediction[0])])