In [127]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split


In [128]:
import os

# Walk the Kaggle input directory, print every file found, and remember the
# full paths so the file to load is chosen explicitly instead of relying on
# whatever the loop variables happen to hold after the walk finishes.
input_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)
        input_files.append(input_path)

# Only CSV files can be read below; fail with a clear message if none exist.
csv_files = [path for path in input_files if path.lower().endswith('.csv')]
if not csv_files:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# Load the last CSV encountered (the walk order decides which one that is).
# NOTE(review): presumably the input folder holds a single CSV - confirm.
dataset = pd.read_csv(csv_files[-1])
dataset.head()

In [129]:
# Remove the "Unnamed: 32" column before any analysis.
# NOTE(review): presumably an artifact column produced when reading the CSV - confirm.
# errors="ignore" keeps this cell safe to re-run after the column is already gone.
dataset = dataset.drop(columns=["Unnamed: 32"], errors="ignore")
dataset.shape

In [130]:
# Encode the target labels as integers: "M" -> 2 and "B" -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series, because that chained
# in-place call is not guaranteed to write through to `dataset`.
# Values that are not in the mapping (including already-encoded 1/2 on a
# re-run) are left unchanged.
diagnosis_codes = {"M": 2, "B": 1}
dataset["diagnosis"] = dataset["diagnosis"].map(
    lambda label: diagnosis_codes.get(label, label)
)
dataset.head()

In [131]:
# Pairwise correlation between all columns (Pearson is the pandas default, spelled out here).
datasetCorr = dataset.corr(method="pearson")

### Heatmap to show the correlation matrix of the features

In [132]:
# Draw the full correlation matrix on a 30x30 inch figure with annotated cells.
ax = plt.subplots(figsize=(30, 30))[1]
diverging_colors = sns.diverging_palette(20, 220)
ax = sns.heatmap(
    datasetCorr,
    vmin=-1.0,
    vmax=1.0,
    center=0,
    cmap=diverging_colors,
    annot=True,
    square=True,
    ax=ax,
)

# Re-apply the current x tick labels, right-aligned.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, horizontalalignment='right')

### Feature selection using correlation

In [133]:
# Keep only the columns whose absolute correlation with the target reaches
# the threshold. The target correlates perfectly with itself, so "diagnosis"
# is always part of this selection at this point.
corr_threshold = 0.7
corr_target = abs(datasetCorr["diagnosis"])
useful_features = corr_target[corr_target >= corr_threshold]

useful_features = useful_features.keys()


# Correlation heatmap restricted to the selected columns (target included).
map1 = dataset[useful_features]
mapcorr = map1.corr()

plt.subplots(figsize=(14, 14))
ax = sns.heatmap(
    mapcorr,
    vmin = -1.0, vmax = 1.0, center=0,
    cmap = sns.diverging_palette(20, 220),
    annot = True,
    square = True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    horizontalalignment='right'
)

# The title is derived from the threshold actually used (it previously read
# "0.3 Threshold" although the filter is 0.7), and is set on the heatmap axes
# explicitly rather than on whichever axes happens to be current.
ax.set_title(f"{corr_threshold} Threshold")


# NOTE(review): the file name contains a typo ("thresho1ld"); it is kept
# unchanged so anything that reads this file keeps working.
plt.savefig("0.7 thresho1ld.png")

# Remove the target by label instead of by position: delete(0) only works
# while "diagnosis" happens to be the first selected column, and would
# otherwise leave the target among the model inputs.
useful_features = useful_features.drop("diagnosis")
useful_features = useful_features.tolist()
useful_features

In [134]:
# Feature matrix: every row, restricted to the selected feature columns.
X = dataset.loc[:, useful_features]
X.head()

In [135]:
# Target vector: the encoded diagnosis column.
target_column = "diagnosis"
y = dataset[target_column]
y.head()

### Splitting the dataset into train (70%) and test (30%) sets


In [136]:


# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_test_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_parts

# Display names of the models, in the order their metrics are appended below.
algo = [
    "Random Forest Classifier",
    "Logistic Regression",
    "AdaBoost Classifier",
    "Gradient Boosting",
    "Bagging Classifier",
]

# One shared list per metric; each model cell appends its own value.
accuracy, precision, recall, f1_score = [], [], [], []

## Training and testing models
1. Random Forest Classifier
2. Logistic Regression 
3. AdaBoost Classifier
4. Gradient Boosting
5. Bagging Classifier


### 1) Random Forest Classifier

In [137]:


# Calling the model on the dataset
# A fixed random_state makes the forest reproducible across runs, consistent
# with the other seeded models in this notebook.
randomForestModel = RandomForestClassifier(random_state=42)
randomForestModel.fit(X_train, y_train)
y_predict = randomForestModel.predict(X_test)


# Calculating Evaluation Metrics
# Accuracy, precision and recall are percentages; the F1 score is rescaled to 0-1.
# Precision and recall are computed for the class at index 0 of the confusion
# matrix (rows are true labels, columns are predicted labels).
randomForestAccuracy = accuracy_score(y_test, y_predict) * 100
randomForestConfusion = confusion_matrix(y_test, y_predict)
randomForestPrecision = randomForestConfusion[0][0]/(randomForestConfusion[0][0] + randomForestConfusion[1][0]) * 100
randomForestRecall = randomForestConfusion[0][0]/(randomForestConfusion[0][0] + randomForestConfusion[0][1]) * 100
randomForestf1Score = ((2 * randomForestPrecision * randomForestRecall) / (randomForestPrecision + randomForestRecall)) / 100


# Storing all the metrics in values for Model in common lists
accuracy.append(round(randomForestAccuracy, 2))
precision.append(round(randomForestPrecision, 2))
recall.append(round(randomForestRecall, 2))
f1_score.append(round(randomForestf1Score, 4))

In [138]:


# Report the unrounded Random Forest metrics, one per line.
print("Results: ")
for metric_label, metric_value in (
    ("Accuracy:", randomForestAccuracy),
    ("Precision:", randomForestPrecision),
    ("Recall:", randomForestRecall),
    ("F1 Score:", randomForestf1Score),
):
    print(metric_label, metric_value)

### 2) Logistic Regression

In [139]:


# Fit a seeded logistic regression (liblinear solver) and predict on the test split.
# NOTE(review): the variable name is misspelled ("Refression"); it is kept
# unchanged in case other cells reference it.
logisticRefressionModel = LogisticRegression(solver = 'liblinear', max_iter=300, random_state=42)
logisticRefressionModel.fit(X_train, y_train)
y_pred_lr = logisticRefressionModel.predict(X_test)



# Calculating Evaluation Metrics
# Accuracy, precision and recall are percentages; the F1 score is rescaled to 0-1.
# Precision and recall are computed for the class at index 0 of the confusion
# matrix (rows are true labels, columns are predicted labels).
logisticRegressionAccuracy = accuracy_score(y_test, y_pred_lr) * 100
logisticRegressionConfusion = confusion_matrix(y_test, y_pred_lr)
logisticRegressionPrecision = logisticRegressionConfusion[0][0]/(logisticRegressionConfusion[0][0] + logisticRegressionConfusion[1][0]) * 100
# Recall reads from logisticRegressionConfusion; the previous code referenced
# an undefined name (lr_confusion), which raises NameError on a fresh kernel.
logisticRegressionRecall = logisticRegressionConfusion[0][0]/(logisticRegressionConfusion[0][0] + logisticRegressionConfusion[0][1]) * 100
logisticRegressionF1 = ((2 * logisticRegressionPrecision * logisticRegressionRecall) / (logisticRegressionPrecision + logisticRegressionRecall)) / 100



# Storing all the metrics in values in common lists
accuracy.append(round(logisticRegressionAccuracy, 2))
precision.append(round(logisticRegressionPrecision, 2))
recall.append(round(logisticRegressionRecall, 2))
f1_score.append(round(logisticRegressionF1, 4))

In [140]:


# Report the unrounded Logistic Regression metrics, one per line.
print("Results: ")
for metric_label, metric_value in (
    ("Accuracy:", logisticRegressionAccuracy),
    ("Precision:", logisticRegressionPrecision),
    ("Recall:", logisticRegressionRecall),
    ("F1 Score:", logisticRegressionF1),
):
    print(metric_label, metric_value)

### 3) AdaBoost Classifier

In [141]:


# Fit a seeded AdaBoost ensemble (500 estimators, learning rate 0.1) and
# predict on the test split.
adaboostModel = AdaBoostClassifier(learning_rate=0.1, n_estimators=500, random_state=42)
adaboostModel.fit(X_train, y_train)
y_pred_ada = adaboostModel.predict(X_test)


# Evaluation metrics: accuracy, precision and recall as percentages, F1 on a
# 0-1 scale. Precision and recall are taken for the class at index 0 of the
# confusion matrix.
adaboostConfusion = confusion_matrix(y_test, y_pred_ada)
adaboostAccuracy = accuracy_score(y_test, y_pred_ada) * 100
ada_first_class_hits = adaboostConfusion[0, 0]
adaboostPrecision = ada_first_class_hits / (ada_first_class_hits + adaboostConfusion[1, 0]) * 100
adaboostRecall = ada_first_class_hits / (ada_first_class_hits + adaboostConfusion[0, 1]) * 100
ada_pr_product = 2 * adaboostPrecision * adaboostRecall
adaboostF1 = (ada_pr_product / (adaboostPrecision + adaboostRecall)) / 100


# Append the rounded values to the shared per-metric lists.
for metric_store, metric_value, n_digits in (
    (accuracy, adaboostAccuracy, 2),
    (precision, adaboostPrecision, 2),
    (recall, adaboostRecall, 2),
    (f1_score, adaboostF1, 4),
):
    metric_store.append(round(metric_value, n_digits))

In [142]:


# Report the unrounded AdaBoost metrics, one per line.
print("Results: ")
for metric_label, metric_value in (
    ("Accuracy:", adaboostAccuracy),
    ("Precision:", adaboostPrecision),
    ("Recall:", adaboostRecall),
    ("F1 Score:", adaboostF1),
):
    print(metric_label, metric_value)

### 4) Gradient Boosting Classifier

In [143]:

# Fit gradient boosting with default hyperparameters and predict on the test split.
# A fixed random_state makes the fitted model reproducible across runs,
# consistent with the other seeded models in this notebook.
gradientBoostModel = GradientBoostingClassifier(random_state=42)
gradientBoostModel.fit(X_train, y_train)
y_pred_gb = gradientBoostModel.predict(X_test)



# Calculating Evaluation Metrics
# Accuracy, precision and recall are percentages; the F1 score is rescaled to 0-1.
# Precision and recall are computed for the class at index 0 of the confusion
# matrix (rows are true labels, columns are predicted labels).
gradientBoostAccuracy = accuracy_score(y_test, y_pred_gb) * 100
gradientBoostConfusion = confusion_matrix(y_test, y_pred_gb)
gradientBoostPrecision = gradientBoostConfusion[0][0]/(gradientBoostConfusion[0][0] + gradientBoostConfusion[1][0]) * 100
gradientBoostRecall = gradientBoostConfusion[0][0]/(gradientBoostConfusion[0][0] + gradientBoostConfusion[0][1]) * 100
gradientBoostF1 = ((2 * gradientBoostPrecision * gradientBoostRecall) / (gradientBoostPrecision + gradientBoostRecall)) / 100



# Storing all the metrics in values in common lists
accuracy.append(round(gradientBoostAccuracy, 2))
precision.append(round(gradientBoostPrecision, 2))
recall.append(round(gradientBoostRecall, 2))
f1_score.append(round(gradientBoostF1, 4))

In [144]:


# Report the unrounded Gradient Boosting metrics, one per line.
print("Results:")
for metric_label, metric_value in (
    ("Accuracy:", gradientBoostAccuracy),
    ("Precision:", gradientBoostPrecision),
    ("Recall:", gradientBoostRecall),
    ("F1 Score:", gradientBoostF1),
):
    print(metric_label, metric_value)

### 5) Bagging Classifier

In [145]:

# Fit a seeded bagging ensemble of 200 estimators and predict on the test split.
baggingClassifierModel = BaggingClassifier(random_state=42, n_estimators=200)
baggingClassifierModel.fit(X_train, y_train)
y_pred_bc = baggingClassifierModel.predict(X_test)


# Evaluation metrics: accuracy, precision and recall as percentages, F1 on a
# 0-1 scale. Precision and recall are taken for the class at index 0 of the
# confusion matrix.
baggingClassifierConfusion = confusion_matrix(y_test, y_pred_bc)
baggingClassifierAccuracy = accuracy_score(y_test, y_pred_bc) * 100
bagging_first_class_hits = baggingClassifierConfusion[0, 0]
baggingClassifierPrecision = bagging_first_class_hits / (bagging_first_class_hits + baggingClassifierConfusion[1, 0]) * 100
baggingClassifierRecall = bagging_first_class_hits / (bagging_first_class_hits + baggingClassifierConfusion[0, 1]) * 100
bagging_pr_product = 2 * baggingClassifierPrecision * baggingClassifierRecall
baggingClassifierF1 = (bagging_pr_product / (baggingClassifierPrecision + baggingClassifierRecall)) / 100


# Append the rounded values to the shared per-metric lists.
for metric_store, metric_value, n_digits in (
    (accuracy, baggingClassifierAccuracy, 2),
    (precision, baggingClassifierPrecision, 2),
    (recall, baggingClassifierRecall, 2),
    (f1_score, baggingClassifierF1, 4),
):
    metric_store.append(round(metric_value, n_digits))

In [146]:


# Report the unrounded Bagging Classifier metrics, one per line.
print("Results: ")
for metric_label, metric_value in (
    ("Accuracy:", baggingClassifierAccuracy),
    ("Precision:", baggingClassifierPrecision),
    ("Recall:", baggingClassifierRecall),
    ("F1 Score:", baggingClassifierF1),
):
    print(metric_label, metric_value)

### Performance comparison

In [147]:
# Build the comparison table: one row per algorithm, one column per metric.
# The lists are filled by the model cells, so each model cell must have run
# exactly once for the list lengths to match `algo`.
# The first column header is spelled "Algorithms" (it previously read "Alogrithms").
metrics = pd.DataFrame({
    'Algorithms':algo,
    'Accuracy':accuracy,
    'Precision':precision,
    'Recall':recall,
    'F1 Score':f1_score
})

In [148]:
# Show the comparison table as this cell's output.
metrics