<a href="https://colab.research.google.com/github/EricaAnnor/breast_cancer_prediction/blob/main/Breast_Cancer_predicition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import modules


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score


## Read datasets

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from below this mount point.
# NOTE(review): presumably this import only resolves inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw dataset from the mounted Drive folder; `new_data` is kept
# untouched and later cells work on copies of it.
new_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Breast_Cancer prediction/breasts.csv'
)


In [None]:
# Work on a deep copy so the raw frame in `new_data` is never modified.
data = new_data.copy(deep=True)

# Preview the first five rows.
data.head(n=5)

In [None]:
# Encode the diagnosis label as an integer: 'M' -> 1, 'B' -> 0
# (presumably malignant / benign - confirm against the dataset description).
#
# The mapping reads from the untouched `new_data` frame instead of from `data`
# itself, which makes this cell idempotent. Mapping the already-encoded column
# a second time would turn every label into NaN, because 0 and 1 are not keys
# of the mapping dict. `data` is a copy of `new_data`, so both share the same
# index and the assignment lines up row for row.
data['Diagnosis'] = new_data['Diagnosis'].map({'M': 1, 'B': 0})


## Splitting our data into X and Y values


In [None]:
# Feature matrix: every column except the row identifier and the label.
X = data.drop(columns=['ID', 'Diagnosis'])

# Label vector: the encoded diagnosis column.
Y = data['Diagnosis']

In [None]:
# Preview the first five labels.
Y.head(n=5)

In [None]:
# Preview the first five feature rows.
X.head(n=5)

## Data Preprocessing

In [None]:
# Fresh deep copy of the raw frame to use as the preprocessing input.
data_ = new_data.copy(deep=True)

In [None]:
# Features to be standardized: drop the identifier and the label column.
X_ = data_.drop(columns=['ID', 'Diagnosis'])

## Standardization


In [None]:
# Standardize every feature column; the result is a NumPy array with the same
# row/column layout as X_.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# is made, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
z_scores = scaler.fit(X_).transform(X_)


In [None]:
# Display the standardized feature array produced by scaler.fit_transform.
z_scores

In [None]:
# Plain-text print of the same standardized feature array.
print(z_scores)

## Feature selection using the filtering method


In [None]:
# Display the label Series that is attached to the scaled features as 'target' below.
Y


In [None]:
# Convert the scaled array back to a DataFrame. The scaler output is a bare
# array, so both the column names and the row index of X_ are restored
# explicitly.
X_scaled_df = pd.DataFrame(z_scores, columns=X_.columns, index=X_.index)

# Column assignment from a Series aligns on index labels, not on position.
# Re-using X_'s index (which Y shares, since both derive from `new_data`)
# guarantees each row receives its own label. With a freshly generated default
# index, any non-default index on the source data would silently produce NaN
# targets.
X_scaled_df['target'] = Y

In [None]:
# Preview the first five rows of the scaled features plus the target column.
X_scaled_df.head(n=5)

In [None]:
# Pairwise Pearson correlation between every pair of columns, including 'target'.
correlation_matrix = X_scaled_df.corr(method='pearson')

# Show the full matrix.
correlation_matrix

In [None]:
# Keep only the column holding each variable's correlation with the label.
target_correlation = correlation_matrix.loc[:, 'target']

In [None]:
# Minimum absolute correlation with the target for a column to be kept.
threshold = 0.5

# Boolean mask of the columns that clear the threshold, then their names.
# NOTE: 'target' correlates perfectly with itself, so it is part of
# relevant_features (and therefore of selected_data) at this point.
is_relevant = target_correlation.abs() > threshold
relevant_features = target_correlation[is_relevant].index.tolist()

# Restrict the scaled frame to the columns that passed the filter.
selected_data = X_scaled_df.loc[:, relevant_features]

In [None]:
# Names of the columns whose absolute correlation with 'target' exceeded the threshold.
relevant_features

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back. Without the assignment the label column
# 'target' stays inside selected_data and is handed to the models as a feature,
# which lets them read the answer directly and inflates every reported score.
# errors='ignore' keeps the cell re-runnable once 'target' has been removed.
selected_data = selected_data.drop(columns='target', errors='ignore')

# Show the final feature frame (label column removed).
selected_data

## Model Construction

In [None]:
# 70:30 train/test partition of the selected features and labels.
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(
    selected_data,
    Y,
    test_size=0.3,
    random_state=42,
)

# 80:20 train/test partition, drawn with the same seed.
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(
    selected_data,
    Y,
    test_size=0.2,
    random_state=42,
)

### Initialising models

In [None]:
# One estimator per algorithm under comparison. The three estimators that
# accept a seed are pinned to 42; KNN is created with its defaults.
knn_model = KNeighborsClassifier()
logistic_regression = LogisticRegression(random_state=42)
xgboost_model = XGBClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

In [None]:
# Training models with 70:30 split
for estimator in (random_forest, xgboost_model, logistic_regression):
    estimator.fit(X_train_70, y_train_70)

# The last fit stays a bare expression so the cell still displays the fitted
# KNN estimator as its output.
knn_model.fit(X_train_70, y_train_70)


### Evaluation (70:30 split)

In [None]:
# Evaluating models with 70:30 split: predict on the held-out 30% with each
# fitted estimator, in the order RF, XGBoost, logistic regression, KNN.
rf_pred_70, xgb_pred_70, lr_pred_70, knn_pred_70 = (
    estimator.predict(X_test_70)
    for estimator in (random_forest, xgboost_model, logistic_regression, knn_model)
)

# Calculating evaluation metrics for 70:30 split: one classification report
# and one accuracy line per model.
print("\nEvaluation Metrics with 70:30 split:")
for heading, predicted in (
    ("Random Forest:", rf_pred_70),
    ("\nXGBoost:", xgb_pred_70),
    ("\nLogistic Regression:", lr_pred_70),
    ("\nK-Nearest Neighbors:", knn_pred_70),
):
    print(heading)
    print(classification_report(y_test_70, predicted))
    print("Accuracy:", accuracy_score(y_test_70, predicted))

### Training on the 80:20 split

In [None]:
# Re-fit the same four estimator objects on the 80% training partition.
for estimator in (random_forest, xgboost_model, logistic_regression):
    estimator.fit(X_train_80, y_train_80)

# The last fit stays a bare expression so the cell still displays the fitted
# KNN estimator as its output.
knn_model.fit(X_train_80, y_train_80)

### Evaluation for 80:20 split

In [None]:
# Predict on the held-out 20% with each fitted estimator, in the order
# RF, XGBoost, logistic regression, KNN.
rf_pred_80, xgb_pred_80, lr_pred_80, knn_pred_80 = (
    estimator.predict(X_test_80)
    for estimator in (random_forest, xgboost_model, logistic_regression, knn_model)
)

# Calculating evaluation metrics for 80:20 split: one classification report
# and one accuracy line per model.
print("\nEvaluation Metrics with 80:20 split:")
for heading, predicted in (
    ("Random Forest:", rf_pred_80),
    ("\nXGBoost:", xgb_pred_80),
    ("\nLogistic Regression:", lr_pred_80),
    ("\nK-Nearest Neighbors:", knn_pred_80),
):
    print(heading)
    print(classification_report(y_test_80, predicted))
    print("Accuracy:", accuracy_score(y_test_80, predicted))