In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import ExtraTreesClassifier
from google.colab import files

uploaded = files.upload()


Saving breast-cancer.csv to breast-cancer (1).csv


In [3]:
df = pd.read_csv("breast-cancer.csv")

print(df.head())
print(df.info())
print(df.describe())


df = df.dropna()

# Encoding the target variable: 'M' -> 1 (Malignant), 'B' -> 0 (Benign)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimeter_worst  area_wor

In [4]:
# Separating features and target
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']


In [5]:
# Using Filter method: SelectKBest with chi2
filter_selector = SelectKBest(score_func=chi2, k=10)
X_selected_filter = filter_selector.fit_transform(X, y)
selected_features_filter = X.columns[filter_selector.get_support()]

print("Selected Features (Filter Method):", list(selected_features_filter))


Selected Features (Filter Method): ['id', 'radius_mean', 'perimeter_mean', 'area_mean', 'perimeter_se', 'area_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst']


In [6]:
# Using Wrapper method: Recursive Feature Elimination (RFE)
rfe_selector = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10, step=1)
rfe_selector.fit(X, y)
selected_features_wrapper = X.columns[rfe_selector.get_support()]

print("Selected Features (Wrapper Method):", list(selected_features_wrapper))


Selected Features (Wrapper Method): ['perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'concavity_worst', 'concave points_worst']


In [7]:
# Using Embedded method: Feature importance using RandomForestClassifier
model_embedded = ExtraTreesClassifier(random_state=42)
model_embedded.fit(X, y)
importances = model_embedded.feature_importances_
selected_features_embedded = X.columns[np.argsort(importances)[-10:]]  # Select top 10 features

print("Selected Features (Embedded Method):", list(selected_features_embedded))


Selected Features (Embedded Method): ['concavity_mean', 'concavity_worst', 'concave points_mean', 'area_mean', 'perimeter_mean', 'radius_worst', 'radius_mean', 'perimeter_worst', 'area_worst', 'concave points_worst']


In [8]:
X_selected = X[selected_features_filter]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
# Training Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 94.74%


In [19]:

results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results.to_csv('results.csv', index=False)
print("Results saved to results.csv")


Results saved to results.csv
