<a href="https://colab.research.google.com/github/Esther10203/python/blob/main/Breast_Cancel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overall Outline

1. Read Data

2. Data Preprocessing

3. Declare Model

4. Train Model

5. Predict

6. Evaluate

In [71]:
# Install required libraries
!pip install scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical




In [72]:
# Sample data (replace this with your actual data)
data = pd.read_csv('cancer.csv')
df = pd.DataFrame(data)

data.info()
data.describe()
data.columns
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [73]:
# Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [74]:
# Separate features and target variable
X = df.drop('Class', axis=1)
y = df['Class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [75]:

# Create a logistic regression model
# model = LogisticRegression()
logistic_model = LogisticRegression(max_iter=1000)

# Train the model
logistic_model.fit(X_train, y_train)


In [76]:

# Make predictions on the test set
logistic_y_pred = logistic_model.predict(X_test)

# Evaluate the Logistic Regression model
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
logistic_r2 = r2_score(y_test, logistic_y_pred)
logistic_mse = mean_squared_error(y_test, logistic_y_pred)
logistic_mcc = matthews_corrcoef(y_test, logistic_y_pred)
logistic_mr2 = logistic_mcc ** 2

In [77]:
# Create the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

In [78]:
# Make predictions with Random Forest
rf_y_pred = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_mcc = matthews_corrcoef(y_test, rf_y_pred)
rf_mr2 = rf_mcc ** 2

In [79]:
# Build and train a Neural Network
nn_model = Sequential()
nn_model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(2, activation='softmax'))

nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

<keras.src.callbacks.History at 0x7efad8a96020>

In [80]:
# Make predictions with Neural Network
nn_y_pred_prob = nn_model.predict(X_test)
nn_y_pred = nn_y_pred_prob.argmax(axis=1)
nn_mse = mean_squared_error(y_test, nn_y_pred)
nn_r2 = r2_score(y_test, nn_y_pred)
nn_mcc = matthews_corrcoef(y_test, nn_y_pred)
nn_mr2 = nn_mcc ** 2



In [81]:
# Create a DataFrame to compare models
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Neural Network'],
    'MSE': [logistic_mse, rf_mse, nn_mse],
    'R2': [logistic_r2, rf_r2, nn_r2],
    'MR^2': [logistic_mr2, rf_mr2, nn_mr2]
})

print("\nComparison of Models:")
print(comparison_df)


Comparison of Models:
                 Model       MSE        R2      MR^2
0  Logistic Regression  0.362069 -0.567568  0.009504
1        Random Forest  0.327586 -0.418275  0.047889
2       Neural Network  0.379310 -0.642214  0.005396
