# Loading Libraries and Dataset

In [38]:
# Loading libraries (nothing is installed here; the packages must already be available)
import pandas as pd
import numpy as np

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Train test split
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [39]:
# Read the preprocessed dataset from its CSV file into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Dataset/PreprocessedDataset.csv")

# Train Test Split

In [40]:
# Column names kept as the "good" features (the subset picked by feature selection).
# One entry per line so additions and removals show up cleanly in diffs.
good_features = [
    "biopsies",
    "histologicalclass",
    "consumed_alcohol",
    "menopause",
    "is_sad",
]

In [41]:
# Target vector: the "cancer" column.
y = df["cancer"]

# Feature matrix for the all-features experiment: every column except the target.
X_normal = df.drop(columns="cancer")

# Feature matrix for the selected-features experiment: only the columns in good_features.
X_good = df[good_features]

In [42]:
# Hold-out split for the all-features experiment: 30% of rows go to the test set.
# The fixed random_state makes the shuffle repeatable across runs.
X_normal_train, X_normal_test, y_normal_train, y_normal_test = train_test_split(
    X_normal,
    y,
    test_size=0.3,
    random_state=42,
)

In [43]:
# Hold-out split for the selected-features experiment: 30% of rows go to the test set.
# It uses the same test_size and random_state as the all-features split.
X_good_train, X_good_test, y_good_train, y_good_test = train_test_split(
    X_good,
    y,
    test_size=0.3,
    random_state=42,
)

# Model

## All Features

In [44]:
# Random forest for the all-features experiment: 100 trees, fixed seed.
model1 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Train on the all-features training split; fit is the cell's last expression.
model1.fit(X_normal_train, y_normal_train)

In [45]:
# Predict labels for the held-out rows of the all-features split with model1.
y_pred = model1.predict(X=X_normal_test)

In [46]:
# Score model1 (all features) against the held-out labels.
# NOTE(review): the recorded output for this cell reports Accuracy 1.00 with no
# misclassifications; confirm that no feature column leaks the "cancer" label.

# Overall fraction of correct predictions, printed to two decimals.
accuracy = accuracy_score(y_true=y_normal_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Counts of true vs. predicted labels.
conf_matrix = confusion_matrix(y_true=y_normal_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision, recall, f1-score and support as a text table.
class_report = classification_report(y_true=y_normal_test, y_pred=y_pred)
print("Classification Report:", class_report, sep="\n")

Accuracy: 1.00
Confusion Matrix:
[[165   0]
 [  0 341]]
Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       165
        True       1.00      1.00      1.00       341

    accuracy                           1.00       506
   macro avg       1.00      1.00      1.00       506
weighted avg       1.00      1.00      1.00       506



## Good Features

In [47]:
# Random forest for the selected-features experiment: 100 trees, fixed seed
# (same hyperparameters as model1).
model2 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Train on the selected-features training split; fit is the cell's last expression.
model2.fit(X_good_train, y_good_train)

In [48]:
# Predict labels for the held-out rows of the selected-features split with model2.
# Note: this reuses the name y_pred, replacing the all-features predictions.
y_pred = model2.predict(X=X_good_test)

In [49]:
# Score model2 (selected features) against the held-out labels.
# y_pred at this point holds model2's predictions (the name is reused).

# Overall fraction of correct predictions, printed to two decimals.
accuracy = accuracy_score(y_true=y_good_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Counts of true vs. predicted labels.
conf_matrix = confusion_matrix(y_true=y_good_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision, recall, f1-score and support as a text table.
class_report = classification_report(y_true=y_good_test, y_pred=y_pred)
print("Classification Report:", class_report, sep="\n")

Accuracy: 0.99
Confusion Matrix:
[[165   0]
 [  6 335]]
Classification Report:
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       165
        True       1.00      0.98      0.99       341

    accuracy                           0.99       506
   macro avg       0.98      0.99      0.99       506
weighted avg       0.99      0.99      0.99       506

