# Task Description

## Implementation

The task involves implementing and evaluating three classifiers for diagnosing breast cancer using a dataset of patients tested via fine needle aspiration (FNA). The dataset contains statistics of 10 different features of multiple cell samples, along with a diagnosis (malignant or benign).

1. **Rule-based Classifier**: A rule-based classifier where abnormal cell size, shape, texture, or homogeneity indicate malignancy.

2. **Random Forest Classifier**: Applied to the supplied dataset features using the sklearn framework.

3. **Custom Classifier**: Designed to balance interpretability and classification performance.

### Rule-based Classifier
For the rule-based classifier, appropriate variables need to be defined based on medical insights and the available data. The rules are interpreted from the medical insights provided.

### Random Forest Classifier
The sklearn framework is used to implement a random forest classifier on the dataset features.

### Custom Classifier
A custom classifier is designed to balance interpretability and classification performance, building on existing models but focused on the mentioned trade-off.

## Evaluation
The classification performance of the three classifiers will be compared and the interpretability of each will be discussed. Notable interactions between features will also be explored.

---



In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from IPython.display import Image

In [128]:
class BreastCancerClassifier:
    def __init__(self):
        # Initialize classifiers
        self.random_forest_classifier = RandomForestClassifier()
        
    def fit(self, df, X_train, y_train):
        self.fit_rule_based_classifier(df)
        self.random_forest_classifier.fit(X_train, y_train)
        pass

    def predict(self, X_test):
        y_pred = self.random_forest_classifier.predict(X_test)
        return y_pred
   
    def score(self, df, X_test, y_test):
        self.fit_rule_based_scorer(df)
        y_pred = self.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred) * 100
        print(f"Accuracy Random forest: {accuracy:.2f}%")
        self.custom_scorer(df,self.custom_classifier(df, y_pred, X_test), X_test)
        

    def custom_classifier(self,df,y_predicted, X_test):
            
            # Extract the indices of X_test
            test_indices = X_test.index

            # Filter df using the indices of X_test and select all features
            rule_base_predictions = df.loc[test_indices, 'predicted_diagnosis']
        
            random_forest_predictions = y_predicted



            predictions_combined = []

            for pred_rule_based,pred_rf in zip(rule_base_predictions,random_forest_predictions):
                if pred_rf == 1:
                    predictions_combined.append(pred_rf)
                else:
                    predictions_combined.append(pred_rule_based)
            
            return predictions_combined


    def fit_rule_based_classifier(self, df):
        features = df[df.malignant == 0].drop(columns=["id", "malignant"])

       # Find the maximum value for each feature
        max_values = features.max()
        
        # Store the maximum value for each feature in a dictionary
        thresholds = max_values.to_dict()
        

        # Define a function to classify based on thresholds
        def classify(row):
            for feature, threshold in thresholds.items():
                if row[feature] > threshold:
                    return 1  # malignant
            return 0  # benign
        
        df["predicted_diagnosis"] = df.apply(classify, axis=1)

              

    def fit_rule_based_scorer(self, X_test):
        correct_predictions = (X_test['malignant'] == X_test['predicted_diagnosis']).sum()
        total_predictions = len(X_test)
        accuracy_percentage = (correct_predictions / total_predictions) * 100

        print(f"Accuracy Rule based: {accuracy_percentage:.2f}%")

    def custom_scorer(self, df, predicted, X_test):
        loop = X_test.index
        loopdf = df.loc[loop, 'malignant']
        correct_predictions = (loopdf == predicted).sum()
        total_predictions = len(X_test)
        accuracy_percentage = (correct_predictions / total_predictions) * 100

        print(f"Accuracy Rule based: {accuracy_percentage:.2f}%")      

   
        

In [129]:
# Loading data
df = pd.read_pickle('dataset/wdbc.pkl')
X = df.drop(columns=["id", "malignant"])
y = df.malignant
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  


# Training and scoring
classifier = BreastCancerClassifier()
classifier.fit(df, X_train, y_train)
classifier.score(df, X_test, y_test)

Accuracy Rule based: 93.32%
Accuracy Random forest: 96.49%
Accuracy Rule based: 96.49%


In [130]:


# Assuming classifier is an instance of your BreastCancerClassifier class

for i, tree in enumerate(classifier.random_forest_classifier.estimators_[:1]):
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=10, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    # Display or save the tree visualization as needed
    graph.view()


In [131]:
""""
fig, axs = plt.subplots(10, 3, figsize=(12, 12))
axs = axs.flatten()

# Limit the number of features to the length of axs array
num_features = min(len(features.columns), len(axs))

for i, feature in enumerate(features.columns[:num_features]):
    axs[i].hist(benign[feature], bins=20, alpha=0.5, color='blue', label='Benign')
    axs[i].hist(malignant[feature], bins=20, alpha=0.5, color='red', label='Malignant')
    axs[i].set_title(feature)
    axs[i].legend()

plt.tight_layout()
plt.show()
"""

'"\nfig, axs = plt.subplots(10, 3, figsize=(12, 12))\naxs = axs.flatten()\n\n# Limit the number of features to the length of axs array\nnum_features = min(len(features.columns), len(axs))\n\nfor i, feature in enumerate(features.columns[:num_features]):\n    axs[i].hist(benign[feature], bins=20, alpha=0.5, color=\'blue\', label=\'Benign\')\n    axs[i].hist(malignant[feature], bins=20, alpha=0.5, color=\'red\', label=\'Malignant\')\n    axs[i].set_title(feature)\n    axs[i].legend()\n\nplt.tight_layout()\nplt.show()\n'

In [132]:
#print(df['index'] == 70)
df.loc[70]

id                     859575.000000
malignant                   1.000000
radius_0                   18.940000
texture_0                  21.310000
perimeter_0               123.600000
area_0                   1130.000000
smoothness_0                0.090090
compactness_0               0.102900
concavity_0                 0.108000
concave points_0            0.079510
symmetry_0                  0.158200
fractal dimension_0         0.054610
radius_1                    0.788800
texture_1                   0.797500
perimeter_1                 5.486000
area_1                     96.050000
smoothness_1                0.004444
compactness_1               0.016520
concavity_1                 0.022690
concave points_1            0.013700
symmetry_1                  0.013860
fractal dimension_1         0.001698
radius_2                   24.860000
texture_2                  26.580000
perimeter_2               165.900000
area_2                   1866.000000
smoothness_2                0.119300
c