# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_curve, roc_auc_score
from xgboost import XGBClassifier
try:
    import xgboost
except ImportError:
    !pip install xgboost

In [2]:
# Load the data set from data.csv (the path is relative to the current working directory)
dataset = pd.read_csv('data.csv')

In [3]:
# Preview the first rows to sanity-check the load
dataset.head()

In [4]:
# Column dtypes and non-null counts
dataset.info()

In [5]:
# Drop the id column and the empty 'Unnamed: 32' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for columns that are
# already gone.
dataset = dataset.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [6]:
# Preview the frame after removing the id and empty columns
dataset.head()

In [7]:
# Summary statistics of the numeric columns
dataset.describe()

In [8]:
# Turn the target variable into 1s and 0s: "M" -> 1, any other value -> 0.
# The dtype guard makes the cell idempotent. Without it, re-running the cell would
# compare the already-encoded 1/0 values against "M" and silently set every label to 0.
if not pd.api.types.is_numeric_dtype(dataset['diagnosis']):
    dataset['diagnosis'] = (dataset['diagnosis'] == 'M').astype(int)

In [9]:
# Check that the diagnosis column now holds 1/0 values
dataset.head()

In [10]:
# Keep a handle on the target column
diagnosis_column = dataset['diagnosis']

# Rebuild the frame with every non-target column first and 'diagnosis' as the
# last column, so the features / target can later be sliced by position.
feature_columns = [name for name in dataset.columns if name != 'diagnosis']
dataset = dataset[feature_columns].assign(diagnosis=diagnosis_column)

In [11]:
# Confirm that 'diagnosis' is now the last column
dataset.head()

# Get the shape of the dataframe

In [12]:
# (number of rows, number of columns) of the prepared frame
dataset.shape

In [13]:
# Positional split: every column but the last is a feature, the last one is the target
features_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

# Downstream code works on plain NumPy arrays
X = features_frame.to_numpy()
y = target_series.to_numpy()

# Splitting the dataset into the training set and the test set

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
# train_test_split is already imported in the notebook's import cell, so the duplicate
# mid-notebook import was dropped.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling

In [15]:
# Standardise the features. The scaler learns its statistics from the training split
# only and is then applied unchanged to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [16]:
# Untuned XGBoost estimator. It is not fitted in this cell; it is passed to
# GridSearchCV as the base estimator for the hyper-parameter search.
classifier = XGBClassifier()

In [17]:
# Hyper-parameter candidates for XGBoost (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 4, 5],
    n_estimators=[100, 200, 300],
)

# Exhaustive search over the grid with 5-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1)

In [18]:
# Fit GridSearchCV to the training data only (27 parameter combinations x 5 folds);
# the test split is never seen during the search.
grid_search.fit(X_train, y_train)

In [19]:
# Parameter combination that scored best in the grid search's cross-validation
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [20]:
# Build the final XGBoost model from the winning hyper-parameters and train it on the
# training split. The trailing bare name keeps the estimator repr as the cell output.
classifier = XGBClassifier(**best_params).fit(X_train, y_train)
classifier

In [21]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score, f1_score

# Scoring metrics evaluated on every fold. The dict keys become the
# 'test_<name>' entries of the cross_validate result.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# One 10-fold cross-validation pass that scores all four metrics on the same fitted
# models. The previous version ran four separate passes (40 fits instead of 10) and
# never used the scoring dict.
cv_results = cross_validate(estimator=classifier, X=X_train, y=y_train, cv=10, scoring=scoring)

accuracy_scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision']
recall_scores = cv_results['test_recall']
f1_scores = cv_results['test_f1']

# Print the fold-averaged results as percentages
print("Accuracy: {:.2f} %".format(accuracy_scores.mean() * 100))
print("Precision: {:.2f} %".format(precision_scores.mean() * 100))
print("Recall: {:.2f} %".format(recall_scores.mean() * 100))
print("F1 Score: {:.2f} %".format(f1_scores.mean() * 100))

In [22]:
# Predict on the held-out split and print predictions next to the true labels
# (left column: predicted, right column: actual)
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [23]:
# Test-set metrics for the tuned XGBoost model, reported as percentages.
# plot_confusion_matrix is no longer imported: it was removed in scikit-learn 1.2, so the
# old import line raised ImportError and stopped this cell from running. It was not used
# here. The metric functions this cell calls are imported explicitly.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred) * 100
print("Accuracy: {:.2f}%".format(accuracy))
print("Precision: {:.2f}%".format(precision))
print("Recall: {:.2f}%".format(recall))
print("F1 Score: {:.2f}%".format(f1))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Draw the test-set confusion matrix directly from the true and predicted labels,
# using matplotlib's Blues colour map
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    cmap=plt.cm.Blues,
)

In [None]:
# Text report of the per-class metrics on the test split
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:", class_report, sep="\n")

In [None]:
# Precision-Recall Curve
# The curve is traced by sweeping a decision threshold over continuous scores, so it needs
# the predicted probability of the positive class (column 1 of predict_proba). Feeding it
# the hard 0/1 predictions collapses the curve to a single operating point.
y_proba = classifier.predict_proba(X_test)[:, 1]

# Separate names so the scalar precision / recall test metrics are not overwritten
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_proba)
plt.plot(pr_recall, pr_precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# ROC Curve
# Built from the predicted probability of the positive class. With hard 0/1 predictions
# there is only one threshold, so the "curve" degenerates to two straight segments.
y_proba = classifier.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
# Area under ROC Curve
# Computed from the predicted probability of the positive class. Scoring the hard 0/1
# predictions measures a single operating point, not the model's ranking quality.
y_proba = classifier.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_proba)
print("Area under ROC Curve:", auc_score)

In [None]:
# Feature importances of the fitted XGBoost model, ordered from largest to smallest
feature_importance = classifier.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]

# Feature names are all columns except the last one (the target)
feature_names = dataset.columns[:-1]

# One bar per feature, labelled with the feature name in the same sorted order
bar_positions = list(range(X.shape[1]))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, feature_importance[sorted_idx], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names[sorted_idx], rotation=45, ha='right', fontsize=8)
ax.set_xlabel('Feature')
ax.set_ylabel('Feature Importance')
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()


# ii) Support Vector Classification - Zaharah

# iii) Random Forest Classification - Emma

In [24]:
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the grid search. The fixed random_state makes the forest's random
# sampling repeatable, so re-running the notebook on the same data reproduces the search
# results (the seed matches the one used for the train/test split).
rf_classifier = RandomForestClassifier(random_state=0)


In [25]:
# Hyper-parameter candidates for the random forest (3 x 3 x 3 x 3 = 81 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation on all CPU cores
rf_grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)

# Run the search on the training split only
rf_grid_search.fit(X_train, y_train)


In [26]:
# Parameter combination that scored best in the random-forest grid search
rf_best_params = rf_grid_search.best_params_
print("Best Parameters for Random Forest Classifier:", rf_best_params)


In [None]:
from sklearn.model_selection import cross_validate

# Final random forest: the best grid-search parameters plus a fixed seed, so the fitted
# model and every number printed below are repeatable across runs.
rf_classifier = RandomForestClassifier(random_state=0, **rf_best_params)
rf_classifier.fit(X_train, y_train)

# One 10-fold cross-validation pass scoring all four metrics on the same fitted models.
# Four separate cross_val_score calls on an unseeded forest trained a different set of
# random forests per metric, so the reported numbers did not describe the same models.
rf_cv_results = cross_validate(
    estimator=rf_classifier,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
rf_accuracy_scores = rf_cv_results['test_accuracy']
rf_precision_scores = rf_cv_results['test_precision']
rf_recall_scores = rf_cv_results['test_recall']
rf_f1_scores = rf_cv_results['test_f1']

# Print the fold-averaged results as percentages
print("Random Forest Classifier - Cross Validation Results:")
print("Accuracy: {:.2f} %".format(rf_accuracy_scores.mean() * 100))
print("Precision: {:.2f} %".format(rf_precision_scores.mean() * 100))
print("Recall: {:.2f} %".format(rf_recall_scores.mean() * 100))
print("F1 Score: {:.2f} %".format(rf_f1_scores.mean() * 100))


In [None]:
# Predictions and evaluation
# Hard class predictions of the fitted random forest on the held-out test split
rf_y_pred = rf_classifier.predict(X_test)


In [None]:
# Confusion matrix of the random forest on the test split (rows: true class, columns: predicted class)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_y_pred)
print("\nConfusion Matrix:", rf_conf_matrix, sep="\n")


In [None]:
# Text report of the random forest's per-class metrics on the test split
rf_class_report = classification_report(y_true=y_test, y_pred=rf_y_pred)
print("\nClassification Report:", rf_class_report, sep="\n")


In [None]:
# Precision-Recall Curve
# The curve is traced by sweeping a threshold over continuous scores, so use the predicted
# probability of the positive class (column 1 of predict_proba). Hard 0/1 predictions
# collapse the curve to a single operating point.
rf_y_proba = rf_classifier.predict_proba(X_test)[:, 1]

rf_precision, rf_recall, _ = precision_recall_curve(y_test, rf_y_proba)
plt.plot(rf_recall, rf_precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Random Forest Precision-Recall Curve')
plt.show()


In [None]:
# ROC Curve
# Built from the predicted probability of the positive class. With hard 0/1 predictions
# there is only one threshold, so the "curve" degenerates to two straight segments.
rf_y_proba = rf_classifier.predict_proba(X_test)[:, 1]

rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_y_proba)
plt.plot(rf_fpr, rf_tpr, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve')
plt.show()


In [None]:
# Area under ROC Curve
# Computed from the predicted probability of the positive class. Scoring the hard 0/1
# predictions measures a single operating point, not the model's ranking quality.
rf_y_proba = rf_classifier.predict_proba(X_test)[:, 1]
rf_auc_score = roc_auc_score(y_test, rf_y_proba)
print("Area under ROC Curve for Random Forest Classifier:", rf_auc_score)


In [None]:
# Feature importances of the fitted random forest, ordered from largest to smallest
rf_feature_importance = rf_classifier.feature_importances_
rf_sorted_idx = np.argsort(rf_feature_importance)[::-1]

# One bar per feature; labels come from feature_names, which the XGBoost
# feature-importance cell defines
rf_bar_positions = list(range(X.shape[1]))
rf_fig, rf_ax = plt.subplots(figsize=(10, 6))
rf_ax.bar(rf_bar_positions, rf_feature_importance[rf_sorted_idx], align="center")
rf_ax.set_xticks(rf_bar_positions)
rf_ax.set_xticklabels(feature_names[rf_sorted_idx], rotation=45, ha='right', fontsize=8)
rf_ax.set_xlabel('Feature')
rf_ax.set_ylabel('Feature Importance')
rf_ax.set_title('Random Forest Feature Importance')
rf_fig.tight_layout()
plt.show()


# iv) Neural Network Classification - Saul

# v) Logistic Regression - Prossy