In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
# Separar as features (X) e a variável alvo (y)
X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create an instance of the StandardScaler
scaler = StandardScaler()

# Create an instance of the SimpleImputer to handle missing values
imputer = SimpleImputer()

# Fit and transform the imputer on the training data
X_train_imputed = imputer.fit_transform(X_train)

# Transform the imputer on the test data
X_test_imputed = imputer.transform(X_test)

# Scale the features using the StandardScaler
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [None]:
# Definir os hiperparâmetros para o GridSearch
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

param_grid_ada = {
    'learning_rate': [0.1, 0.05, 0.01, 0.5],
    'n_estimators': [50, 100, 200]
}

# GridSearchCV para RandomForest
rf_model = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(rf_model, param_grid_rf, cv=5)
rf_grid_search.fit(X_train_scaled, y_train)

# GridSearchCV para AdaBoost
ada_model = AdaBoostClassifier(random_state=42)
ada_grid_search = GridSearchCV(ada_model, param_grid_ada, cv=5)
ada_grid_search.fit(X_train_scaled, y_train)

# Avaliar o desempenho dos modelos
rf_predictions = rf_grid_search.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

ada_predictions = ada_grid_search.predict(X_test_scaled)
ada_accuracy = accuracy_score(y_test, ada_predictions)
print("AdaBoost Accuracy:", ada_accuracy)

Random Forest Accuracy: 0.9649122807017544
AdaBoost Accuracy: 0.9649122807017544


In [None]:
# Retrain the best Random Forest model with the best parameters using the entire training dataset
best_rf_model = RandomForestClassifier(n_estimators=rf_grid_search.best_params_['n_estimators'],
                                       max_depth=rf_grid_search.best_params_['max_depth'],
                                       random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Obtain predictions from the best Random Forest model
rf_predictions = best_rf_model.predict(X_test_scaled)

# Calculate the metrics for the best Random Forest model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions, average=None)
rf_precision = precision_score(y_test, rf_predictions, average=None)
rf_f1 = f1_score(y_test, rf_predictions, average=None)

# Print the results for the best Random Forest model
print("Random Forest - Accuracy:", rf_accuracy)
print("Random Forest - Recall:", rf_recall)
print("Random Forest - Precision:", rf_precision)
print("Random Forest - F1-score:", rf_f1)

# Retrain the best AdaBoost model with the best parameters using the entire training dataset
best_ada_model = AdaBoostClassifier(learning_rate=ada_grid_search.best_params_['learning_rate'],
                                    n_estimators=ada_grid_search.best_params_['n_estimators'],
                                    random_state=42)
best_ada_model.fit(X_train_scaled, y_train)

# Obtain predictions from the best AdaBoost model
ada_predictions = best_ada_model.predict(X_test_scaled)

# Calculate the metrics for the best AdaBoost model
ada_accuracy = accuracy_score(y_test, ada_predictions)
ada_recall = recall_score(y_test, ada_predictions, average=None)
ada_precision = precision_score(y_test, ada_predictions, average=None)
ada_f1 = f1_score(y_test, ada_predictions, average=None)

# Print the results for the best AdaBoost model
print("AdaBoost - Accuracy:", ada_accuracy)
print("AdaBoost - Recall:", ada_recall)
print("AdaBoost - Precision:", ada_precision)
print("AdaBoost - F1-score:", ada_f1)


Random Forest - Accuracy: 0.9649122807017544
Random Forest - Recall: [0.98591549 0.93023256]
Random Forest - Precision: [0.95890411 0.97560976]
Random Forest - F1-score: [0.97222222 0.95238095]
AdaBoost - Accuracy: 0.9649122807017544
AdaBoost - Recall: [0.97183099 0.95348837]
AdaBoost - Precision: [0.97183099 0.95348837]
AdaBoost - F1-score: [0.97183099 0.95348837]
