In [None]:
#%pip install -q TPOT scikit-learn pandas pycaret pycaret[models]

In [64]:
# Step 1: Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier

# Step 2: Fetch the Breast Cancer dataset as a feature matrix and a label vector
data = load_breast_cancer()
X = data.data
y = data.target

# Step 3: Preprocessing
# Hold out 20% of the samples for the final evaluation (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the per-feature mean and variance from the training split only, then
# apply that same scaling to both splits so no test-set statistics are used
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: AutoML with TPOT
# Configure the pipeline search: 5 generations of 20 candidates each
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
)

# Run the pipeline search on the scaled training split
tpot.fit(X_train_scaled, y_train)

# Step 5: Score the best pipeline found on the held-out split
accuracy = tpot.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9736263736263737

Generation 2 - Current best internal CV score: 0.9736263736263737

Generation 3 - Current best internal CV score: 0.9758241758241759

Generation 4 - Current best internal CV score: 0.9780219780219781

Generation 5 - Current best internal CV score: 0.9802197802197803

Best pipeline: MLPClassifier(RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.5, min_samples_leaf=14, min_samples_split=8, n_estimators=100), alpha=0.001, learning_rate_init=0.001)
Accuracy: 0.9649122807017544


In [65]:
# Spot-check the fitted pipeline: predict the whole test split, then show the
# first predicted label next to the first true label
prediction = tpot.predict(X_test_scaled)
actual = y_test[0]
print(f"Prediction: {prediction[0]}", f"Actual: {actual}", sep="\n")

Prediction: 1
Actual: 1


In [66]:
import pandas as pd
# Import only the PyCaret functions this cell uses instead of `import *`,
# so it is clear where each name comes from.
from pycaret.classification import (
    compare_models,
    evaluate_model,
    finalize_model,
    save_model,
    setup,
)
from sklearn.datasets import load_iris

# Load the Iris dataset
data = load_iris()
X, y = data.data, data.target

# Combine the features and the label into a single pandas DataFrame for PyCaret
df = pd.DataFrame(data=X, columns=data.feature_names)
df['target'] = y

# Create a PyCaret experiment.
# - The label column is named explicitly rather than left for setup() to infer.
# - session_id pins the random seed so the train/test split and CV folds are
#   repeatable; 6387 is the "Session id" reported in this cell's recorded
#   output, where PyCaret had drawn it at random.
exp1 = setup(data=df, target='target', session_id=6387)

# Cross-validate the candidate estimators and keep the best-scoring one
# (this compares models as configured; it does not tune hyperparameters)
best_model = compare_models()

# Open the interactive diagnostics widget for the best model
evaluate_model(best_model)

# Refit the best model on the entire dataset, hold-out split included
final_model = finalize_model(best_model)

# Save the pipeline and model; the returned value is this cell's displayed output
save_model(final_model, 'iris_classification_model')


Unnamed: 0,Description,Value
0,Session id,6387
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(150, 5)"
4,Transformed data shape,"(150, 5)"
5,Transformed train set shape,"(105, 5)"
6,Transformed test set shape,"(45, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.9727,0.9795,0.9727,0.9745,0.9723,0.9586,0.9597,0.008
lda,Linear Discriminant Analysis,0.9727,1.0,0.9727,0.9818,0.9716,0.9592,0.9642,0.008
qda,Quadratic Discriminant Analysis,0.9718,1.0,0.9718,0.9784,0.9714,0.9576,0.9611,0.008
knn,K Neighbors Classifier,0.9618,0.988,0.9618,0.97,0.9606,0.9418,0.9468,0.246
lr,Logistic Regression,0.9536,0.9976,0.9536,0.9597,0.9528,0.9296,0.9332,0.497
rf,Random Forest Classifier,0.9536,0.9951,0.9536,0.9597,0.9528,0.9296,0.9332,0.042
lightgbm,Light Gradient Boosting Machine,0.9536,0.9857,0.9536,0.967,0.9521,0.9304,0.9377,0.057
gbc,Gradient Boosting Classifier,0.9527,0.9784,0.9527,0.959,0.9519,0.9283,0.9321,0.054
et,Extra Trees Classifier,0.9436,0.9951,0.9436,0.9522,0.9428,0.9147,0.9196,0.036
nb,Naive Bayes,0.9345,0.992,0.9345,0.9481,0.9329,0.9014,0.909,0.008


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['sepal length (cm)',
                                              'sepal width (cm)',
                                              'petal length (cm)',
                                              'petal width (cm)'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWr...
                                 

In [67]:
from pycaret.classification import load_model, predict_model
from sklearn.metrics import accuracy_score

# Restore the pipeline persisted under the name 'iris_classification_model'
model = load_model('iris_classification_model')

# Three hand-written samples, one record per row; 'actual_label' holds the
# label assumed to be correct for that row
new_data = pd.DataFrame([
    {
        'sepal length (cm)': 5.1,
        'sepal width (cm)': 3.5,
        'petal length (cm)': 1.4,
        'petal width (cm)': 0.2,
        'actual_label': 0,
    },
    {
        'sepal length (cm)': 6.2,
        'sepal width (cm)': 2.9,
        'petal length (cm)': 4.5,
        'petal width (cm)': 1.4,
        'actual_label': 1,
    },
    {
        'sepal length (cm)': 7.3,
        'sepal width (cm)': 3.3,
        'petal length (cm)': 6.3,
        'petal width (cm)': 2.4,
        'actual_label': 2,
    },
])

# Run the loaded pipeline on the samples and show the resulting frame
predictions = predict_model(model, data=new_data)
print(predictions)

# Fraction of rows whose predicted label matches the assumed true label
accuracy = accuracy_score(
    y_true=new_data['actual_label'],
    y_pred=predictions['prediction_label'],
)

print("Accuracy:", accuracy)

Transformation Pipeline and Model Successfully Loaded


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                6.2               2.9                4.5               1.4   
2                7.3               3.3                6.3               2.4   

   actual_label  prediction_label  prediction_score  
0             0                 0               1.0  
1             1                 1               1.0  
2             2                 2               1.0  
Accuracy: 1.0
