In [343]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import mlflow

In [344]:
data = load_breast_cancer()
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("cancer dataset check")

2024/07/02 15:35:08 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/07/02 15:35:08 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/workspaces/zoom_camp_Project/mlruns/1', creation_time=1719934510976, experiment_id='1', last_update_time=1719934510976, lifecycle_stage='active', name='cancer dataset check', tags={}>

In [345]:
df = pd.DataFrame(data.data, columns=data.feature_names)
df = pd.concat([df, df], ignore_index=True)

In [193]:
removed_col = ['target','mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension']

## ----------------------------------
## Clean Data

In [203]:
# Convert to a pandas DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Features and target
X = df.drop(columns=  removed_col )
Y = df['target']

## Spliting Data To Train and  Test 

In [305]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X , Y, random_state = 412 , test_size=0.5)

# Training the model

In [335]:
model = LogisticRegression(solver = "saga", penalty = "l1")


In [336]:
model.fit(X_train, Y_train)



# Predict the result

In [337]:
predicted_result = model.predict(X_test)

In [338]:
accuracy_score(predicted_result, Y_test)

0.8807017543859649

## Using XG boost

In [346]:
space = {
    'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'saga'])
}




def objective(params):
    with mlflow.start_run():
        mlflow.log_params(params)
        model = LogisticRegression(
            C=params['C'],
            penalty=params['penalty'],
            max_iter=10000,
            solver=params['solver'],
            random_state=42
        )
        
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        accuracy = accuracy_score(Y_test, Y_pred)
        print(accuracy)
        
        return {'loss': 1 - accuracy, 'status': STATUS_OK}

In [340]:
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

print("Best Hyperparameters:")
print(best_params)

0.8912280701754386                                                                                                                                                   
0.8842105263157894                                                                                                                                                   
0.8912280701754386                                                                                                                                                   
0.9263157894736842                                                                                                                                                   
0.8736842105263158                                                                                                                                                   
0.9263157894736842                                                                                                                                                   
0.92

In [341]:
penalty_map = {0: 'l1', 1: 'l2'}
solver_map = {0: 'liblinear', 1: 'saga'}
best_penalty = penalty_map[best_params['penalty']]
best_solver = solver_map[best_params['solver']]

print(f"Best Penalty: {best_solver}")
best_solver

Best Penalty: liblinear


'liblinear'

In [342]:
best_model = LogisticRegression(
    C=111111,
    penalty=['l1', 'l2'][best_params['penalty']],
    max_iter=100000,
    solver='liblinear'
)
best_model.fit(X_train, Y_train)

# Evaluate the best model
Y_pred = best_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
final_loss = 1 - accuracy
print(f"Final Model Accuracy: {accuracy}")
print(f"Final Loss: {final_loss}")

Final Model Accuracy: 0.9298245614035088
Final Loss: 0.07017543859649122
