In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
# Read the breast-cancer-wisconsin `wdbc.data` file straight from the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# The file is read with header=None, so column names are supplied explicitly:
# an identifier, the diagnosis label, then 30 generically named feature columns.
columns = ['ID', 'Diagnosis', *(f'feature_{i}' for i in range(1, 31))]

data = pd.read_csv(url, names=columns, header=None)
data.head()

Unnamed: 0,ID,Diagnosis,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Preprocess the dataset: the features are every column except the identifier and the label.
X = data.drop(columns=['ID', 'Diagnosis'])

# Encode the label for binary classification: 'M' -> 1 and 'B' -> 0.
y = data['Diagnosis'].map({'M': 1, 'B': 0})

# Series.map turns any label that is missing from the dict into NaN. Fail loudly here
# instead of letting NaN targets flow into the split and the model.
if y.isna().any():
    unexpected = sorted(data.loc[y.isna(), 'Diagnosis'].astype(str).unique())
    raise ValueError(f"Unexpected Diagnosis values (expected 'M' or 'B'): {unexpected}")

# Split the data: hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Display the partition shapes as ((X_train, y_train), (X_test, y_test)).
(
    (X_train.shape, y_train.shape),
    (X_test.shape, y_test.shape),
)

(((455, 30), (455,)), ((114, 30), (114,)))

In [5]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression classifier (default settings) on the scaled training data.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Persist both fitted objects so predictions can be made later without retraining.
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [10]:
import pandas as pd
import joblib
import numpy as np

# Function for making predictions
def predict_new_data(features, feature_names, model_path='model.pkl', scaler_path='scaler.pkl'):
    """Classify a single sample using the persisted scaler and model.

    Parameters
    ----------
    features : sequence
        Feature values for one sample, in the same order as ``feature_names``.
    feature_names : sequence of str
        Column names used to build the one-row DataFrame handed to the scaler.
    model_path : str, optional
        File holding the joblib-serialised classifier. Defaults to ``'model.pkl'``.
    scaler_path : str, optional
        File holding the joblib-serialised scaler. Defaults to ``'scaler.pkl'``.

    Returns
    -------
    tuple
        ``(label, probability)``. ``label`` is ``'Positive'`` when the model
        predicts class 1 and ``'Negative'`` otherwise; ``probability`` is the
        value in column 1 of ``predict_proba`` for the sample.

    Raises
    ------
    ValueError
        If ``features`` and ``feature_names`` differ in length.
    """
    # Fail fast with a clear message instead of a pandas shape error further down.
    if len(features) != len(feature_names):
        raise ValueError(
            f"Got {len(features)} feature values but {len(feature_names)} feature names"
        )

    # Load the model and scaler.
    # NOTE: joblib.load unpickles its input, which can execute arbitrary code;
    # only point these paths at files you trust.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    # Convert features to a one-row DataFrame with proper column names
    features_df = pd.DataFrame([features], columns=feature_names)

    # Scale the input features with the persisted scaler
    features_scaled = scaler.transform(features_df)

    # Hard class prediction plus the probability reported for class 1
    prediction = model.predict(features_scaled)
    probability = model.predict_proba(features_scaled)[0, 1]
    return ('Positive' if prediction[0] == 1 else 'Negative', probability)

# Smoke-test the prediction function on the first row of the held-out test set
feature_names = list(X_test.columns)  # retrieve the column names
sample_features = X_test.iloc[0].tolist()
prediction, probability = predict_new_data(sample_features, feature_names)
print(f"Prediction: {prediction}, Probability: {probability:.2f}")


Prediction: Negative, Probability: 0.11
