In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
names = ["id_number", "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean", 
         "smoothness_mean", "compactness_mean", "concavity_mean", "concave_points_mean", 
         "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", 
         "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave_points_se", 
         "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst", 
         "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst", 
         "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"]
data = pd.read_csv(url, names=names)

# Preprocessing
# Drop id_number column as it's not relevant for prediction
data.drop("id_number", axis=1, inplace=True)

# Encode diagnosis (M: Malignant, B: Benign) to numerical values (0: Benign, 1: Malignant)
data["diagnosis"] = data["diagnosis"].map({"M": 1, "B": 0})

# Separate features (X) and target (y)
X = data.drop("diagnosis", axis=1)
y = data["diagnosis"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_train = svm_model.predict(X_train_scaled)
y_pred_test = svm_model.predict(X_test_scaled)

# Model evaluation
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))


Training Accuracy: 0.9868131868131869
Testing Accuracy: 0.956140350877193

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96        71
           1       0.93      0.95      0.94        43

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def get_model(data, model_name, model_params):
    """
    Train a machine learning model based on the specified model name and its parameters.

    Parameters:
        data (DataFrame): Input data for training the model.
        model_name (str): Name of the machine learning model.
        model_params (dict): Dictionary containing parameters for the model.

    Returns:
        model: Trained machine learning model object.
    """

    # Separate features and target
    X = data.drop("diagnosis", axis=1)
    y = data["diagnosis"]

    # Feature selection (if needed)
    if "feature_selection" in model_params:
        num_features = model_params.pop("feature_selection")  # Remove 'feature_selection' from model_params
        selector = SelectKBest(f_classif, k=num_features)
        X = selector.fit_transform(X, y)

    # Preprocessing: Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize model
    if model_name == "SVM":
        model = SVC(**model_params)
    elif model_name == "RandomForest":
        model = RandomForestClassifier(**model_params)
    elif model_name == "LogisticRegression":
        model = LogisticRegression(**model_params)
    else:
        raise ValueError("Invalid model name. Please choose from 'SVM', 'RandomForest', or 'LogisticRegression'.")

    # Train the model
    model.fit(X_scaled, y)

    return model

# Example usage:
# Define model parameters
model_params = {
    "kernel": "linear",
    "C": 1.0,
    "gamma": "scale"
    # Remove 'feature_selection' key as it's not supported by SVC
}

# Call the function to get the model object
trained_model = get_model(data, "SVM", model_params)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def get_model(data, model_name, model_params):
    """
    Train a machine learning model based on the specified model name and its parameters.

    Parameters:
        data (DataFrame): Input data for training the model.
        model_name (str): Name of the machine learning model.
        model_params (dict): Dictionary containing parameters for the model.

    Returns:
        model: Trained machine learning model object.
    """

    # Separate features and target
    X = data.drop("diagnosis", axis=1)
    y = data["diagnosis"]

    # Feature selection (if needed)
    if "feature_selection" in model_params:
        num_features = model_params.pop("feature_selection")  # Remove 'feature_selection' from model_params
        selector = SelectKBest(f_classif, k=num_features)
        X = selector.fit_transform(X, y)

    print("Shape of X after feature selection:", X.shape)

    # Preprocessing: Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print("Shape of X_scaled after feature scaling:", X_scaled.shape)

    # Initialize model
    if model_name == "SVM":
        model = SVC(**model_params)
    elif model_name == "RandomForest":
        model = RandomForestClassifier(**model_params)
    elif model_name == "LogisticRegression":
        model = LogisticRegression(**model_params)
    else:
        raise ValueError("Invalid model name. Please choose from 'SVM', 'RandomForest', or 'LogisticRegression'.")

    # Train the model
    model.fit(X_scaled, y)

    return model

In [16]:
# Example usage:
# Define model parameters
model_params = {
    "kernel": "linear",
    "C": 1.0,
    "gamma": "scale"
    # Remove 'feature_selection' key as it's not supported by SVC
}

# Call the function to get the model object
trained_model = get_model(data, "SVM", model_params)

print(trained_model)

Shape of X after feature selection: (569, 30)
Shape of X_scaled after feature scaling: (569, 30)
SVC(kernel='linear')


In [17]:
from sklearn.model_selection import cross_val_score

# Define data and target again if needed
X = data.drop("diagnosis", axis=1)
y = data["diagnosis"]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform cross-validation
cv_scores = cross_val_score(trained_model, X_scaled, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())


Cross-validation scores: [0.95614035 0.98245614 0.96491228 0.96491228 0.98230088]
Mean CV score: 0.9701443875174661
