In [None]:
import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
import joblib


def load_data(file_path):
    """
    Read a CSV file into a DataFrame, reporting progress on stdout

    Args:
        file_path: Location of the CSV file to read

    Returns:
        pandas DataFrame holding the parsed contents of the file

    Raises:
        Exception: Any error raised while reading is reported on stdout
            and then re-raised unchanged
    """
    print(f"Loading data from {file_path}...")
    try:
        frame = pd.read_csv(file_path)
        print(f"Dataset shape: {frame.shape}")
    except Exception as exc:
        # Tell the user what went wrong, then let the caller decide what to do.
        print(f"Error loading dataset: {exc}")
        raise
    return frame


def identify_column_types(df, target_col='target'):
    """
    Split the feature columns of a DataFrame into numeric and categorical groups

    Args:
        df: pandas DataFrame holding the features (and usually the target)
        target_col: Name of the label column to leave out of the features;
            defaults to 'target', the name that used to be hard-coded

    Returns:
        tuple of (numeric_columns, categorical_columns), each a list of
        column names in their original DataFrame order
    """
    feature_cols = [col for col in df.columns if col != target_col]
    features = df[feature_cols]

    numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric features: {numeric_cols}")
    print(f"Categorical features: {categorical_cols}")

    # Feature columns whose dtype matches neither filter (e.g. bool, datetime)
    # end up in no group; report them instead of letting them vanish silently.
    assigned = set(numeric_cols) | set(categorical_cols)
    ignored = [col for col in feature_cols if col not in assigned]
    if ignored:
        print(f"Warning: columns with unsupported dtypes are ignored: {ignored}")

    return numeric_cols, categorical_cols


def create_preprocessing_pipeline(numeric_cols, categorical_cols):
    """
    Assemble the column-wise preprocessing used ahead of the classifier

    Args:
        numeric_cols: Names of the columns to treat as numeric
        categorical_cols: Names of the columns to treat as categorical

    Returns:
        Unfitted ColumnTransformer with a 'num' and a 'cat' branch
    """
    # Numeric branch: fill gaps with the median, then standardise.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    # Categories not seen during fit are ignored rather than raising.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    branches = [
        ('num', Pipeline(steps=numeric_steps), numeric_cols),
        ('cat', Pipeline(steps=categorical_steps), categorical_cols),
    ]

    # Columns that appear in neither list are discarded.
    return ColumnTransformer(transformers=branches, remainder='drop')


def build_and_train_model(X_train, y_train, preprocessor):
    """
    Tune and fit a logistic-regression pipeline with a serial grid search

    Args:
        X_train: Training features
        y_train: Training target
        preprocessor: Unfitted preprocessing transformer placed before the
            classifier

    Returns:
        The fitted GridSearchCV object wrapping the pipeline
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ])

    # 6 C values x 2 solvers x 2 penalties = 24 candidate settings.
    search_space = {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'saga'],
        'classifier__penalty': ['l1', 'l2'],
    }

    # n_jobs is left at its default, so the search is not parallelised here.
    search = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        cv=5,
        scoring='f1',
        verbose=1,
    )

    print("\nTraining model with GridSearchCV...")
    search.fit(X_train, y_train)

    print(f"Best parameters: {search.best_params_}")
    print(f"Best cross-validation score: {search.best_score_:.4f}")

    return search


def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data and print a summary

    Args:
        model: Fitted estimator exposing predict()
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys 'accuracy', 'confusion_matrix' and 'f1_score'
    """
    print("\nEvaluating model on test data...")

    predictions = model.predict(X_test)

    # Collect the metrics first so the report below and the return value
    # are guaranteed to show the same numbers.
    results = {
        'accuracy': accuracy_score(y_test, predictions),
        'confusion_matrix': confusion_matrix(y_test, predictions),
        'f1_score': f1_score(y_test, predictions),
    }

    print(f"\nAccuracy: {results['accuracy']:.4f}")
    print(f"F1 Score: {results['f1_score']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return results


def main(data_path='data.csv'):
    """
    Run the serial workflow end to end: load, split, tune, evaluate

    Args:
        data_path: Path to the CSV file; it must contain a 'target' column

    Returns:
        Dictionary of evaluation metrics produced by evaluate_model
    """
    started = time.time()

    data = load_data(data_path)

    # The label column is expected to be called 'target'.
    label_name = 'target'
    X = data.drop(columns=[label_name])
    y = data[label_name]

    numeric_cols, categorical_cols = identify_column_types(data)

    # Stratified 80/20 split so both parts keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

    preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)
    model = build_and_train_model(X_train, y_train, preprocessor)
    metrics = evaluate_model(model, X_test, y_test)

    elapsed = time.time() - started
    print(f"\nTotal execution time: {elapsed:.2f} seconds")

    # Optional: persist the fitted search object
    # joblib.dump(model, 'logistic_regression_model.joblib')

    return metrics


if __name__ == "__main__":
    # Run the whole workflow on the project dataset when executed directly;
    # edit the path below to point at a different CSV file.
    main(data_path='pdc_dataset_with_target.csv')

Loading data from pdc_dataset_with_target.csv...
Dataset shape: (41000, 8)
Numeric features: ['feature_1', 'feature_2', 'feature_4', 'feature_6', 'feature_7']
Categorical features: ['feature_3', 'feature_5']
Training set size: (32800, 7)
Test set size: (8200, 7)

Training model with GridSearchCV...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters: {'classifier__C': 0.001, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best cross-validation score: 0.0012

Evaluating model on test data...

Accuracy: 0.6020
F1 Score: 0.0012

Confusion Matrix:
[[4934    2]
 [3262    2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75      4936
           1       0.50      0.00      0.00      3264

    accuracy                           0.60      8200
   macro avg       0.55      0.50      0.38      8200
weighted avg       0.56      0.60      0.45      8200


Total execution time: 134.44 seconds


In [None]:

import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
import joblib
from joblib import parallel_backend


def load_data(file_path):
    """
    Read a CSV file into a DataFrame, reporting progress on stdout

    Args:
        file_path: Location of the CSV file to read

    Returns:
        pandas DataFrame holding the parsed contents of the file

    Raises:
        Exception: Any error raised while reading is reported on stdout
            and then re-raised unchanged
    """
    print(f"Loading data from {file_path}...")
    try:
        frame = pd.read_csv(file_path)
        print(f"Dataset shape: {frame.shape}")
    except Exception as exc:
        # Tell the user what went wrong, then let the caller decide what to do.
        print(f"Error loading dataset: {exc}")
        raise
    return frame


def identify_column_types(df, target_col='target'):
    """
    Split the feature columns of a DataFrame into numeric and categorical groups

    Args:
        df: pandas DataFrame holding the features (and usually the target)
        target_col: Name of the label column to leave out of the features;
            defaults to 'target', the name that used to be hard-coded

    Returns:
        tuple of (numeric_columns, categorical_columns), each a list of
        column names in their original DataFrame order
    """
    feature_cols = [col for col in df.columns if col != target_col]
    features = df[feature_cols]

    numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric features: {numeric_cols}")
    print(f"Categorical features: {categorical_cols}")

    # Feature columns whose dtype matches neither filter (e.g. bool, datetime)
    # end up in no group; report them instead of letting them vanish silently.
    assigned = set(numeric_cols) | set(categorical_cols)
    ignored = [col for col in feature_cols if col not in assigned]
    if ignored:
        print(f"Warning: columns with unsupported dtypes are ignored: {ignored}")

    return numeric_cols, categorical_cols


def create_preprocessing_pipeline(numeric_cols, categorical_cols):
    """
    Assemble the column-wise preprocessing used ahead of the classifier

    Args:
        numeric_cols: Names of the columns to treat as numeric
        categorical_cols: Names of the columns to treat as categorical

    Returns:
        Unfitted ColumnTransformer with a 'num' and a 'cat' branch
    """
    # Numeric branch: fill gaps with the median, then standardise.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    # Categories not seen during fit are ignored rather than raising.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    branches = [
        ('num', Pipeline(steps=numeric_steps), numeric_cols),
        ('cat', Pipeline(steps=categorical_steps), categorical_cols),
    ]

    # Columns that appear in neither list are discarded.
    return ColumnTransformer(transformers=branches, remainder='drop')


def build_and_train_model(X_train, y_train, preprocessor, n_jobs=-1):
    """
    Tune and fit a logistic-regression pipeline with a parallel grid search

    Args:
        X_train: Training features
        y_train: Training target
        preprocessor: Unfitted preprocessing transformer placed before the
            classifier
        n_jobs: Worker count passed to both the joblib backend and
            GridSearchCV (-1 for all available cores)

    Returns:
        The fitted GridSearchCV object wrapping the pipeline
    """
    estimator = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ])

    # 6 C values x 2 solvers x 2 penalties = 24 candidate settings.
    search_space = {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'saga'],
        'classifier__penalty': ['l1', 'l2'],
    }

    search = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        cv=5,
        scoring='f1',
        n_jobs=n_jobs,
        verbose=1,
    )

    # The fits are dispatched through joblib's thread-based backend.
    # NOTE(review): the recorded run took 126.73s here versus 134.44s for the
    # serial variant, so threads may be mostly serialised by the GIL --
    # consider comparing against joblib's default process-based backend.
    with parallel_backend('threading', n_jobs=n_jobs):
        print("\nTraining model with GridSearchCV and parallel processing...")
        search.fit(X_train, y_train)

    print(f"Best parameters: {search.best_params_}")
    print(f"Best cross-validation score: {search.best_score_:.4f}")

    return search


def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data and print a summary

    Args:
        model: Fitted estimator exposing predict()
        X_test: Test features
        y_test: Test target

    Returns:
        Dictionary with the keys 'accuracy', 'confusion_matrix' and 'f1_score'
    """
    print("\nEvaluating model on test data...")

    predictions = model.predict(X_test)

    # Collect the metrics first so the report below and the return value
    # are guaranteed to show the same numbers.
    results = {
        'accuracy': accuracy_score(y_test, predictions),
        'confusion_matrix': confusion_matrix(y_test, predictions),
        'f1_score': f1_score(y_test, predictions),
    }

    print(f"\nAccuracy: {results['accuracy']:.4f}")
    print(f"F1 Score: {results['f1_score']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    return results


def main(data_path='data.csv'):
    """
    Run the parallel workflow end to end: load, split, tune, evaluate

    Args:
        data_path: Path to the CSV file; it must contain a 'target' column

    Returns:
        Dictionary of evaluation metrics produced by evaluate_model
    """
    started = time.time()

    # Use every core joblib reports as available for the grid search.
    n_cores = joblib.cpu_count()
    print(f"Running with {n_cores} CPU cores")

    data = load_data(data_path)

    # The label column is expected to be called 'target'.
    label_name = 'target'
    X = data.drop(columns=[label_name])
    y = data[label_name]

    numeric_cols, categorical_cols = identify_column_types(data)

    # Stratified 80/20 split so both parts keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

    preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)
    model = build_and_train_model(X_train, y_train, preprocessor, n_jobs=n_cores)
    metrics = evaluate_model(model, X_test, y_test)

    elapsed = time.time() - started
    print(f"\nTotal execution time: {elapsed:.2f} seconds")

    # Optional: persist the fitted search object
    # joblib.dump(model, 'logistic_regression_cpu_model.joblib')

    return metrics


if __name__ == "__main__":
    # Run the whole workflow on the project dataset when executed directly;
    # edit the path below to point at a different CSV file.
    main(data_path='pdc_dataset_with_target.csv')

Running with 2 CPU cores
Loading data from pdc_dataset_with_target.csv...
Dataset shape: (41000, 8)
Numeric features: ['feature_1', 'feature_2', 'feature_4', 'feature_6', 'feature_7']
Categorical features: ['feature_3', 'feature_5']
Training set size: (32800, 7)
Test set size: (8200, 7)

Training model with GridSearchCV and parallel processing...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters: {'classifier__C': 0.001, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best cross-validation score: 0.0012

Evaluating model on test data...

Accuracy: 0.6020
F1 Score: 0.0012

Confusion Matrix:
[[4934    2]
 [3262    2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75      4936
           1       0.50      0.00      0.00      3264

    accuracy                           0.60      8200
   macro avg       0.55      0.50      0.38      8200
weighted avg       0.56      0.60      0.45      8200


Total execution time: 126.73 seconds




In [None]:
import os
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import mixed_precision
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

# Enable mixed precision for faster computation on modern GPUs.
# This sets the global Keras dtype policy as soon as the cell runs, before
# any model is built and before the GPU check below.
# NOTE(review): the policy is applied even when no GPU is found, and the only
# Dense layer in build_model overrides it with dtype='float32' -- confirm the
# policy actually has the intended effect.
mixed_precision.set_global_policy('mixed_float16')

# GPU check
def check_gpu():
    """
    Report whether TensorFlow can see a GPU and request memory growth on each

    Memory growth is requested for every visible GPU. If TensorFlow refuses
    the request, the failure is printed and the check carries on instead of
    aborting.

    Returns:
        True if at least one GPU device is visible, otherwise False
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("⚠️ No GPU found. Using CPU.")
        return False

    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # TensorFlow raises RuntimeError when the setting is changed after
            # the device has been initialised (e.g. a later notebook re-run).
            print(f"Could not enable memory growth on {gpu.name}: {err}")

    print(f"✅ GPU available: {len(gpus)} device(s)")
    return True

# Run GPU check at start so the device status, TensorFlow version and active
# dtype policy are printed before any data is loaded or any model is built.
check_gpu()
print(f"TensorFlow {tf.__version__}, Policy: {mixed_precision.global_policy()}\n")

def load_and_preprocess(file_path):
    """
    Load the CSV, split it, and preprocess the features without leakage

    The data is split into train and test parts first. The imputers, scaler
    and one-hot encoder are then fitted on the training part only and merely
    applied to the test part, so no test-set statistics reach the model.

    Args:
        file_path: Path to the CSV file; it must contain a 'target' column

    Returns:
        ((X_train, y_train), (X_test, y_test)) where the X arrays are the
        preprocessed feature matrices and the y arrays are the raw labels
    """
    data = pd.read_csv(file_path)
    target_col = 'target'
    X = data.drop(columns=[target_col])
    y = data[target_col].values

    # Identify column types
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numeric columns: median imputation followed by standardisation
    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: mode imputation followed by dense one-hot encoding;
    # categories unseen during fit are ignored rather than raising
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preproc = ColumnTransformer([
        ('num', num_pipe, numeric_cols),
        ('cat', cat_pipe, categorical_cols)
    ], remainder='drop')

    # Split BEFORE fitting: fitting the transformers on the full dataset would
    # leak test-set medians, means, variances and categories into training.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    X_train = preproc.fit_transform(X_train_raw)
    X_test = preproc.transform(X_test_raw)

    return (X_train, y_train), (X_test, y_test)


def make_tf_dataset(X, y, batch_size=256, shuffle=False):
    """
    Wrap feature/label arrays in a batched, prefetched tf.data pipeline

    Args:
        X: Feature array; its first dimension indexes the samples
        y: Label array aligned with X
        batch_size: Number of samples per batch
        shuffle: Whether to shuffle the samples on every pass over the data

    Returns:
        tf.data.Dataset yielding (features, labels) batches
    """
    ds = tf.data.Dataset.from_tensor_slices((X, y))

    # Cache BEFORE shuffling: a cache placed after shuffle/batch replays the
    # first epoch's order on every later epoch, which defeats the shuffle.
    ds = ds.cache()
    if shuffle:
        # A buffer as large as the dataset gives a full shuffle; the buffer
        # size must be positive even for an empty input.
        ds = ds.shuffle(buffer_size=max(len(X), 1))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


def build_model(input_dim):
    """
    Build and compile a single-unit sigmoid network (logistic regression)

    Args:
        input_dim: Number of input features per sample

    Returns:
        Compiled Keras Sequential model
    """
    # One sigmoid unit on top of the raw inputs; the layer is pinned to
    # float32 regardless of the global dtype policy.
    layers = [
        Input(shape=(input_dim,)),
        Dense(1, activation='sigmoid', dtype='float32'),
    ]
    net = Sequential(layers)

    optimizer = Adam(learning_rate=0.001)
    net.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
        jit_compile=True,
    )
    return net


def main(data_path='data.csv'):
    """
    Train and evaluate the Keras logistic-regression model end to end

    Early stopping is driven by a validation split carved out of the training
    data, so the test set is only used for the final evaluation and the
    reported metrics are not biased by model selection.

    Args:
        data_path: Path to the CSV file; it must contain a 'target' column

    Returns:
        Dictionary with the keys 'accuracy', 'f1_score' and 'confusion_matrix'
    """
    start = time.time()

    # Load & preprocess
    (X_train, y_train), (X_test, y_test) = load_and_preprocess(data_path)
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # Hold out 10% of the training data for early stopping. Monitoring the
    # test set instead would let it pick the stopping epoch and the restored
    # weights, making the final test metrics optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)
    print(f"Fit shape: {X_fit.shape}, Validation shape: {X_val.shape}")

    # Create tf.data datasets (only the training data is shuffled, so the
    # test predictions stay aligned with y_test)
    train_ds = make_tf_dataset(X_fit, y_fit, shuffle=True)
    val_ds = make_tf_dataset(X_val, y_val)
    test_ds = make_tf_dataset(X_test, y_test)

    # Build & train
    model = build_model(X_train.shape[1])
    es = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
    train_time_start = time.time()
    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=50,
        callbacks=[es],
        verbose=1
    )
    train_time = time.time() - train_time_start

    # Evaluate on the untouched test set
    eval_start = time.time()
    loss, acc = model.evaluate(test_ds, verbose=0)
    eval_time = time.time() - eval_start
    # Threshold the sigmoid outputs at 0.5 to obtain hard class labels
    y_pred = (model.predict(test_ds) > 0.5).astype(int).flatten()

    # Compute metrics
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\nResults:\n Accuracy: {acc:.4f}\n F1 Score: {f1:.4f}")
    print("Confusion Matrix :", cm)
    print(f"Training Time: {train_time:.2f}s, Evaluation Time: {eval_time:.2f}s")
    print(f"Total Elapsed: {time.time() - start:.2f}s")

    return {'accuracy': acc, 'f1_score': f1, 'confusion_matrix': cm}


if __name__ == '__main__':
    # Train and evaluate on the project dataset when executed directly.
    main('pdc_dataset_with_target.csv')


✅ GPU available: 1 device(s)
TensorFlow 2.18.0, Policy: <DTypePolicy "mixed_float16">

Train shape: (32800, 10), Test shape: (8200, 10)
Epoch 1/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.5137 - loss: 0.7667 - val_accuracy: 0.5206 - val_loss: 0.7381
Epoch 2/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5362 - loss: 0.7232 - val_accuracy: 0.5410 - val_loss: 0.7094
Epoch 3/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5558 - loss: 0.6992 - val_accuracy: 0.5655 - val_loss: 0.6927
Epoch 4/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5726 - loss: 0.6857 - val_accuracy: 0.5809 - val_loss: 0.6834
Epoch 5/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5942 - loss: 0.6784 - val_accuracy: 0.5917 - val_loss: 0.6782
Epoch 6/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━