In [1]:
import io

import pandas as pd
from google.colab import files

# Expected name of the CSV. Used to pick the right entry when several files
# are uploaded, and as an on-disk fallback when the upload returns nothing.
DATASET_FILENAME = 'pdc_dataset_with_target.csv'

# files.upload() opens the browser picker and returns {saved_filename: bytes}.
uploaded = files.upload()

# Load the dataset from the bytes that were just uploaded rather than from a
# hard-coded path: if a file with the same name already exists in the runtime,
# Colab stores the new upload under a different name (e.g. "name (1).csv"),
# and reading the fixed path would silently load the stale copy.
if uploaded:
    uploaded_name = DATASET_FILENAME if DATASET_FILENAME in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a file already on disk.
    df = pd.read_csv(DATASET_FILENAME)


Saving pdc_dataset_with_target.csv to pdc_dataset_with_target.csv


In [2]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from scipy import stats

# MAIN TIMER START: wall-clock reference for the whole run.
main_timer_start = time.time()
start_stamp = time.strftime('%Y-%m-%d %H:%M:%S')
print(
    "========== NEURAL NETWORK TRAINING WITH COMPLETE PREPROCESSING ==========",
    f"Process started at: {start_stamp}",
    "Using n_jobs=1 for consistent timing measurement",
    sep="\n",
)

# Stage 1 does not read anything: it relies on the dataframe 'df' that
# already exists, and only reports its shape.
print("\n[1/5] LOADING AND INSPECTING DATA...")
timer_start = time.time()
print(f"Data shape: {df.shape}")
stage_elapsed = time.time() - timer_start
print(f"Data loaded and initial inspection completed in {stage_elapsed:.2f} seconds")

# EDA - Exploratory Data Analysis
# Every summary below is stored in a variable; only the missing-value total
# and the stage duration are printed.
print("\n[2/5] EXPLORATORY DATA ANALYSIS...")
timer_start = time.time()

# Null count per column; the printed figure is the sum over all columns.
missing_values = df.isnull().sum()
print(f"Total missing values: {missing_values.sum()}")

# Numeric slice of the frame, shared by the statistics, correlation and
# outlier steps.
numeric_frame = df.select_dtypes(include=['number'])
numerical_columns = numeric_frame.columns
numerical_stats = numeric_frame.describe()

# Value counts for each object-dtype column.
categorical_columns = df.select_dtypes(include=['object']).columns
cat_distributions = {}
for cat_col in categorical_columns:
    cat_distributions[cat_col] = df[cat_col].value_counts()

# Label counts, absolute and as fractions.
target_distribution = df['target'].value_counts()
target_balance = df['target'].value_counts(normalize=True)

# Pairwise correlation between the numeric columns.
correlation_matrix = numeric_frame.corr()

# Number of non-null values per numeric feature whose |z-score| exceeds 3
# (the label column is skipped).
outlier_summary = {
    num_col: int(np.count_nonzero(np.abs(stats.zscore(df[num_col].dropna())) > 3))
    for num_col in numerical_columns
    if num_col != 'target'
}

eda_elapsed = time.time() - timer_start
print(f"EDA completed in {eda_elapsed:.2f} seconds")

# Preprocessing and Model Training
print("\n[3/5] DATA PREPROCESSING...")
timer_start = time.time()

# Separate the features from the label, then hold out 20% of the rows for
# testing. stratify=y keeps the class proportions equal in both partitions.
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

# Identify column types. 'number' is the same selector the EDA stage uses and
# also matches int32/float32/etc.; listing only int64/float64 would let such
# columns fall through to the ColumnTransformer's default remainder='drop'.
numeric_features = list(X.select_dtypes(include=['number']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Report any column that neither branch will handle instead of losing it silently.
unhandled_features = [
    col_name for col_name in X.columns
    if col_name not in numeric_features and col_name not in categorical_features
]
if unhandled_features:
    print(f"WARNING: columns not used by the preprocessor (will be dropped): {unhandled_features}")

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are encoded as all zeros at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 forces a dense output array. The one-hot branch is
# sparse, and the PCA step that consumes this output with a fractional
# n_components needs dense input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# The preprocessor is only *defined* here; it is fitted as the first step of
# the model pipeline during training, so this timing covers the split and the
# pipeline construction only.
print(f"Preprocessing completed in {time.time() - timer_start:.2f} seconds")

# Set up the full pipeline with Neural Network
print("\n[4/5] MODEL TRAINING...")
timer_start = time.time()

# Chain: preprocessing -> PCA -> MLP classifier.
# A fractional n_components tells PCA to keep 95% of the variance.
variance_pca = PCA(n_components=0.95)
mlp = MLPClassifier(random_state=42, max_iter=1000)
nn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', variance_pca),
    ('classifier', mlp),
])

# Search space: 3 layer layouts x 2 activations x 3 alphas x 2 solvers
# = 36 candidates. The 'classifier__' prefix routes each setting to the MLP step.
param_grid = {
    'classifier__hidden_layer_sizes': [(10,), (20,), (10, 10)],
    'classifier__activation': ['relu', 'tanh'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__solver': ['adam', 'sgd']
}

# 5-fold cross-validated grid search; n_jobs=1 as requested.
grid_search = GridSearchCV(
    estimator=nn_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=1,
    verbose=1,
)

# Fit every candidate on the training partition.
print("Training neural network with grid search...")
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
training_elapsed = time.time() - timer_start
print(f"Model training completed in {training_elapsed:.2f} seconds")

# Evaluate the model
print("\n[5/5] MODEL EVALUATION...")
timer_start = time.time()

# Score the best pipeline found by the grid search on the held-out test partition.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

# Classification Report
report = classification_report(y_test, y_pred)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Feature importance analysis (heuristic).
# An MLP has no native feature importance, so the mean absolute first-layer
# weight of each PCA component is projected back onto the preprocessed
# features through the absolute PCA loadings.
try:
    # Use the steps that were actually fitted inside the best pipeline, so the
    # names line up with what the classifier saw; refitting the standalone
    # preprocessor would only repeat that work.
    fitted_preprocessor = best_model.named_steps['preprocessor']
    pca = best_model.named_steps['pca']

    # Rebuild the output column names in the transformer's output order.
    ohe_feature_names = []
    for name, trans, cols in fitted_preprocessor.transformers_:
        if len(cols) == 0:
            # A branch with no columns is never fitted and emits nothing.
            continue
        if name == 'cat':
            ohe_feature_names.extend(trans.named_steps['onehot'].get_feature_names_out(cols))
        elif name == 'num':
            ohe_feature_names.extend(cols)
        # Any other entry (e.g. a dropped 'remainder') adds no output columns.

    # Mean |weight| per PCA component across the first hidden layer.
    first_layer_weights = np.abs(best_model.named_steps['classifier'].coefs_[0]).mean(axis=1)

    # Sum over components of weight_i * |loading_i|, expressed as one product.
    feature_importance = first_layer_weights @ np.abs(pca.components_)

    # Top features by importance
    importance_df = pd.DataFrame({
        'Feature': ohe_feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    top_features = importance_df.head(5)
except Exception as exc:
    # Best-effort step: keep the run alive, but say why the table is missing.
    top_features = "Could not calculate feature importance"
    print(f"Feature importance skipped: {type(exc).__name__}: {exc}")

print(f"Evaluation completed in {time.time() - timer_start:.2f} seconds")

# MAIN TIMER END: elapsed wall-clock time since the main timer was started.
total_time = time.time() - main_timer_start
finished_stamp = time.strftime('%Y-%m-%d %H:%M:%S')

# Headline figures of the run, emitted as one newline-joined message.
summary_lines = [
    "\n========== SUMMARY ==========",
    f"Process completed at: {finished_stamp}",
    f"Total computation time: {total_time:.2f} seconds",
    f"Final model accuracy: {accuracy:.4f}",
    f"Best neural network configuration: {grid_search.best_params_}",
]
print("\n".join(summary_lines))

# Detailed classification report, then the confusion matrix.
print("\nClassification Report:", report, sep="\n")
print("\nConfusion Matrix:", conf_matrix, sep="\n")

# top_features is a DataFrame only when the importance step succeeded;
# otherwise it holds a placeholder string and this section is skipped.
if isinstance(top_features, pd.DataFrame):
    print("\nTop 5 Most Important Features:", top_features, sep="\n")

print("\n========== PROCESS COMPLETED ==========")

Process started at: 2025-05-04 04:08:03
Using n_jobs=1 for consistent timing measurement

[1/5] LOADING AND INSPECTING DATA...
Data shape: (41000, 8)
Data loaded and initial inspection completed in 0.00 seconds

[2/5] EXPLORATORY DATA ANALYSIS...
Total missing values: 8194
EDA completed in 0.07 seconds

[3/5] DATA PREPROCESSING...
Training set shape: (32800, 7), Testing set shape: (8200, 7)
Preprocessing completed in 0.03 seconds

[4/5] MODEL TRAINING...
Training neural network with grid search...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'classifier__activation': 'tanh', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (10,), 'classifier__solver': 'adam'}
Model training completed in 372.05 seconds

[5/5] MODEL EVALUATION...
Accuracy on test set: 0.6017
Evaluation completed in 0.13 seconds

Process completed at: 2025-05-04 04:14:15
Total computation time: 372.29 seconds
Final model accuracy: 0.6017
Best neural network configuration: {'