In [2]:
import pandas as pd
import numpy as np
import time
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

# Silence FutureWarning messages for the rest of the run
warnings.simplefilter("ignore", FutureWarning)

# Step 1: Read the training table from disk
print("Loading data...")
data = pd.read_csv('train.csv')
print("Data loaded successfully.")

# Step 2: Separate predictors from the label column
# 'id' is dropped together with 'target' so it is not used as a feature.
print("Preparing features and target variable...")
X = data.drop(columns=['id', 'target'])
y = data['target']
print("Features and target variable prepared.")

# Step 3: Hold out 20% of the rows for evaluation (fixed seed for repeatability)
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split successfully.")

# Step 4: Encode categorical features
# Each split is one-hot encoded, then the test frame is forced onto the
# training columns. Encoding the splits independently can produce different
# column sets (a category that occurs in only one split), and the scaler
# fitted on the training frame would then reject or misalign the test frame.
print("Encoding categorical features...")
X_train = pd.get_dummies(X_train)
# Indicator columns for categories unseen in training are dropped; columns
# that exist only in training are added to the test frame filled with 0,
# in the same order as the training columns.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
print("Categorical features encoded successfully.")

# Step 5: Standardize features
# The scaler learns its statistics from the training split only and the same
# fitted transform is then applied to both splits.
print("Scaling numerical features...")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Numerical features scaled successfully.")

# Step 6: Apply PCA to reduce feature dimensions
print("Applying PCA to reduce feature dimensions...")
# Keep at most 20 components, but never more than the data can support:
# PCA raises ValueError when n_components exceeds min(n_samples, n_features).
n_components = min(20, *X_train_scaled.shape)
# random_state pins the result when a randomized SVD solver is selected,
# matching the fixed seed already used for the train/test split.
pca = PCA(n_components=n_components, random_state=42)
# The projection is learned on the training split only, then reused on test.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print("PCA applied successfully.")

# Step 7: Train Naive Bayes model
print("Training Naive Bayes model...")
nb_model = GaussianNB()
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
nb_model.fit(X_train_pca, y_train)
end_time = time.perf_counter()
# Elapsed fitting time in seconds (only the fit call is timed).
training_time = end_time - start_time
print(f"Naive Bayes model trained successfully in {training_time:.2f} seconds.")

# Step 8: Score the held-out rows
print("Making predictions...")
class_probabilities = nb_model.predict_proba(X_test_pca)
# Column index 1 is used as the score -- presumably the positive class;
# confirm against nb_model.classes_ if the labels are not 0/1.
y_pred_proba = class_probabilities[:, 1]
print("Predictions made successfully.")

# Step 9: Report ROC AUC on the test split and the measured fit time
print("Evaluating model performance...")
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"Naive Bayes AUC: {auc_score:.4f}")
print(f"Training Time (seconds): {training_time:.4f}")



Loading data...
Data loaded successfully.
Preparing features and target variable...
Features and target variable prepared.
Splitting data into training and test sets...
Data split successfully.
Encoding categorical features...
Categorical features encoded successfully.
Scaling numerical features...
Numerical features scaled successfully.
Applying PCA to reduce feature dimensions...
PCA applied successfully.
Training Naive Bayes model...
Naive Bayes model trained successfully in 0.31 seconds.
Making predictions...
Predictions made successfully.
Evaluating model performance...
Naive Bayes AUC: 0.7727
Training Time (seconds): 0.3100
