In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack, csr_matrix
import time
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Step 1: Load data
print("Loading data...")
data_path = 'train.csv'
df = pd.read_csv(data_path)
print("Data loaded successfully.")

# Step 2: Data preprocessing
print("Preprocessing data...")
target = 'target'
features = [col for col in df.columns if col not in ['id', target]]
X = df[features]
y = df[target]

categorical_features = [col for col in X.columns if col.startswith('cat_')]
numerical_features = [col for col in X.columns if col.startswith('num_')]

# Encode categorical features
print("Encoding categorical features...")
encoder = OneHotEncoder(handle_unknown='ignore')
X_categorical_encoded = encoder.fit_transform(X[categorical_features])
print("Categorical features encoded successfully.")

# Scale numerical features (if any)
if numerical_features:
    print("Scaling numerical features...")
    scaler = StandardScaler()
    X_numerical_scaled = scaler.fit_transform(X[numerical_features])
    # Convert numerical features to sparse matrix to save memory
    X_numerical_sparse = csr_matrix(X_numerical_scaled.astype(np.float32))
    print("Numerical features scaled successfully.")
else:
    X_numerical_sparse = None

# Combine processed features
print("Combining processed features...")
if X_numerical_sparse is not None:
    X_processed = hstack([X_categorical_encoded, X_numerical_sparse]).tocsr()
else:
    X_processed = X_categorical_encoded
print("Features combined successfully.")

# Step 3: Split data into training and test sets
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
print("Data split successfully.")

# Step 4: Build and train the logistic regression model
print("Training the model...")
start_time = time.time()
model = LogisticRegression(max_iter=5000, solver='lbfgs')
model.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print(f"Model trained successfully in {training_time:.2f} seconds")

# Step 5: Predict and evaluate
print("Evaluating the model...")
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score on the test set: {auc_score:.4f}")

y_pred = model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Loading data...
Data loaded successfully.
Preprocessing data...
Encoding categorical features...
Categorical features encoded successfully.
Scaling numerical features...
Numerical features scaled successfully.
Combining processed features...
Features combined successfully.
Splitting data into training and test sets...
Data split successfully.
Training the model...
Model trained successfully in 128.66 seconds
Evaluating the model...
AUC Score on the test set: 0.8738

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    157853
         1.0       0.40      0.01      0.02      2147

    accuracy                           0.99    160000
   macro avg       0.70      0.50      0.50    160000
weighted avg       0.98      0.99      0.98    160000

