In [3]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# Step 1: Load train and test datasets
train_df = pd.read_csv('train.csv')  # Adjust path if necessary
test_df = pd.read_csv('test.csv')  # Adjust path if necessary

# Step 2: Select numerical columns from the train dataset and target variable
X_train = train_df.select_dtypes(include=['number']).drop('Exited', axis=1)
y_train = train_df['Exited']

# Step 3: Select numerical columns from the test dataset
X_test = test_df.select_dtypes(include=['number'])  # Assumes no 'Exited' column in the test set

# Step 4: Feature Selection (Select top 10 features based on the train data)
selector = SelectKBest(f_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Step 5: Add Polynomial Features (degree=2) to both train and test sets
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_selected)
X_test_poly = poly.transform(X_test_selected)

# Step 6: Standardize both the train and test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Step 7: Dimensionality Reduction using PCA
n_components = min(X_train_scaled.shape[0], X_train_scaled.shape[1], 65)  # Use 65 as an upper limit
pca = PCA(n_components=n_components)
X_train_reduced = pca.fit_transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)

# Step 8: Hyperparameter Tuning using GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2'],  # L2 penalty (Ridge-like)
    'solver': ['lbfgs'],  # Solver for optimization
    'max_iter': [500, 1000]
}

# Step 9: Cross-validation and model training
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
logistic_model = LogisticRegression()

grid_search = GridSearchCV(logistic_model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_reduced, y_train)

# Best Logistic Regression model after cross-validation
best_model = grid_search.best_estimator_

# Step 10: Evaluate model on the training set
y_train_pred = best_model.predict(X_train_reduced)
y_train_pred_proba = best_model.predict_proba(X_train_reduced)[:, 1]

accuracy_train = accuracy_score(y_train, y_train_pred)
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)

print(f'Best Hyperparameters: {grid_search.best_params_}')
print(f'Training Accuracy: {accuracy_train:.4f}')
print(f'Training ROC-AUC: {roc_auc_train:.4f}')

# Step 11: Make predictions on the test set
test_pred_proba = best_model.predict_proba(X_test_reduced)[:, 1]  # Probability of exiting (class 1)

# Round the predicted probabilities to 2 decimal places
test_pred = np.round(test_pred_proba, decimals=2)

# Step 12: Save the predictions in the test DataFrame
test_df['Exited'] = test_pred

# Ensure correct column names are used for output (replace 'CustomerId' with the actual identifier)
# If your test file has a different identifier column, adjust 'CustomerId' to the correct name
if 'CustomerId' in test_df.columns:
    test_df.rename(columns={'CustomerId': 'id'}, inplace=True)
    test_df[['id', 'Exited']].to_csv('submission5.csv', index=False)
else:
    print("Error: 'CustomerId' column not found in test data. Adjust the identifier column name.")
    test_df.to_csv('submission5.csv', index=False)  # Save the entire test DataFrame if 'CustomerId' not found

print("Submission file saved as 'submission5.csv'.")


Best Hyperparameters: {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
Training Accuracy: 0.8509
Training ROC-AUC: 0.8653
Submission file saved as 'submission5.csv'.
