# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from catboost import CatBoostClassifier, Pool

# 1. Data Loading and Preprocessing
# Load the tab-separated dataset
df = pd.read_csv('customer.csv', sep='\t')

# Feature Engineering: Convert Date to Tenure (Days)
# Tenure is measured back from the most recent enrollment date in the file.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
max_date = df['Dt_Customer'].max()
df['Customer_Days'] = (max_date - df['Dt_Customer']).dt.days

# Drop non-predictive columns
# ID is an identifier, Dt_Customer is replaced by Customer_Days
# Z_CostContact and Z_Revenue are constant values in this specific dataset
df = df.drop(columns=['ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'])

# Identify Categorical Features
# CatBoost handles these natively without One-Hot Encoding.
# NOTE: despite the variable's name, this list holds column *names*, not
# positional indices; the name is kept because later code refers to it.
categorical_features_indices = ['Education', 'Marital_Status']

# Split Data
X = df.drop(columns=['Response'])
y = df['Response']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle Missing Values (Income has ~24 missing values)
# The median is learned from the training partition only and then applied to
# both partitions, so no statistic derived from test rows reaches the model.
# `assign` returns new frames, which avoids writing into a slice of X.
# `df` and `X` intentionally keep the raw Income values (NaN where missing).
income_median = X_train['Income'].median()
X_train = X_train.assign(Income=X_train['Income'].fillna(income_median))
X_test = X_test.assign(Income=X_test['Income'].fillna(income_median))

# 2. Fitting/Training
# Hold out part of the training data for validation.
# CatBoost uses eval_set to choose the best iteration (use_best_model defaults
# to True whenever an eval_set is supplied), so passing the test set there
# would tune the model on the same rows that are later used to report
# metrics. A separate validation split keeps the test set untouched.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Initialize CatBoostClassifier
# We use standard Gradient Boosting parameters; nothing is tuned here.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,  # Print every 100 iterations
    random_seed=42
)

# Train the model, passing categorical features explicitly.
# categorical_features_indices contains column names, which CatBoost accepts
# when the training data is a DataFrame.
model.fit(
    X_fit, y_fit,
    cat_features=categorical_features_indices,
    eval_set=(X_val, y_val),
    plot=False
)

# 3. Evaluation
# Score the held-out test rows: second-column class probabilities from
# predict_proba, and the hard label predictions.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 summary of the label predictions.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# 4. Visualization (Code for creating plots)
# Feature Importance: one bar per input column, largest importance first.
feature_importance = model.get_feature_importance()
sorted_idx = np.argsort(feature_importance)[::-1]

bar_positions = range(X.shape[1])
bar_labels = X.columns.to_numpy()[sorted_idx]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, feature_importance[sorted_idx], align='center')
ax.set_xticks(bar_positions)
# Column names are long, so the tick labels are drawn vertically.
ax.set_xticklabels(bar_labels, rotation=90)
ax.set_title('CatBoost Feature Importance')
fig.tight_layout()
plt.show()