In [1]:
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Helper function to collect customer details
def collect_customer_details(features):
    """Interactively prompt for one customer's details and return them as a single row.

    Each answer is re-requested until it is valid. Numeric answers must be
    finite numbers and satisfy the per-feature rule below; country and gender
    answers are matched case-insensitively after surrounding whitespace is
    removed.

    Args:
        features: Ordered list of column names the returned frame must have.
            One-hot columns are recognised by the prefixes ``Geography_`` and
            ``Gender_``; the text after the first underscore is the category.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are exactly ``features``,
        in that order. Any feature that is never asked about stays at 0.
    """
    print("\nLet's collect some information about the customer...")

    # Start every requested feature at 0 so columns we never prompt for still exist.
    customer_profile = {feature: 0 for feature in features}

    # Prompt text for each numeric field, asked in this order.
    customer_questions = {
        'CreditScore': 'What is their credit score? (300-850): ',
        'Age': 'Customer age: ',
        'Tenure': 'Years with the bank: ',
        'Balance': 'Current account balance: ',
        'NumOfProducts': 'Number of bank products (1-4): ',
        'HasCrCard': 'Do they have a credit card? (1=Yes, 0=No): ',
        'IsActiveMember': 'Are they an active member? (1=Yes, 0=No): ',
        'EstimatedSalary': 'Estimated yearly salary: '
    }

    # feature -> (predicate the value must satisfy, message shown when it does not).
    # Features without an entry only need to be finite numbers.
    yes_no_message = "Please enter 1 for Yes or 0 for No!"
    validation_rules = {
        'CreditScore': (lambda v: 300 <= v <= 850,
                        "Credit score must be between 300 and 850!"),
        'Age': (lambda v: 18 <= v <= 100,
                "Age must be between 18 and 100!"),
        'Tenure': (lambda v: v >= 0,
                   "Years with the bank cannot be negative!"),
        'NumOfProducts': (lambda v: v in (1, 2, 3, 4),
                          "Number of products must be a whole number between 1 and 4!"),
        'HasCrCard': (lambda v: v in (0, 1), yes_no_message),
        'IsActiveMember': (lambda v: v in (0, 1), yes_no_message),
        'EstimatedSalary': (lambda v: v >= 0,
                            "Estimated salary cannot be negative!"),
    }

    # Collect numerical data
    print("\n--- Basic Information ---")
    for feature, question in customer_questions.items():
        while True:
            try:
                value = float(input(question))
            except ValueError:
                print("Please enter a valid number!")
                continue

            # float() happily parses 'nan' and 'inf'; neither is a usable answer.
            if not math.isfinite(value):
                print("Please enter a valid number!")
                continue

            rule = validation_rules.get(feature)
            if rule is not None:
                is_valid, complaint = rule
                if not is_valid(value):
                    print(complaint)
                    continue

            customer_profile[feature] = value
            break

    # Get location info
    print("\n--- Location Information ---")
    while True:
        location = input("Customer's country (France/Spain/Germany): ").strip().capitalize()
        if location in ('France', 'Spain', 'Germany'):
            for col in features:
                if col.startswith('Geography_'):
                    # maxsplit=1 keeps the whole category even if it contains '_'
                    country = col.split('_', 1)[1]
                    customer_profile[col] = 1 if location == country else 0
            break
        print("Please select from France, Spain, or Germany!")

    # Get gender info
    print("\n--- Personal Information ---")
    while True:
        gender = input("Customer's gender (Male/Female): ").strip().capitalize()
        if gender in ('Male', 'Female'):
            for col in features:
                if col.startswith('Gender_'):
                    gen = col.split('_', 1)[1]
                    customer_profile[col] = 1 if gender == gen else 0
            break
        print("Please enter either Male or Female!")

    # Select by `features` so column order matches what the caller expects.
    return pd.DataFrame([customer_profile])[features]


In [None]:
def main():
    """Train a churn classifier and run an interactive text menu around it.

    Reads ``Churn_Modelling.csv`` from the working directory, one-hot encodes
    ``Geography`` and ``Gender``, fits a ``StandardScaler`` and a
    ``RandomForestClassifier`` on an 80/20 train/test split, then loops over a
    menu until the user chooses to quit:

    1. prompt for a new customer and print the predicted outcome,
    2. print test-set accuracy and plot the confusion matrix,
    3. plot the ten largest feature importances,
    4. quit.

    The call blocks on ``input()`` while the menu is showing. ``pd.read_csv``
    is called without error handling, so a failure to read the file
    propagates to the caller.
    """
    # Load and prep the data
    print("Loading customer database...")
    bank_data = pd.read_csv('Churn_Modelling.csv')

    # Drop identifier columns; they are not used as predictors
    useful_data = bank_data.drop(['Surname', 'RowNumber', 'CustomerId'], axis=1)

    # One-hot encode the two text columns
    processed_data = pd.get_dummies(useful_data, columns=['Geography', 'Gender'])

    # Split data into the target ('Exited') and the predictor columns
    churn_status = processed_data['Exited']
    prediction_features = processed_data.drop('Exited', axis=1)
    feature_list = prediction_features.columns.tolist()

    # Create training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        prediction_features, churn_status,
        test_size=0.2,  # Hold out 20% for testing, train on the remaining 80%
        random_state=42  # For reproducible results
    )

    # Fit the scaler on the training split only, then reuse it everywhere else
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    print("Training the prediction model...")
    churn_predictor = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    churn_predictor.fit(X_train_scaled, y_train)
    print("Model training complete!")

    # Main program loop
    while True:
        print("\nChurn Prediction Menu:")
        print("1. Predict churn for new customer")
        print("2. Check model accuracy")
        print("3. Show important factors")
        print("4. Quit")

        user_choice = input("\nWhat would you like to do? (1-4): ")

        if user_choice == '1':
            # Collect one customer row in training-column order and scale it
            new_customer = collect_customer_details(feature_list)
            scaled_customer_data = scaler.transform(new_customer)

            # Get prediction and per-class probabilities
            will_churn = churn_predictor.predict(scaled_customer_data)
            churn_probability = churn_predictor.predict_proba(scaled_customer_data)

            # Confidence is the probability of the class that was actually
            # predicted. predict_proba columns follow classes_ order, so look
            # the predicted label up there rather than always reading column 1
            # (which would report the churn probability even for a "stay" verdict).
            predicted_label = will_churn[0]
            label_position = list(churn_predictor.classes_).index(predicted_label)
            confidence = churn_probability[0][label_position]

            # Show results
            print("\nPrediction Results:")
            if predicted_label == 1:
                print("Warning: Customer is likely to leave!")
            else:
                print("Good news! Customer is likely to stay.")
            print(f"Confidence: {confidence:.1%}")

        elif user_choice == '2':
            # Show accuracy on the held-out test split
            test_predictions = churn_predictor.predict(X_test_scaled)
            print(f"\nModel Accuracy: {accuracy_score(y_test, test_predictions):.1%}")

            # Create confusion matrix visualization
            plt.figure(figsize=(8, 6))
            conf_matrix = confusion_matrix(y_test, test_predictions)
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                       xticklabels=['Stayed', 'Left'],
                       yticklabels=['Stayed', 'Left'])
            plt.title('Prediction Accuracy Breakdown')
            plt.xlabel('Predicted Outcome')
            plt.ylabel('Actual Outcome')
            plt.show()

        elif user_choice == '3':
            # Pair each predictor column with its importance, largest first
            importance_data = pd.DataFrame({
                'Factor': prediction_features.columns,
                'Impact': churn_predictor.feature_importances_
            }).sort_values('Impact', ascending=False)

            plt.figure(figsize=(10, 6))
            sns.barplot(data=importance_data.head(10), x='Impact', y='Factor')
            plt.title('Top 10 Factors Influencing Customer Churn')
            plt.xlabel('Importance Level')
            plt.tight_layout()
            plt.show()

        elif user_choice == '4':
            print("\nThanks for using the Churn Predictor! Goodbye!")
            break

        else:
            print("\nInvalid choice! Please enter a number between 1 and 4.")

# Entry point: start the interactive churn-prediction session when this code
# is run directly (not when it is imported as a module).
if __name__ == "__main__":
    main()

Loading customer database...
Training the prediction model...
Model training complete!

Churn Prediction Menu:
1. Predict churn for new customer
2. Check model accuracy
3. Show important factors
4. Quit
