# Customer Churn Prediction

### Reading Data

In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the CSV that the analysis cells below plot from.
# The path uses forward slashes: a backslash is a directory separator only on
# Windows, so the previous "data\..." form failed to locate the file on
# Linux/macOS. Forward slashes are accepted on every platform.
df = pd.read_csv("data/customer_churn_dataset-testing-master.csv")
df

### Checking Null Values

In [None]:
# Count the missing entries in every column of df and display the result.
print("Null Values -")
missing_per_column = df.isna().sum()
missing_per_column

## Analysis

### 1. Bar chart of Churn by Subscription Type

In [None]:

# Count plot with one group of bars per subscription type, split by Churn.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Subscription Type', hue='Churn', ax=ax)
ax.set_title('Churn by Subscription Type')
plt.show()

### 2. Histogram of Age Distribution

In [None]:
# Histogram of the Age column using 10 bins.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x='Age', bins=10, ax=ax)
ax.set_title('Age Distribution')
plt.show()

### 3. Bar chart of Churn by Gender

In [None]:
# Count plot with one group of bars per gender, split by Churn.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Gender', hue='Churn', ax=ax)
ax.set_title('Churn by Gender')
plt.show()


### 4. Box plot of Usage Frequency by Gender

In [None]:
# Box plot of the Usage Frequency column, one box per gender.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='Gender', y='Usage Frequency', ax=ax)
ax.set_title('Usage Frequency by Gender')
plt.show()

### 5. Line chart of Contract Length vs. Payment Delay

In [None]:
# Line plot of Payment Delay (y) against Contract Length (x).
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=df, x='Contract Length', y='Payment Delay', ax=ax)
ax.set_title('Contract Length vs. Payment Delay')
plt.show()

## Prediction

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Read the training and testing CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/customer_churn_dataset-training-master.csv',
        'data/customer_churn_dataset-testing-master.csv',
    )
)

# 2. Feature Engineering
def feature_engineering(data):
    """Build model-ready features from a raw customer DataFrame.

    One-hot encodes 'Gender', 'Subscription Type' and 'Contract Length'
    (dropping the first level of each), adds a 'Spend_per_Tenure' ratio and
    removes the 'CustomerID' and 'Last Interaction' columns. The frame passed
    in is left untouched; a new DataFrame is returned.

    Args:
        data: DataFrame containing the columns named above as well as
            'Total Spend' and 'Tenure'.

    Returns:
        A new DataFrame holding the engineered features.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # One-hot encoding categorical columns
    data = pd.get_dummies(
        data,
        columns=['Gender', 'Subscription Type', 'Contract Length'],
        drop_first=True,
    )
    # Create new feature 'Spend_per_Tenure'.
    # A Tenure of 0 would turn the ratio into +/-inf, which downstream
    # imputers and scalers reject. Masking zero tenure to NaN makes the ratio
    # NaN instead, so it is handled like any other missing value.
    tenure = data['Tenure'].where(data['Tenure'] != 0)
    data['Spend_per_Tenure'] = data['Total Spend'] / tenure
    # Drop unnecessary columns
    data = data.drop(columns=['CustomerID', 'Last Interaction'])
    return data

# Apply feature engineering to both train and test datasets
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

# 3. Check for NaN values in the target variable.
# Rows without a Churn label can be neither trained on nor scored.
if train_data['Churn'].isnull().any():
    print("NaN values found in y_train (Churn). Dropping rows...")
    train_data = train_data.dropna(subset=['Churn'])

if test_data['Churn'].isnull().any():
    print("NaN values found in y_test (Churn). Dropping rows...")
    test_data = test_data.dropna(subset=['Churn'])

# 4. Split features and target
X_train = train_data.drop(columns=['Churn'])  # Features from train data
y_train = train_data['Churn']                  # Target from train data

X_test = test_data.drop(columns=['Churn'])     # Features from test data
y_test = test_data['Churn']                     # Target from test data

# The two datasets are one-hot encoded independently, so a category that
# appears in only one of them produces a different set (or order) of dummy
# columns. Align the test features to the training columns: dummy columns
# missing from the test set are filled with 0, extras are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# An infinite feature value (e.g. a ratio whose denominator is 0) makes the
# imputer and scaler raise. Convert infinities to NaN so they are imputed
# like any other missing value.
_POS_INF = float('inf')
X_train = X_train.replace([_POS_INF, -_POS_INF], float('nan'))
X_test = X_test.replace([_POS_INF, -_POS_INF], float('nan'))

# 5. Handle missing values (imputation) in features
imputer = SimpleImputer(strategy='mean')  # Impute missing values with the mean
X_train = imputer.fit_transform(X_train)  # Fit on training data only
X_test = imputer.transform(X_test)        # Reuse the training means on test data

# Optional: Standardize the features (scaling can help some models perform better)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit scaler on train data
X_test = scaler.transform(X_test)        # Use the same scaler on test data

# 6. Initialize models.
# A shared seed makes the tree-based and boosted models reproducible, so the
# reported accuracies and the "best model" do not change from run to run.
RANDOM_STATE = 42
log_reg = LogisticRegression()
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
# NOTE(review): use_label_encoder is kept from the original call; recent
# XGBoost releases appear to ignore it - confirm against the installed version.
xgboost = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)
lightgbm = LGBMClassifier(random_state=RANDOM_STATE)

# 7. Train and evaluate models
models = {
    'Logistic Regression': log_reg,
    'Decision Tree': decision_tree,
    'Random Forest': random_forest,
    'XGBoost': xgboost,
    'LightGBM': lightgbm
}

# Dictionary to store model performances
model_performance = {}

for name, model in models.items():
    # Train the model on training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}\n")

    # Store accuracy for comparison
    model_performance[name] = accuracy

# 8. Compare models and display the best one.
# NOTE: the winner is chosen by accuracy on the test set itself, so the
# reported figure is an optimistic estimate of that model's performance.
best_model = max(model_performance, key=model_performance.get)
print(f"The best model is: {best_model} with accuracy of {model_performance[best_model]:.4f}")