# Goal: Predict if a customer will churn (Yes = 1, No = 0)

In [None]:
# Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

- Use both RandomForestClassifier and LogisticRegression.
- Try multiple algorithms, compare their performance, and pick the best model for the final predictions.

- ### Accuracy_score
  - accuracy_score(y_true, y_pred) = how many predictions were correct
  - Accuracy = (Correct Predictions) / (Total Predictions)

- ### Classification_report
  - classification_report(y_true, y_pred) = Gives a detailed breakdown of model performance
  - Precision = How many predicted churned were actually churned?
  - Recall = How many actual churned customers were identified?
  - F1-score = Harmonic mean of precision and recall
  - Support = Count of actual samples for each class (0 and 1)

- ### Confusion_matrix
  - confusion_matrix(y_true, y_pred) = A 2x2 table that shows: Exact counts of TP, FP, FN, TN
  ### Confusion Matrix Explained
|                      | **Predicted: No (0)** | **Predicted: Yes (1)** |
|----------------------|-----------------------|-------------------------|
| **Actual: No (0)**   | ✅ True Negative (TN)  | ❌ False Positive (FP)  |
| **Actual: Yes (1)**  | ❌ False Negative (FN) | ✅ True Positive (TP)   |



In [None]:
# Load the cleaned purchase-behaviour CSV into a DataFrame
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs on a machine where this exact file exists.
csv_path = r"C:\Users\bharg\Desktop\TEMP\Khushi Di project\DA assignment\CSV_files\purchase_behavior.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file loaded as expected
df.head()

In [None]:
# Clean & Prepare Data
# Drop columns that should not be used as model inputs:
#   - customer_id: just a unique name or number for each customer
#   - customer_segment: created using other features like total spend or order value
# errors="ignore" keeps this cell re-runnable: `df` is overwritten here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=["customer_id", "customer_segment"], errors="ignore")

# Drop every row that still contains a missing value
df = df.dropna()

# Sanity check: every column should now report 0 missing values
df.isnull().sum()

In [None]:
# Define Features and Target
# The cleaned data is split into two pieces for machine learning:
#   y -> Target: the "churned" column the model will try to predict
#   X -> Features: every remaining column, used as the model's input
target_col = "churned"
y = df[target_col]

# Per the notebook's notes X is expected to hold total_spent, number_of_orders,
# days_since_last_purchase and avg_order_value — confirm with X.columns.
X = df.drop(columns=[target_col])

In [None]:
# Train-Test Split
test_fraction = 0.3   # 30% of the rows are held out for testing, the remaining 70% train the model
split_seed = 42       # fixed seed so the random split is the same on every run

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [None]:
# Train Models
# Logistic Regression
# max_iter=1000 caps the number of solver iterations allowed to reach convergence
# (it is an upper limit on optimisation steps, not a number of repeated attempts).
# fit() returns the fitted estimator, so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict churn (0/1) for the held-out test rows
y_pred_lr = lr.predict(X_test)

In [None]:
# Random Forest
# Build a forest of 100 trees; random_state=42 makes the trained model reproducible.
# fit() returns the fitted estimator, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict churn (0/1) for the held-out test rows
y_pred_rf = rf.predict(X_test)

In [None]:
# Evaluate Performance
# For each model print, in order: a heading, the per-class
# precision/recall/F1 report, and the confusion matrix on the test set.
model_reports = [
    ("Logistic Regression Report:", y_pred_lr),
    ("\nRandom Forest Report:", y_pred_rf),
]

for report_heading, test_predictions in model_reports:
    print(report_heading)
    print(classification_report(y_test, test_predictions))
    print(confusion_matrix(y_test, test_predictions))

In [None]:
# Random Forest is better at catching customers who are about to leave.
# This is more helpful for a business because they can take action and try to stop those customers from leaving.

In [None]:
# Show Accuracy
# Fraction of test rows predicted correctly by each model, rounded to 3 decimals.
accuracy_rows = (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Accuracy:", y_pred_rf),
)

for accuracy_label, model_predictions in accuracy_rows:
    print(accuracy_label, round(accuracy_score(y_test, model_predictions), 3))


### Logistic Regression has slightly higher accuracy (86.8%) than Random Forest (86.7%).
- Random Forest is still better at finding customers who are likely to leave (churn), which is more important for the business than accuracy alone.

In [None]:
import matplotlib.pyplot as plt

# Feature importance of the fitted Random Forest
importances = rf.feature_importances_     # One importance score per feature
features = X.columns                      # Feature names (X_train was split from X, so the order matches)

# Create the figure and axes FIRST so the size applies to this chart.
# Calling plt.figure(figsize=...) after plotting would open a second, empty
# figure and leave the bar chart at the default size.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(features, importances)                      # Horizontal bar per feature
ax.set_title("Feature Importance - Random Forest")  # Chart title
ax.set_xlabel("Importance")                         # Label the axis so the chart stands alone
plt.show()

# This shows which features (like total_spent, days_since_last_purchase) are most important for predicting churn.
# The longer the bar, the more the model depends on that feature.

In [None]:
# Predict for New Data
# Format: [total_spent, number_of_orders, days_since_last_purchase, avg_order_value]
# The values must be listed in the same order as X.columns.
# The model was fitted on a DataFrame, so the new row is wrapped in a DataFrame
# with the same column names; passing a bare list makes scikit-learn warn that
# the input has no valid feature names. This also fails fast if the number of
# values does not match the number of feature columns.
new_data = pd.DataFrame([[450.00, 3, 10, 150.00]], columns=X.columns)
prediction = rf.predict(new_data)
print("Will the customer churn? = ", prediction[0])
# What the Output Means:
# 0 → Not Churned
# 1 → Churned

- Churned (1)
  - The customer has left or stopped buying.
  - The business might lose this customer.
- Not Churned (0)
  - The customer is still active and buying.
  - They are loyal (for now).