FEATURE ENGINEERING FOR SAFARICOM CHURN PREDICTION

In [4]:
import pandas as pd

# Load the dataset
safaricom_data = pd.read_csv('safaricom_churn_data.csv')  

safaricom_data.head()

Unnamed: 0,Customer ID,Age,Gender,Tenure (Years),Region,Monthly Data Usage (MB),Call Duration (Minutes),SMS Sent,M-Pesa Transactions,Customer Service Interactions,...,Third Last Month Call Duration (Minutes),Last Month SMS Sent,Second Last Month SMS Sent,Third Last Month SMS Sent,Promotions,Payment Method,Data Rate per MB,Call Rate per Minute,SMS Rate per Message,Churn History
0,CUST0001,56,Male,2,Mombasa,6768,434,107,35,9,...,440,37,335,92,1,Credit Card,0.05,0.02,0.01,1
1,CUST0002,69,Male,10,Mombasa,1410,383,331,17,8,...,946,42,145,459,5,Credit Card,0.05,0.02,0.01,0
2,CUST0003,46,Female,9,Nakuru,4840,559,422,89,9,...,780,116,309,64,1,Debit Card,0.05,0.02,0.01,0
3,CUST0004,32,Female,8,Nakuru,4509,193,89,180,9,...,895,404,126,320,4,Mobile Money,0.05,0.02,0.01,0
4,CUST0005,60,Female,9,Nairobi,7128,735,204,198,0,...,919,474,174,9,5,Mobile Money,0.05,0.02,0.01,0


Features Added

1. Egagement Score

In [7]:
# Create an engagement score
safaricom_data['Engagement Score'] = (safaricom_data['Monthly Data Usage (MB)'] + 
                             safaricom_data['Call Duration (Minutes)'] + 
                             safaricom_data['SMS Sent'] + 
                             safaricom_data['M-Pesa Transactions'])

# Check the new feature
print(safaricom_data[['Engagement Score', 'Churn Indicator']].head())

   Engagement Score  Churn Indicator
0              7344                0
1              2141                0
2              5910                0
3              4971                0
4              8265                0


2. Average Monthly Usage

In [9]:
# Average Monthly Data Usage over the last three months
safaricom_data['Avg Monthly Data Usage (MB)'] = (
    safaricom_data['Last Month Data Usage (MB)'] +
    safaricom_data['Second Last Month Data Usage (MB)'] +
    safaricom_data['Third Last Month Data Usage (MB)']
) / 3

# Average Call Duration over the last three months
safaricom_data['Avg Call Duration (Minutes)'] = (
    safaricom_data['Last Month Call Duration (Minutes)'] +
    safaricom_data['Second Last Month Call Duration (Minutes)'] +
    safaricom_data['Third Last Month Call Duration (Minutes)']
) / 3

# Average SMS Sent over the last three months
safaricom_data['Avg SMS Sent'] = (
    safaricom_data['Last Month SMS Sent'] +
    safaricom_data['Second Last Month SMS Sent'] +
    safaricom_data['Third Last Month SMS Sent']
) / 3

In [10]:
safaricom_data['Monthly Spend'] = (safaricom_data['Monthly Data Usage (MB)'] * safaricom_data['Data Rate per MB'])
+ (safaricom_data['Call Duration (Minutes)'] * safaricom_data['Call Rate per Minute']) 
+ (safaricom_data['SMS Sent'] * safaricom_data['SMS Rate per Message'])

0      1.07
1      3.31
2      4.22
3      0.89
4      2.04
       ... 
995    2.77
996    2.02
997    0.65
998    1.07
999    2.67
Length: 1000, dtype: float64

3. Churn History

In [12]:
safaricom_data['Previous Churn'] = safaricom_data['Churn History'].apply(lambda x: 1 if x > 0 else 0)  # Assuming 'Churn History' is the number of times they have churned
safaricom_data['Previous Churn'] 

0      1
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    1
999    0
Name: Previous Churn, Length: 1000, dtype: int64

4. Churn Probability

In [14]:
# Churn Probability: This can be based on historical churn behavior
safaricom_data['Churn Probability'] = (
    safaricom_data['Churn Indicator'] +
    safaricom_data['Churn History']
) / 2

5. Tenure

In [16]:
# Tenure in Months
safaricom_data['Tenure (Months)'] = safaricom_data['Tenure (Years)'] * 12

6. Customer Interaction Ratio

In [18]:
# Customer Interaction Ratio
safaricom_data['Customer Interaction Ratio'] = safaricom_data['Customer Service Interactions'] / safaricom_data['Tenure (Months)']
safaricom_data['Customer Interaction Ratio']

0      0.375000
1      0.066667
2      0.083333
3      0.093750
4      0.000000
         ...   
995    0.125000
996    0.020833
997    0.000000
998    0.166667
999    0.058333
Name: Customer Interaction Ratio, Length: 1000, dtype: float64

7. Region and Gender Encoding

In [20]:
# Region Encoding
safaricom_data['Region Encoding'] = safaricom_data['Region'].astype('category').cat.codes

# Gender Encoding
safaricom_data['Gender Encoding'] = safaricom_data['Gender'].map({'Male': 1, 'Female': 0})

8. Promotions received

In [22]:
safaricom_data['Promotions Received'] = safaricom_data['Promotions'].apply(lambda x: 1 if x > 0 else 0)  

9.  Region-Based Features

In [24]:
# Example: Create a binary feature indicating if the customer is from a high churn region
high_churn_regions = ['RegionA', 'RegionB']  # Define high churn regions
safaricom_data['High Churn Region'] = safaricom_data['Region'].apply(lambda x: 1 if x in high_churn_regions else 0)
safaricom_data['High Churn Region']

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: High Churn Region, Length: 1000, dtype: int64

In [25]:
# Save the updated dataset
safaricom_data.to_csv('safaricom_data_engineered.csv', index=False)

In [26]:
safaricom_engineered = pd.read_csv('safaricom_data_engineered.csv')  

In [27]:
safaricom_engineered.head()

Unnamed: 0,Customer ID,Age,Gender,Tenure (Years),Region,Monthly Data Usage (MB),Call Duration (Minutes),SMS Sent,M-Pesa Transactions,Customer Service Interactions,...,Avg SMS Sent,Monthly Spend,Previous Churn,Churn Probability,Tenure (Months),Customer Interaction Ratio,Region Encoding,Gender Encoding,Promotions Received,High Churn Region
0,CUST0001,56,Male,2,Mombasa,6768,434,107,35,9,...,154.666667,338.4,1,0.5,24,0.375,1,1,1,0
1,CUST0002,69,Male,10,Mombasa,1410,383,331,17,8,...,215.333333,70.5,0,0.0,120,0.066667,1,1,1,0
2,CUST0003,46,Female,9,Nakuru,4840,559,422,89,9,...,163.0,242.0,0,0.0,108,0.083333,3,0,1,0
3,CUST0004,32,Female,8,Nakuru,4509,193,89,180,9,...,283.333333,225.45,0,0.0,96,0.09375,3,0,1,0
4,CUST0005,60,Female,9,Nairobi,7128,735,204,198,0,...,219.0,356.4,0,0.0,108,0.0,2,0,1,0
