### 1. Import necessary libraries
---

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### 2. Load data
---

In [None]:
# Read the cleaned training set directly from the project's GitHub repository.
train_cleaned = pd.read_csv(
	'https://raw.githubusercontent.com/Ahmed-M-Fayad/Customer-Churn-Prediction-and-Analysis/main/Data/train_cleaned.csv'
)
# Preview the first rows as a quick sanity check of the load.
train_cleaned.head()

Unnamed: 0,age,gender,region_category,membership_category,joining_date,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,last_visit_time,...,avg_time_spent,avg_transaction_value,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score,log_avg_time_spent
0,18,F,Village,Platinum Membership,2017-08-17,No,Gift Vouchers/Coupons,Smartphone,Wi-Fi,1900-01-01 16:08:02,...,5.709201,53005.25,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2.0,5.709201
1,32,F,City,Premium Membership,2017-08-28,No,Gift Vouchers/Coupons,Desktop,Mobile_Data,1900-01-01 12:38:13,...,5.727955,12838.38,788.482817,Yes,No,Yes,Solved,Quality Customer Care,1.0,5.727955
2,44,F,Town,No Membership,2016-11-11,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,1900-01-01 22:53:21,...,6.248352,21027.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5.0,6.248352
3,37,M,City,No Membership,2016-10-29,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,1900-01-01 15:57:50,...,3.993972,25239.56,567.66,No,Yes,Yes,Unsolved,Poor Website,5.0,3.993972
4,31,F,City,No Membership,2017-09-12,No,Credit/Debit Card Offers,Smartphone,Mobile_Data,1900-01-01 15:46:44,...,4.737338,24483.66,663.06,No,Yes,Yes,Solved,Poor Website,5.0,4.737338


#### 2.1 Create a copy for feature engineering

In [3]:
# Work on an independent (deep) copy so the loaded frame stays untouched by feature engineering.
train_preprocessed = train_cleaned.copy(deep=True)

## 3. Feature Engineering
---

### 3.1 Explore columns

In [6]:
# List every column of the working frame before new features are derived.
train_preprocessed.columns

Index(['age', 'gender', 'region_category', 'membership_category',
       'joining_date', 'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score', 'log_avg_time_spent'],
      dtype='object')

In [7]:
# Narrow the listing to the numeric-dtype columns only.
train_preprocessed.select_dtypes('number').columns

Index(['age', 'days_since_last_login', 'avg_time_spent',
       'avg_transaction_value', 'points_in_wallet', 'churn_risk_score',
       'log_avg_time_spent'],
      dtype='object')

### 3.2 Extracting features

In [8]:
# Source columns for the two ratio features.
wallet_points = train_preprocessed['points_in_wallet']
transaction_value = train_preprocessed['avg_transaction_value']
log_time_spent = train_preprocessed['log_avg_time_spent']

# Wallet points relative to the average transaction value.
train_preprocessed['points_per_transaction'] = wallet_points.div(transaction_value)
# Average transaction value relative to the log_avg_time_spent column.
# NOTE(review): neither denominator is checked for zeros here; confirm upstream cleaning rules them out.
train_preprocessed['transaction_value_per_time_unit'] = transaction_value.div(log_time_spent)

In [9]:
def time_of_day(hour):
	"""Return the named part of the day that an hour value falls into.

	Half-open ranges are used: [5, 12) is 'Morning', [12, 17) is 'Afternoon'
	and [17, 21) is 'Evening'. Any value outside those ranges is 'Night'.
	"""
	day_parts = (
		(5, 12, 'Morning'),
		(12, 17, 'Afternoon'),
		(17, 21, 'Evening'),
	)
	for start, end, label in day_parts:
		if start <= hour < end:
			return label
	# Nothing matched: late evening, the small hours, or a value that compares false everywhere.
	return 'Night'

In [11]:
# Parse the visit timestamps, then keep only the hour component.
visit_timestamps = pd.to_datetime(train_preprocessed['last_visit_time'])
train_preprocessed['last_visit_hour'] = visit_timestamps.dt.hour

# Bucket each hour into a named part of the day and store the result as a categorical column.
day_part_labels = train_preprocessed['last_visit_hour'].map(time_of_day)
train_preprocessed['last_visit_time_of_day'] = day_part_labels.astype('category')

### Save data (ready for visualization & dashboards)

In [12]:
# Count missing values per column now that the engineered features have been added.
train_preprocessed.isna().sum()

age                                0
gender                             0
region_category                    0
membership_category                0
joining_date                       0
joined_through_referral            0
preferred_offer_types              0
medium_of_operation                0
internet_option                    0
last_visit_time                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
points_in_wallet                   0
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
log_avg_time_spent                 0
points_per_transaction             0
transaction_value_per_time_unit    0
last_visit_hour                    0
last_visit_time_of_day             0
dtype: int64

In [None]:
# Export the frame with engineered (not yet encoded) features; the row index is left out of the file.
train_preprocessed.to_csv(
	path_or_buf='train_preprocessed_plusFeatures.csv',
	index=False,
)

### 3.3 Encoding categorical variables

In [14]:
# Print the distinct values of every non-numeric column to decide how each should be encoded.
non_numeric = train_preprocessed.select_dtypes(exclude='number')
for col, values in non_numeric.items():
	print(f'{col} Column', '-'*50)
	print(values.unique())

gender Column --------------------------------------------------
['F' 'M' 'Unknown']
region_category Column --------------------------------------------------
['Village' 'City' 'Town']
membership_category Column --------------------------------------------------
['Platinum Membership' 'Premium Membership' 'No Membership'
 'Gold Membership' 'Silver Membership' 'Basic Membership']
joining_date Column --------------------------------------------------
['2017-08-17' '2017-08-28' '2016-11-11' ... '2017-12-11' '2016-09-25'
 '2017-04-15']
joined_through_referral Column --------------------------------------------------
['No' 'Yes']
preferred_offer_types Column --------------------------------------------------
['Gift Vouchers/Coupons' 'Credit/Debit Card Offers' 'Without Offers']
medium_of_operation Column --------------------------------------------------
['Smartphone' 'Desktop' 'Both']
internet_option Column --------------------------------------------------
['Wi-Fi' 'Mobile_Data' 'Fiber_Opt

#### 3.3.1 Binary Encoding

In [15]:
binary_cols = ['gender', 'joined_through_referral', 'used_special_discount', 'offer_application_preference', 'past_complaint']
# One shared lookup for all columns: F/M and No/Yes become 0/1, and an
# 'Unknown' gender is placed halfway between the two.
binary_mapping = {'F':0, 'Unknown':0.5, 'M':1, 'No':0, 'Yes':1}
for col in binary_cols:
	encoded = train_preprocessed[col].map(binary_mapping)
	# Series.map turns every value missing from the dict into NaN. That would
	# silently blank the column on an unexpected category, or if this cell is
	# re-run on already-encoded data, so validate before overwriting anything.
	newly_missing = encoded.isna() & train_preprocessed[col].notna()
	if newly_missing.any():
		unmapped = train_preprocessed.loc[newly_missing, col].unique().tolist()
		raise ValueError(
			f"Column '{col}' contains values with no binary encoding: {unmapped}. "
			"If the column is already encoded, re-create train_preprocessed before re-running this cell."
		)
	train_preprocessed[col] = encoded

#### 3.3.2 Ordinal Encoding

In [16]:
# Ordinal columns are membership_category (encoded here) and feedback.
# Tiers are listed in the order used for the ordinal codes: a tier's position
# in this list becomes its integer code.
membership_order = [
	'No Membership',
	'Basic Membership',
	'Silver Membership',
	'Gold Membership',
	'Platinum Membership',
	'Premium Membership',
]
membership_tiers = pd.Categorical(
	train_preprocessed['membership_category'],
	categories=membership_order,
	ordered=True,
)
# .codes gives each row's position in membership_order (-1 for a value not in the list).
train_preprocessed['membership_category'] = membership_tiers.codes

In [17]:
# Feedback phrases grouped by sentiment.
positive_feedback = [
	'Products always in Stock',
	'Quality Customer Care',
	'Reasonable Price',
	'User Friendly Website',
]
negative_feedback = [
	'Poor Website',
	'Poor Customer Service',
	'Poor Product Quality',
	'Too many ads',
]
# Kept for reference only: get_sentiment never reads this list, because every
# phrase that is neither positive nor negative is scored as neutral.
neutral_feedback = ['No reason specified']

def get_sentiment(feedback):
	"""Score a feedback phrase as 0 (negative), 1 (neutral) or 2 (positive).

	Any value that appears in neither the positive nor the negative list,
	including unrecognised phrases, is scored 1.
	"""
	if feedback in negative_feedback:
		return 0
	return 2 if feedback in positive_feedback else 1
	
# Replace each feedback phrase with its 0/1/2 sentiment score, element by element.
feedback_scores = train_preprocessed['feedback'].apply(get_sentiment)
train_preprocessed['feedback'] = feedback_scores

#### 3.3.3 One-Hot Encoding

In [18]:
# Columns to expand into indicator columns.
cat_cols = [
	'region_category',
	'preferred_offer_types',
	'medium_of_operation',
	'internet_option',
	'last_visit_time_of_day',
	'complaint_status',
]
# drop_first=True omits the first level of each column; dtype=int stores the
# indicators as 0/1 integers.
train_preprocessed = pd.get_dummies(
	data=train_preprocessed,
	columns=cat_cols,
	drop_first=True,
	dtype=int,
)

### 3.4 Feature selection

In [19]:
# The target and the two raw date/time string columns are left out of the model inputs.
excluded_cols = ['churn_risk_score', 'joining_date', 'last_visit_time']
y = train_preprocessed['churn_risk_score']
X = train_preprocessed.drop(columns=excluded_cols)

# Hold out 20% for validation, stratified on the target, with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
	X,
	y,
	test_size=0.2,
	stratify=y,
	random_state=42,
)

In [20]:
# Scaling: learn the statistics from the training split only, then apply
# the same transform to both splits. Both become NumPy arrays from here on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [21]:
# Baseline random forest trained on all scaled features, with a fixed seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [22]:
importances = rf.feature_importances_
feature_names = X.columns

# Rank features from most to least important. reset_index() keeps the old
# positional index as an 'index' column, i.e. each feature's column position in X.
feature_importance_df = (
	pd.DataFrame({'feature': feature_names, 'importance': importances})
	.sort_values('importance', ascending=False)
	.reset_index()
)
# Running total of importance down the ranking.
feature_importance_df['cumulative_importance'] = feature_importance_df['importance'].cumsum()
feature_importance_df

Unnamed: 0,index,feature,importance,cumulative_importance
0,7,points_in_wallet,0.263515,0.263515
1,2,membership_category,0.221614,0.485128
2,11,feedback,0.11492,0.600048
3,6,avg_transaction_value,0.063788,0.663836
4,13,points_per_transaction,0.052865,0.716701
5,14,transaction_value_per_time_unit,0.042772,0.759473
6,12,log_avg_time_spent,0.033696,0.793169
7,5,avg_time_spent,0.032599,0.825768
8,0,age,0.031241,0.857009
9,4,days_since_last_login,0.027157,0.884166


In [23]:
# Keep the 10 highest-ranked features; in the importance table they reach a
# cumulative importance of about 0.88.
# NOTE(review): an earlier note here mentioned 11 features (~90%), but the slice
# and the accuracies reported below use 10. Confirm the intended cut-off.
N_SELECTED_FEATURES = 10

# The 'index' column holds each feature's original column position in X, which
# is what the scaled NumPy arrays need for positional column selection.
important_features = feature_importance_df.iloc[:N_SELECTED_FEATURES]['index'].tolist()
X_train_selected = X_train[:, important_features]
X_val_selected = X_val[:, important_features]

In [24]:
# Retrain with the same seed on the reduced feature set so the two models are comparable.
rf_selected = RandomForestClassifier(random_state=42)
rf_selected.fit(X_train_selected, y_train)

# Mean accuracy on the validation split for each model.
full_score, selected_score = (
	rf.score(X_val, y_val),
	rf_selected.score(X_val_selected, y_val),
)

print(f"accuracy (before feature selection): {full_score:.4f}")
print(f"accuracy (after feature selection): {selected_score:.4f}")

accuracy (before feature selection): 0.7801
accuracy (after feature selection): 0.7781


### Save data (ready for modeling)

In [25]:
# Export the fully encoded frame, without the row index. This writes all
# columns of train_preprocessed, not only the selected features.
train_preprocessed.to_csv(
	path_or_buf='train_preprocessed_plusFeatures_encoded.csv',
	index=False,
)