In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [4]:
# Read the raw Expresso churn export into the working frame and preview it.
DATASET_CSV = 'Expresso_churn_dataset (1).csv'
data = pd.read_csv(DATASET_CSV)
data.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0


In [5]:
# Print the frame summary: index range, column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   REGION          object 
 2   TENURE          object 
 3   MONTANT         float64
 4   FREQUENCE_RECH  float64
 5   REVENUE         float64
 6   ARPU_SEGMENT    float64
 7   FREQUENCE       float64
 8   DATA_VOLUME     float64
 9   ON_NET          float64
 10  ORANGE          float64
 11  TIGO            float64
 12  ZONE1           float64
 13  ZONE2           float64
 14  MRG             object 
 15  REGULARITY      int64  
 16  TOP_PACK        object 
 17  FREQ_TOP_PACK   float64
 18  CHURN           int64  
dtypes: float64(12), int64(2), object(5)
memory usage: 312.2+ MB


In [6]:
# Number of missing entries in each column.
data.isna().sum()

user_id                 0
REGION             849299
TENURE                  0
MONTANT            756739
FREQUENCE_RECH     756739
REVENUE            726048
ARPU_SEGMENT       726048
FREQUENCE          726048
DATA_VOLUME       1060433
ON_NET             786675
ORANGE             895248
TIGO              1290016
ZONE1             1984327
ZONE2             2017224
MRG                     0
REGULARITY              0
TOP_PACK           902594
FREQ_TOP_PACK      902594
CHURN                   0
dtype: int64

In [7]:
# Snapshot the categorical (object) and numerical column views BEFORE imputation.
# Later cells inspect `cat` / `num`; both snapshots still contain the original NaNs.
cat = data.select_dtypes('object')
num = data.select_dtypes('number')

# Columns that must never be imputed: the row identifier and the prediction target
# (a mean-filled CHURN would no longer be a 0/1 label).
no_impute = {'user_id', 'CHURN'}

# Fill gaps column by column:
#   - categorical columns -> most frequent value (mode)
#   - numerical columns   -> column mean
# Only columns that actually contain NaNs are visited, which skips the expensive
# mode() over complete columns where fillna would be a no-op anyway.
# NOTE(review): the fill statistics are computed on the full frame, i.e. before the
# train/test split performed later in this notebook, so test rows influence them.
columns_with_gaps = data.columns[data.isna().any().to_numpy()]
for col in columns_with_gaps:
    if col in no_impute:
        continue
    if col in cat.columns:
        modes = data[col].mode()
        if modes.empty:
            # Column is entirely NaN: there is no mode to fill with.
            continue
        data[col] = data[col].fillna(modes.iloc[0])
    elif col in num.columns:
        data[col] = data[col].fillna(data[col].mean())

In [9]:
# Column labels of the working frame.
data.columns

Index(['user_id', 'REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
       'ZONE1', 'ZONE2', 'MRG', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK',
       'CHURN'],
      dtype='object')

In [10]:
# Column labels of the object-dtype (categorical) snapshot.
cat.columns

Index(['user_id', 'REGION', 'TENURE', 'MRG', 'TOP_PACK'], dtype='object')

In [11]:
# Column labels of the numeric snapshot.
num.columns

Index(['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
       'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
       'REGULARITY', 'FREQ_TOP_PACK', 'CHURN'],
      dtype='object')

In [12]:
# Row count for each TENURE label.
data['TENURE'].value_counts()

TENURE
K > 24 month     2043201
I 18-21 month      45278
H 15-18 month      26006
G 12-15 month      14901
J 21-24 month      12725
F 9-12 month        9328
E 6-9 month         1839
D 3-6 month          770
Name: count, dtype: int64

In [13]:
# Map every TENURE label to the lower bound (in months) of the range it names.
tenure_dict = {
    'K > 24 month': 24,
    'J 21-24 month': 21,
    'I 18-21 month': 18,
    'H 15-18 month': 15,
    'G 12-15 month': 12,
    'F 9-12 month': 9,
    'E 6-9 month': 6,
    'D 3-6 month': 3,
}

# Convert only while the column still holds the raw labels. Mapping an
# already-numeric column would look integers up in a string-keyed dict and
# turn every row into NaN, so this guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['TENURE']):
    tenure_months = data['TENURE'].map(tenure_dict)

    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of silently introducing gaps into a feature column.
    unmapped = data.loc[tenure_months.isna() & data['TENURE'].notna(), 'TENURE'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped TENURE labels: {list(unmapped)}")

    data['TENURE'] = tenure_months

data['TENURE'].value_counts()

TENURE
24    2043201
18      45278
15      26006
12      14901
21      12725
9        9328
6        1839
3         770
Name: count, dtype: int64

In [14]:
# Distinct TOP_PACK values currently present in the frame.
data['TOP_PACK'].unique()

array(['On net 200F=Unlimited _call24H', 'All-net 500F=2000F;5d',
       'On-net 1000F=10MilF;10d', 'Data:1000F=5GB,7d',
       'Mixt 250F=Unlimited_call24H',
       'MIXT:500F= 2500F on net _2500F off net;2d', 'On-net 500F_FNF;3d',
       'Data: 100 F=40MB,24H', 'MIXT: 200mnoff net _unl on net _5Go;30d',
       'Jokko_Daily', 'Data: 200 F=100MB,24H', 'Data:490F=1GB,7d',
       'Twter_U2opia_Daily', 'On-net 500=4000,10d', 'Data:1000F=2GB,30d',
       'IVR Echat_Daily_50F', 'Pilot_Youth4_490',
       'All-net 500F =2000F_AllNet_Unlimited', 'Twter_U2opia_Weekly',
       'Data:200F=Unlimited,24H', 'On-net 200F=60mn;1d',
       'All-net 600F= 3000F ;5d', 'Pilot_Youth1_290',
       'All-net 1000F=(3000F On+3000F Off);5d', 'VAS(IVR_Radio_Daily)',
       'Data:3000F=10GB,30d', 'All-net 1000=5000;5d',
       'Twter_U2opia_Monthly', 'MIXT: 390F=04HOn-net_400SMS_400 Mo;4h\t',
       'FNF2 ( JAPPANTE)', 'Yewouleen_PKG', 'Data:150F=SPPackage1,24H',
       'WIFI_Family_2MBPS', 'Data:500F=2GB,24H', 

In [15]:
# Row count per REGION value.
# NOTE: missing REGION entries were filled with the column mode earlier in the
# notebook, so the most frequent region's count includes those imputed rows.
data['REGION'].value_counts()

REGION
DAKAR          1362570
THIES           180052
SAINT-LOUIS     119886
LOUGA            99053
KAOLACK          96986
DIOURBEL         66911
TAMBACOUNDA      55074
KAFFRINE         43963
KOLDA            38743
FATICK           35643
MATAM            29083
ZIGUINCHOR       21945
SEDHIOU           3119
KEDOUGOU          1020
Name: count, dtype: int64

In [16]:
# Distinct values of every categorical column, keyed by column name.
# `cat` is the object-dtype view of the frame captured in an earlier cell.
values = {col: cat[col].unique() for col in cat.columns}

In [17]:
# Display the per-column arrays of unique categorical values collected above.
values

{'user_id': array(['00000bfd7d50f01092811bc0c8d7b0d6fe7c3596',
        '00000cb4a5d760de88fecb38e2f71b7bec52e834',
        '00001654a9d9f96303d9969d0a4a851714a4bb57', ...,
        'fffff172fda1b4bb38a95385951908bb92379809',
        'fffff5911296937a37f09a37a549da2e0dad6dbb',
        'fffff6dbff1508ea2bfe814e5ab2729ce6b788c2'], dtype=object),
 'REGION': array(['FATICK', nan, 'DAKAR', 'LOUGA', 'TAMBACOUNDA', 'KAOLACK', 'THIES',
        'SAINT-LOUIS', 'KOLDA', 'KAFFRINE', 'DIOURBEL', 'ZIGUINCHOR',
        'MATAM', 'SEDHIOU', 'KEDOUGOU'], dtype=object),
 'TENURE': array(['K > 24 month', 'I 18-21 month', 'G 12-15 month', 'H 15-18 month',
        'J 21-24 month', 'F 9-12 month', 'D 3-6 month', 'E 6-9 month'],
       dtype=object),
 'MRG': array(['NO'], dtype=object),
 'TOP_PACK': array(['On net 200F=Unlimited _call24H', nan, 'On-net 1000F=10MilF;10d',
        'Data:1000F=5GB,7d', 'Mixt 250F=Unlimited_call24H',
        'MIXT:500F= 2500F on net _2500F off net;2d',
        'All-net 500F=2000F;5

In [19]:
# Integer-encode REGION; the fitted encoder stays available as `region_encoder`.
# Fit only while the column still holds raw labels: re-running this cell on an
# already-encoded column would re-fit the encoder on integers and discard the
# original label -> code mapping.
if not pd.api.types.is_numeric_dtype(data['REGION']):
    region_encoder = LabelEncoder()
    data['REGION'] = region_encoder.fit_transform(data['REGION'])

In [20]:
# Integer-encode TOP_PACK; the fitted encoder stays available as `tp_encoder`.
# Fit only while the column still holds raw labels: re-running this cell on an
# already-encoded column would re-fit the encoder on integers and discard the
# original label -> code mapping.
if not pd.api.types.is_numeric_dtype(data['TOP_PACK']):
    tp_encoder = LabelEncoder()
    data['TOP_PACK'] = tp_encoder.fit_transform(data['TOP_PACK'])

In [23]:
# Remove the per-user identifier column from the working frame.
data = data.drop(columns='user_id')

In [24]:
# Feature matrix: everything except the MRG column and the CHURN target.
x = data.drop(columns=['MRG', 'CHURN'])
# Target vector, cast to integer class labels.
y = data['CHURN'].astype('int')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [25]:
# Fit a small random forest and report per-class metrics on the held-out split.
# random_state pins the forest's internal sampling so that re-running the
# notebook reproduces the same model and report (the train/test split is
# seeded with 42 as well).
model = RandomForestClassifier(n_estimators = 4, random_state = 42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# `acc` holds the full text report (precision / recall / F1 per class), not only accuracy.
acc = classification_report(y_test, y_pred)
print(acc)

              precision    recall  f1-score   support

           0       0.91      0.93      0.92    349773
           1       0.66      0.61      0.63     81037

    accuracy                           0.87    430810
   macro avg       0.78      0.77      0.78    430810
weighted avg       0.86      0.87      0.87    430810



In [26]:
import joblib
# Persist the trained model and both fitted label encoders as joblib files
# (relative paths, so they are written to the current working directory).
# Presumably the encoders are saved so the same REGION / TOP_PACK encoding can
# be applied to new data at prediction time — confirm against the consumer.
joblib.dump(model, "rf_model.joblib")
joblib.dump(region_encoder, "region_encoder.joblib")
joblib.dump(tp_encoder, "tp_encoder.joblib")

['tp_encoder.joblib']