In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
expresso = pd.read_csv('Expresso_churn_dataset.csv')

In [6]:
expresso.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0


In [7]:
expresso.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   REGION          object 
 2   TENURE          object 
 3   MONTANT         float64
 4   FREQUENCE_RECH  float64
 5   REVENUE         float64
 6   ARPU_SEGMENT    float64
 7   FREQUENCE       float64
 8   DATA_VOLUME     float64
 9   ON_NET          float64
 10  ORANGE          float64
 11  TIGO            float64
 12  ZONE1           float64
 13  ZONE2           float64
 14  MRG             object 
 15  REGULARITY      int64  
 16  TOP_PACK        object 
 17  FREQ_TOP_PACK   float64
 18  CHURN           int64  
dtypes: float64(12), int64(2), object(5)
memory usage: 312.2+ MB


In [6]:
expresso.REGION.unique()

array(['FATICK', nan, 'DAKAR', 'LOUGA', 'TAMBACOUNDA', 'KAOLACK', 'THIES',
       'SAINT-LOUIS', 'KOLDA', 'KAFFRINE', 'DIOURBEL', 'ZIGUINCHOR',
       'MATAM', 'SEDHIOU', 'KEDOUGOU'], dtype=object)

In [8]:
expresso.isnull().sum()

user_id                 0
REGION             849299
TENURE                  0
MONTANT            756739
FREQUENCE_RECH     756739
REVENUE            726048
ARPU_SEGMENT       726048
FREQUENCE          726048
DATA_VOLUME       1060433
ON_NET             786675
ORANGE             895248
TIGO              1290016
ZONE1             1984327
ZONE2             2017224
MRG                     0
REGULARITY              0
TOP_PACK           902594
FREQ_TOP_PACK      902594
CHURN                   0
dtype: int64

In [9]:
categorical_columns = expresso.select_dtypes(include=['object', 'category']).columns
numerical_columns = expresso.select_dtypes(include=['number']).columns

In [10]:
for col in categorical_columns:
    expresso[col]=expresso[col].fillna(expresso[col].mode()[0])

for col in numerical_columns:
   expresso[col]=expresso[col].fillna(expresso[col].mean())

In [11]:
print(expresso.isnull().sum())

user_id           0
REGION            0
TENURE            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
MRG               0
REGULARITY        0
TOP_PACK          0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64


In [12]:
label_encoder = preprocessing.LabelEncoder() 
for i in expresso.columns:
    if expresso[i].dtype == 'object':
        expresso[i]= label_encoder.fit_transform(expresso[i])

In [13]:
expresso.columns

Index(['user_id', 'REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
       'ZONE1', 'ZONE2', 'MRG', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK',
       'CHURN'],
      dtype='object')

In [14]:
features = ['REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET',
       'MRG', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK']

In [15]:
x = expresso[features].values
y = expresso['CHURN'].values

In [16]:
scaler = StandardScaler()
x = scaler.fit_transform(x)
x

array([[-1.80508432e-01,  2.11319444e-01, -2.23850395e-01, ...,
         1.16469994e+00,  1.90461222e+00, -1.35941175e-01],
       [-6.44660968e-01, -2.78369137e+00,  4.76377933e-16, ...,
        -1.07877529e+00, -6.45781959e-01,  1.89773947e-16],
       [-6.44660968e-01,  2.11319444e-01, -3.37336729e-01, ...,
        -4.95471734e-01,  1.93326834e+00, -8.83773816e-01],
       ...,
       [-6.44660968e-01,  2.11319444e-01,  4.76377933e-16, ...,
        -1.21338381e+00, -6.45781959e-01,  1.89773947e-16],
       [ 2.14025424e+00,  2.11319444e-01,  7.80067169e-01, ...,
        -1.90718225e-03, -6.45781959e-01,  2.91391763e-01],
       [-6.44660968e-01,  2.11319444e-01,  4.76377933e-16, ...,
         1.52365598e+00, -6.45781959e-01,  1.89773947e-16]])

In [17]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=30)

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

In [19]:
y_pred = model.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

Accuracy: 0.8679998143032892
Confusion Matrix:
 [[320709  29027]
 [ 27840  53234]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92    349736
           1       0.65      0.66      0.65     81074

    accuracy                           0.87    430810
   macro avg       0.78      0.79      0.79    430810
weighted avg       0.87      0.87      0.87    430810



STOP HERE!

In [20]:
import pypickle

filename = 'model.pkl'
pypickle.save(filename, model)

[pypickle] File already exists and is not overwritten: [model.pkl]


False