### Import Libraries

In [454]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import Datasets

In [455]:
# Read the pre-split training and testing CSVs (paths are relative to the
# notebook's working directory) into two separate DataFrames.
train_dataset, test_dataset = (
    pd.read_csv(csv_name)
    for csv_name in (
        "customer_churn_dataset-training-master.csv",
        "customer_churn_dataset-testing-master.csv",
    )
)

### Finding Empty Values (Training)

In [456]:
# Count missing values per column of the training data (isna is the same check as isnull).
train_dataset.isna().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

### Finding Empty Values (Testing)

In [457]:
# Count missing values per column of the testing data (isna is the same check as isnull).
test_dataset.isna().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

### Create Preprocessing Pipeline

In [458]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Feature groups, selected by column name so the transformer does not depend
# on column position. Columns that are not listed here ("CustomerID", "Churn")
# are discarded by the ColumnTransformer below if they are present in the input.
num_cols = ["Age", "Tenure", "Usage Frequency", "Support Calls", "Payment Delay", "Total Spend", "Last Interaction"]
cat_cols = ["Subscription Type", "Contract Length"]
gender_cols = ["Gender"]

# No imputation step is included: the null checks earlier in this notebook
# report zero missing values for every column in both datasets.

# Numeric features: standardise to zero mean and unit variance.
num_preprocessor = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Multi-valued categorical features: map each category to an integer code.
# NOTE(review): with the default `categories="auto"`, OrdinalEncoder assigns
# codes in sorted order of the values seen during fit, which is not
# necessarily a meaningful order for a linear model -- consider passing an
# explicit `categories=` list or one-hot encoding these columns instead.
cat_preprocessor = Pipeline(steps=[
    ("label", OrdinalEncoder())
])

# Gender: one-hot encode and drop the first category's indicator column so the
# remaining indicator(s) are not redundant.
gender_preprocessor = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first"))
])

# Apply each sub-pipeline to its own column group and concatenate the results.
# The transformer names are unchanged from the original cell because they
# prefix the names returned by get_feature_names_out(); note that none of them
# performs imputation despite the "_imputer" suffix.
pre_transformer = ColumnTransformer(
    transformers=[
        ("mean_imputer", num_preprocessor, num_cols),
        ("gender_imputer", gender_preprocessor, gender_cols),
        ("mode_imputer", cat_preprocessor, cat_cols),
    ]
)




In [459]:
# Show the first five raw training records, one record (as a Series) at a time.
for row_idx in range(5):
    print(train_dataset.iloc[row_idx, :])

CustomerID                  2
Age                        30
Gender                 Female
Tenure                     39
Usage Frequency            14
Support Calls               5
Payment Delay              18
Subscription Type    Standard
Contract Length        Annual
Total Spend             932.0
Last Interaction           17
Churn                       1
Name: 0, dtype: object
CustomerID                 3
Age                       65
Gender                Female
Tenure                    49
Usage Frequency            1
Support Calls             10
Payment Delay              8
Subscription Type      Basic
Contract Length      Monthly
Total Spend            557.0
Last Interaction           6
Churn                      1
Name: 1, dtype: object
CustomerID                   4
Age                         55
Gender                  Female
Tenure                      14
Usage Frequency              4
Support Calls                6
Payment Delay               18
Subscription Type        Basi

### Splitting into Features (x) and Target (y)

In [460]:
# Select features and target by column name rather than by position, so the
# split keeps working if the CSV column order ever changes. For the column
# layout shown by the null checks above (CustomerID first, Churn last) this
# yields exactly the same frames as slicing with iloc[:, 1:-1] / iloc[:, -1].
ID_COL = "CustomerID"
TARGET_COL = "Churn"

x_train = train_dataset.drop(columns=[ID_COL, TARGET_COL])
y_train = train_dataset[TARGET_COL]
x_test = test_dataset.drop(columns=[ID_COL, TARGET_COL])
y_test = test_dataset[TARGET_COL]

In [461]:
# Print the training features followed by the training target, one after the other.
print(x_train, y_train, sep="\n")

        Age  Gender  Tenure  Usage Frequency  Support Calls  Payment Delay  \
0        30  Female      39               14              5             18   
1        65  Female      49                1             10              8   
2        55  Female      14                4              6             18   
3        58    Male      38               21              7              7   
4        23    Male      32               20              5              8   
...     ...     ...     ...              ...            ...            ...   
440827   42    Male      54               15              1              3   
440828   25  Female       8               13              1             20   
440829   26    Male      35               27              1              5   
440830   28    Male      55               14              2              0   
440831   31    Male      48               20              1             14   

       Subscription Type Contract Length  Total Spend  Last Int

### Creating Classifier Pipeline

In [462]:
from sklearn.linear_model import LogisticRegression

# Chain the column preprocessing and the classifier into one estimator, so that
# fit() and predict() always run the same transformations before the model.
model_steps = [
    ("preprocessor", pre_transformer),
    ("classifier", LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [463]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=x_train, y=y_train)

### Predicting the Test Results

In [464]:
# Predict churn labels for the test features. The predictions reuse x_test's
# index so that index-aligned operations against y_test (e.g. building a
# DataFrame from both Series) pair each prediction with its own row, even when
# the test frame's index is not the default 0..n-1 range.
y_pred = pd.Series(pipeline.predict(x_test), index=x_test.index)

In [465]:
# Side-by-side table of predicted and actual labels; the two Series are
# combined by index alignment.
comparison_columns = {"Predicted": y_pred, "Actual": y_test}
df = pd.DataFrame(data=comparison_columns)

In [466]:
# Raise the display row cap so the 100-row preview below is printed in full
# instead of being truncated. This changes a notebook-wide pandas option.
pd.set_option("display.max_rows", 300)

# First 100 prediction/actual pairs.
print(df.iloc[:100])

    Predicted  Actual
0           1       1
1           1       0
2           1       0
3           1       0
4           1       0
5           1       0
6           1       1
7           1       0
8           1       0
9           0       0
10          1       1
11          1       0
12          0       0
13          1       0
14          1       0
15          1       1
16          1       0
17          1       0
18          1       0
19          1       0
20          0       0
21          0       0
22          0       0
23          1       0
24          1       0
25          1       1
26          1       0
27          1       0
28          1       0
29          1       1
30          1       0
31          0       0
32          0       0
33          0       0
34          1       1
35          1       0
36          1       0
37          1       1
38          1       0
39          1       0
40          1       1
41          1       0
42          1       1
43          1       0
44        

In [467]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix layout: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows whose predicted label matches the actual label
# (left as the cell's last expression so the notebook displays it).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[ 7808 26073]
 [  532 29961]]


0.5867120265945879