In [72]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch import optim

# DATA PREPROCESSING

In [74]:
df = pd.read_csv("data/customer_churn/customer_churn.csv")

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Drop useless columns

In [75]:
df.drop(columns=['customerID'], axis='columns', inplace=True)

In [76]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Deal with na values

In [77]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Encode categorical columns

In [78]:
for column in df.columns:
    print(f"{column}: {df[column].unique()}")

gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalChar

In [79]:
yes_no_columns = []

for column in df.columns:
    if df[column].isin(['Yes']).any() and df[column].isin(['No']).any():
        yes_no_columns.append(column)

In [80]:
yes_no_columns

['Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling',
 'Churn']

In [81]:
df[yes_no_columns] = df[yes_no_columns].replace('No internet service', 'No')
df[yes_no_columns] = df[yes_no_columns].replace('No phone service', 'No')

In [82]:
for column in yes_no_columns:
    print(f"{column}: {df[column].unique()}")

Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No' 'Yes']
OnlineSecurity: ['No' 'Yes']
OnlineBackup: ['Yes' 'No']
DeviceProtection: ['No' 'Yes']
TechSupport: ['No' 'Yes']
StreamingTV: ['No' 'Yes']
StreamingMovies: ['No' 'Yes']
PaperlessBilling: ['Yes' 'No']
Churn: ['No' 'Yes']


In [83]:
for column in yes_no_columns:
    df[column].replace({'Yes': 1, 'No': 0}, inplace=True)

In [84]:
df['gender'].replace({'Female': 1, 'Male': 0}, inplace=True)

In [85]:
for column in df.columns:
    print(f"{column}: {df[column].unique()}")

gender: [1 0]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: [0 1]
MultipleLines: [0 1]
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: [1 0]
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: [0 1]


In [86]:
string_columns = df.select_dtypes(include=['object', 'string']).columns

string_columns

Index(['InternetService', 'Contract', 'PaymentMethod', 'TotalCharges'], dtype='object')

In [87]:
(df['TotalCharges'] == ' ').sum()

11

In [88]:
df = df[df['TotalCharges'] != ' ']

In [89]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [90]:
print(f"{column}: {df[column].unique()}")
string_columns = df.select_dtypes(include=['object', 'string']).columns

string_columns

Churn: [0 1]


Index(['InternetService', 'Contract', 'PaymentMethod'], dtype='object')

In [91]:
df = pd.get_dummies(df, columns=string_columns)

In [92]:
for column in df.columns:
    print(f"{column}: {df[column].unique()}")

gender: [1 0]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
PhoneService: [0 1]
MultipleLines: [0 1]
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
PaperlessBilling: [1 0]
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn: [0 1]
InternetService_DSL: [ True False]
InternetService_Fiber optic: [False  True]
InternetService_No: [False  True]
Contract_Month-to-month: [ True False]
Contract_One year: [False  True]
Contract_Two year: [False  True]
PaymentMethod_Bank transfer (automatic): [False  True]
PaymentMethod_Credit card (automatic): [False  True]
PaymentMethod_Electronic check: [ True False]
Pa

In [93]:
for column in df.columns:
    print(f"{column} : {df[column].dtype}")

gender : int64
SeniorCitizen : int64
Partner : int64
Dependents : int64
tenure : int64
PhoneService : int64
MultipleLines : int64
OnlineSecurity : int64
OnlineBackup : int64
DeviceProtection : int64
TechSupport : int64
StreamingTV : int64
StreamingMovies : int64
PaperlessBilling : int64
MonthlyCharges : float64
TotalCharges : float64
Churn : int64
InternetService_DSL : bool
InternetService_Fiber optic : bool
InternetService_No : bool
Contract_Month-to-month : bool
Contract_One year : bool
Contract_Two year : bool
PaymentMethod_Bank transfer (automatic) : bool
PaymentMethod_Credit card (automatic) : bool
PaymentMethod_Electronic check : bool
PaymentMethod_Mailed check : bool


### Scale numeric values

In [94]:
scale_columns = ["tenure", "MonthlyCharges", "TotalCharges"]

In [95]:
scaler = MinMaxScaler()

df[scale_columns] = scaler.fit_transform(df[scale_columns])

In [96]:
for column in scale_columns:
    print(f"{column}: {df[column].unique()}")

tenure: [0.         0.46478873 0.01408451 0.61971831 0.09859155 0.29577465
 0.12676056 0.38028169 0.85915493 0.16901408 0.21126761 0.8028169
 0.67605634 0.33802817 0.95774648 0.71830986 0.98591549 0.28169014
 0.15492958 0.4084507  0.64788732 1.         0.22535211 0.36619718
 0.05633803 0.63380282 0.14084507 0.97183099 0.87323944 0.5915493
 0.1971831  0.83098592 0.23943662 0.91549296 0.11267606 0.02816901
 0.42253521 0.69014085 0.88732394 0.77464789 0.08450704 0.57746479
 0.47887324 0.66197183 0.3943662  0.90140845 0.52112676 0.94366197
 0.43661972 0.76056338 0.50704225 0.49295775 0.56338028 0.07042254
 0.04225352 0.45070423 0.92957746 0.30985915 0.78873239 0.84507042
 0.18309859 0.26760563 0.73239437 0.54929577 0.81690141 0.32394366
 0.6056338  0.25352113 0.74647887 0.70422535 0.35211268 0.53521127]
MonthlyCharges: [0.11542289 0.38507463 0.35422886 ... 0.44626866 0.25820896 0.60149254]
TotalCharges: [0.0012751  0.21586661 0.01031041 ... 0.03780868 0.03321025 0.78764136]


### Train test split

In [97]:
bool_columns = df.select_dtypes(include=['object', 'bool']).columns
df[bool_columns] = df[bool_columns].astype('int')

In [98]:
X = df.drop(['Churn'], axis='columns')
y = df['Churn']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NEURAL NETWORK

### Set up device

In [100]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


In [101]:
if device != 'cpu':
    print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3060 Ti


### Convert df to tensors

In [102]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [103]:
training_data = TensorDataset(X_train_tensor, y_train_tensor)

test_data = TensorDataset(X_test_tensor, y_test_tensor)

In [104]:
batch_size = 32
shuffle = True

In [105]:
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=shuffle)

test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=shuffle)

### Define model

In [106]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.function_stack = nn.Sequential(
            nn.Linear(26,26),
            nn.ReLU(),
            nn.Linear(26,15),
            nn.ReLU(),
            nn.Linear(15,1),
            #nn.Sigmoid()
        )

    def forward(self, x):
        #x = self.flatten(x)
        logits = self.function_stack(x)
        return logits

In [107]:
model = NeuralNetwork().to(device)

print(model)

NeuralNetwork(
  (function_stack): Sequential(
    (0): Linear(in_features=26, out_features=26, bias=True)
    (1): ReLU()
    (2): Linear(in_features=26, out_features=15, bias=True)
    (3): ReLU()
    (4): Linear(in_features=15, out_features=1, bias=True)
  )
)


In [108]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

loss_fn = nn.BCEWithLogitsLoss()

In [109]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        y = y.view(-1, 1).float()

        logits = model(X)
        loss = loss_fn(logits, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [110]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            y = y.view(-1, 1).float()

            logits = model(X)
            test_loss += loss_fn(logits, y).item()
            pred = torch.sigmoid(logits)

            predicted_classes = (pred > 0.5).float()   # 0 or 1
            correct += (predicted_classes == y).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

### Train

In [111]:
epochs = 100
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 78.4%, Avg loss: 0.451188 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 78.5%, Avg loss: 0.441706 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 79.2%, Avg loss: 0.436055 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 78.7%, Avg loss: 0.435152 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 78.5%, Avg loss: 0.436493 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 79.3%, Avg loss: 0.433357 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 79.1%, Avg loss: 0.433291 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 78.7%, Avg loss: 0.435823 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 79.2%, Avg loss: 0.433969 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 79.2%, Avg loss: 0.432450 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 78.7%, Avg los