In [499]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [513]:
from datasets import load_dataset

dataset = load_dataset("scikit-learn/churn-prediction", split="train").to_csv("churn.csv")

Creating CSV from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 263.49ba/s]


Customer churn prediction dataset of a fictional telecommunication company made by IBM Sample Datasets. Context Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs.

Content Each row represents a customer, each column contains customer’s attributes described on the column metadata.

The data set includes information about:

Customers who left within the last month: the column is called Churn
Services that each customer has signed up for: phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
Customer account information: how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
Demographic info about customers: gender, age range, and if they have partners and dependents

In [519]:
data = pd.read_csv("churn.csv")
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [520]:
labels = LabelEncoder().fit_transform(data['Churn'])
print(labels)

data.drop(columns=["customerID", "Churn"], axis="columns", inplace=True)

print(data.columns)
print(len(data.columns))

[0 0 1 ... 0 1 0]
Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')
19


In [521]:
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6


In [504]:
uniqGender = data['gender'].unique()
uniqSeniorCitizen = data['SeniorCitizen'].unique()
uniqPartner = data['Partner'].unique()
uniqDependents = data['Dependents'].unique()
# uniqTenure = data['tenure'].unique() # ! TENURE IS NOT CATEGORY
uniqPhoneService = data['PhoneService'].unique()
uniqMultipleLines = data['MultipleLines'].unique()
uniqInternetService = data['InternetService'].unique()
uniqOnlineSecurity = data['OnlineSecurity'].unique()
uniqOnlineBackup = data['OnlineBackup'].unique()
uniqDeviceProtection = data['DeviceProtection'].unique()
uniqTechSupport = data['TechSupport'].unique()
uniqStreamingTV = data['StreamingTV'].unique()
uniqStreamingMovies = data['StreamingMovies'].unique()
uniqContract = data['Contract'].unique()
uniqPaperlessBilling = data['PaperlessBilling'].unique()
uniqPaymentMethod = data['PaymentMethod'].unique()
# uniqMonthlyCharges = data['MonthlyCharges'].unique()  # ! IS NOT CATEGORY
# uniqTotalCharges = data['TotalCharges'].unique() # ! IS NOT CATEGORY
uniqChurn = data['Churn'].unique()

In [522]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [523]:
one_hot_encoded = pd.get_dummies(data, columns=[
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], dtype=int)

# DONT RE RUN HERE

In [524]:
data = one_hot_encoded

data.to_csv("checkpoint.csv")

In [525]:
labels

array([0, 0, 1, ..., 0, 1, 0])

In [534]:
data = pd.read_csv('checkpoint.csv')

In [535]:
data

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,34,56.95,1889.50,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,2,53.85,108.15,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,45,42.30,1840.75,0,1,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,2,70.70,151.65,1,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,24,84.80,1990.50,0,1,1,0,0,1,0,...,1,0,1,0,0,1,0,0,0,1
7039,72,103.20,7362.90,1,0,1,0,0,1,0,...,1,0,1,0,0,1,0,1,0,0
7040,11,29.60,346.45,1,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
7041,4,74.40,306.60,0,1,0,1,0,1,1,...,0,1,0,0,0,1,0,0,0,1


# need to normalize tenure, monthly charges, total charges

In [536]:
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

In [538]:
data

array([[0.01388889, 0.11542289, 0.0012751 , ..., 0.        , 1.        ,
        0.        ],
       [0.47222222, 0.38507463, 0.21586661, ..., 0.        , 0.        ,
        1.        ],
       [0.02777778, 0.35422886, 0.01031041, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.15277778, 0.11293532, 0.03780868, ..., 0.        , 1.        ,
        0.        ],
       [0.05555556, 0.55870647, 0.03321025, ..., 0.        , 0.        ,
        1.        ],
       [0.91666667, 0.86965174, 0.78764136, ..., 0.        , 0.        ,
        0.        ]])

In [539]:
torch.save(data, "tensors.pt")

In [540]:
loaded_tensor = torch.load('tensors.pt')

In [541]:
loaded_tensor

array([[0.01388889, 0.11542289, 0.0012751 , ..., 0.        , 1.        ,
        0.        ],
       [0.47222222, 0.38507463, 0.21586661, ..., 0.        , 0.        ,
        1.        ],
       [0.02777778, 0.35422886, 0.01031041, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.15277778, 0.11293532, 0.03780868, ..., 0.        , 1.        ,
        0.        ],
       [0.05555556, 0.55870647, 0.03321025, ..., 0.        , 0.        ,
        1.        ],
       [0.91666667, 0.86965174, 0.78764136, ..., 0.        , 0.        ,
        0.        ]])

## data is ready now

In [544]:
loaded_tensor.shape

(7043, 46)

In [576]:
X = torch.tensor(data)
X

tensor([[0.0139, 0.1154, 0.0013,  ..., 0.0000, 1.0000, 0.0000],
        [0.4722, 0.3851, 0.2159,  ..., 0.0000, 0.0000, 1.0000],
        [0.0278, 0.3542, 0.0103,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.1528, 0.1129, 0.0378,  ..., 0.0000, 1.0000, 0.0000],
        [0.0556, 0.5587, 0.0332,  ..., 0.0000, 0.0000, 1.0000],
        [0.9167, 0.8697, 0.7876,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [697]:
class ChurnModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.l1 = nn.Linear(in_features=46, out_features=64)
        self.l2 = nn.Linear(in_features=64, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))

In [564]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [698]:
ChurnModel_Inst1 = ChurnModel().to(device)

ChurnModel_Inst1

ChurnModel(
  (l1): Linear(in_features=46, out_features=64, bias=True)
  (l2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [699]:
ChurnModel_Inst1.eval()
with torch.inference_mode():
    untrained = ChurnModel_Inst1(X.to(device=device, dtype=torch.float32))

print(untrained)
print(len(untrained))
# made 7043 predictions based on the data provided

tensor([[-0.0759],
        [-0.2436],
        [-0.0564],
        ...,
        [ 0.0199],
        [-0.1982],
        [-0.1632]], device='cuda:0')
7043


In [700]:
loss_fn = nn.BCEWithLogitsLoss() # using this instead of BCELoss to put the sigmoid activation function (adds non-linearity to the model for better learning) in one operation
optimizer = torch.optim.SGD(params=ChurnModel_Inst1.parameters(), lr=0.00000000000000000000000000000000000001)

In [701]:
tensor_labels = torch.tensor(labels, dtype=torch.float32)
tensor_labels

tensor([0., 0., 1.,  ..., 0., 1., 0.])

In [702]:
# SPLIT THE DATA

X_train, X_test, y_train, y_test = train_test_split(X, tensor_labels, test_size=0.2, random_state=7)

X_train = X_train.to(device=device, dtype=torch.float32)
X_test = X_test.to(device=device, dtype=torch.float32)
y_train = y_train.to(device=device, dtype=torch.float32)
y_test = y_test.to(device=device, dtype=torch.float32)
len(X_train), len(X_test), len(y_train), len(y_test)

(5634, 1409, 5634, 1409)

In [703]:
X_train.dtype

torch.float32

In [704]:
# get the accuracy of it
testTens=torch.arange(1,11,step=1)
testTens2=torch.arange(1,20,step=2)

print(testTens)
print(testTens2)

def accuracy_fn(truth, pred):
    correct = torch.eq(truth, pred).sum().item()
    percentage = correct / len(pred) * 100
    print(f"accuracy: {percentage} %")

accuracy_fn(testTens, testTens2)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
tensor([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19])
accuracy: 10.0 %


In [705]:
epochs = 1

In [706]:
for epoch in range(epochs):

    print(epoch)

    ChurnModel_Inst1.train()

    out_logits = ChurnModel_Inst1(X_train).squeeze()

    out_preds = torch.round(torch.sigmoid(out_logits))

    print(out_logits)
    print(out_preds)

    loss = loss_fn(out_logits, y_train)

    print(loss)

    accuracy = accuracy_fn(out_preds, y_train)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()
    
    ChurnModel_Inst1.eval()
    with torch.inference_mode():
        test_out_logits = ChurnModel_Inst1(X_test.to(device=device, dtype=torch.float32)).squeeze()
        test_out_preds = torch.round(torch.sigmoid(out_logits))

        print(test_out_logits)
        print(test_out_preds)

        loss = loss_fn(test_out_logits, y_test)

        print(loss)

        accuracy = accuracy_fn(out_preds, y_train)

    print(untrained)
    print(len(untrained))

0
tensor([-0.1896, -0.0240, -0.0570,  ..., -0.0696, -0.3132, -0.1660],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0',
       grad_fn=<RoundBackward0>)
tensor(nan, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
accuracy: 70.99751508697196 %
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0')
tensor(nan, device='cuda:0')
accuracy: 70.99751508697196 %
tensor([[-0.0759],
        [-0.2436],
        [-0.0564],
        ...,
        [ 0.0199],
        [-0.1982],
        [-0.1632]], device='cuda:0')
7043


## WE GETTING HIGHER ACCURACY LETS FUCKING TGOOOOOOOO

just need to figure out why its getting nan after the first epoch...

In [751]:
# save model

from pathlib import Path

path = Path('saved_models')    

def get_latest_file():
    
    filelist = list(path.iterdir())

    filelistnums = []

    for file in filelist:

        filename = str(file).split("_")

        print(f"filename[1] {filename[2]}")

        filelistnums.append(int(filename[2]))
        

    return max(filelistnums)


def save_model(model2save, pathname, modelpathname):

    namenum = str(get_latest_file())

    MODEL_PATH = Path(pathname)
    MODEL_PATH.mkdir(parents=True, exist_ok=True)
    MODEL_NAME = f"{modelpathname + namenum}.pt"
    MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

    print(f"saving {MODEL_NAME}")
    torch.save(obj=model2save.state_dict(), f=MODEL_SAVE_PATH)

In [752]:
save_model(ChurnModel_Inst1, pathname="saved_models", modelpathname="best")

filename[1] 1
saving best1.pt


In [753]:
ChurnModel_Inst2 = ChurnModel()

ChurnModel_Inst2.load_state_dict(torch.load(f="saved_models/best1.pt"))

<All keys matched successfully>