In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc, confusion_matrix
import psutil

In [2]:
from google.colab import drive
try:
        drive.mount('/content/drive')
except Exception as e:
        print(f"An error occurred during mounting: {e}")

import zipfile
zip_path = '/content/drive/MyDrive/KaggleV2-May-2016.csv.zip'
extract_path = '/content/drive/MyDrive/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

Mounted at /content/drive


In [3]:
class Model(nn.Module):
  def __init__(self, in_features, h1, h2, out_features):
    super().__init__()
    self.fc1 = nn.Linear(in_features, h1)
    self.fc2 = nn.Linear(h1, h2)
    self.fc3 = nn.Linear(h2, out_features)

  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.softmax(self.fc3(x),dim = 1)
    return x


In [29]:
df = pd.read_csv('/content/drive/MyDrive/KaggleV2-May-2016.csv')


df=df.drop(columns=['PatientId','AppointmentID','ScheduledDay','AppointmentDay',])
df.rename(columns={'No-show':'given'},inplace = True)
df['given'] = df['given'].map({'No':0,'Yes':1})
df['Gender'] = df['Gender'].map({'F':0,'M':1})

df = pd.get_dummies(df, columns=["Neighbourhood"])

features = ['Gender', 'Age', 'Scholarship', 'Hipertension',
            'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received'] + [col for col in df.columns if col.startswith("Neighbourhood_")]

X = df[features].values
y = df['given'].values

SS_X = StandardScaler()
X = SS_X.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

input_size = X_train.shape[1]
model=Model(input_size,64,32,2)

class_counts = np.bincount(y_train.numpy())
class_weights = torch.tensor([1.0 / class_counts[0], 1.0 / class_counts[1]], dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimiser = optim.Adam(model.parameters(), lr=0.0001)

epochs = 200
losses = []

for i in range(epochs):
  y_pred = model.forward(X_train)
  loss = criterion(y_pred, y_train)
  losses.append(loss.detach().numpy())
  if i % 10 == 0:
    print(f'Epoch {i} loss is {loss}')
  optimiser.zero_grad()
  loss.backward()
  optimiser.step()


Epoch 0 loss is 0.6941185593605042
Epoch 10 loss is 0.6935303211212158
Epoch 20 loss is 0.6930410265922546
Epoch 30 loss is 0.6926124095916748
Epoch 40 loss is 0.692211389541626
Epoch 50 loss is 0.69181227684021
Epoch 60 loss is 0.6914030313491821
Epoch 70 loss is 0.6909701228141785
Epoch 80 loss is 0.6905131936073303
Epoch 90 loss is 0.6900309920310974
Epoch 100 loss is 0.6895164251327515
Epoch 110 loss is 0.6889621019363403
Epoch 120 loss is 0.688359797000885
Epoch 130 loss is 0.6877023577690125
Epoch 140 loss is 0.6869978904724121
Epoch 150 loss is 0.6862664818763733
Epoch 160 loss is 0.6855141520500183
Epoch 170 loss is 0.6847543716430664
Epoch 180 loss is 0.6839889883995056
Epoch 190 loss is 0.6832241415977478


In [30]:
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [31]:
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            # Handle binary or multiclass classification based on output shape
            if outputs.shape[1] > 1: # If output shape is greater than 1, it's likely multiclass or binary with two outputs
                # For models with 2 outputs for binary classification (like yours with softmax)
                # The predicted class is the index with the highest probability
                probs = outputs # Since your model uses softmax, outputs are already probabilities
                preds = torch.argmax(outputs, dim=1) # Get the index of the max probability

            elif outputs.shape[1] == 1: # This case is for models with a single output for binary classification (e.g., using sigmoid)
                probs = torch.sigmoid(outputs).squeeze()
                preds = (probs > 0.5).int()
            else:
                 raise ValueError(f"Unexpected output shape: {outputs.shape}")


            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # For probabilities, if it's binary with two outputs, we often care about the probability of the positive class (index 1)
            if outputs.shape[1] > 1:
                # Assuming class 1 is the positive class
                all_probs.extend(probs[:, 1].cpu().numpy())
            elif outputs.shape[1] == 1:
                 all_probs.extend(probs.cpu().numpy())


    return np.array(all_preds), np.array(all_labels), np.array(all_probs)

In [32]:
def calculate_metrics(y_true, y_pred, y_probs, multiclass=False):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted' if multiclass else 'binary')
    cm = confusion_matrix(y_true, y_pred)

    if not multiclass:
        precision, recall, _ = precision_recall_curve(y_true, y_probs)
        pr_auc = auc(recall, precision)
    return acc, f1, pr_auc, cm


In [33]:
def get_memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    memory_used_mb = mem_info.rss / 1024 ** 2
    return memory_used_mb

In [34]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
y_pred, y_true, y_probs = evaluate_model(model, test_loader, device)

multiclass = len(np.unique(y_true)) > 2
pr_auc_input = y_probs if not multiclass else None
acc, f1, pr_auc, cm = calculate_metrics(y_true, y_pred, pr_auc_input, multiclass)

mem_used = get_memory_usage()

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
if pr_auc is not None:
    print(f"PR AUC: {pr_auc:.4f}")
print("Confusion Matrix:")
print(cm)
print(f"Memory Usage: {mem_used:.2f} MB")


Accuracy: 0.5606
F1 Score: 0.3434
PR AUC: 0.2645
Confusion Matrix:
[[9852 7817]
 [1897 2540]]
Memory Usage: 2095.13 MB
