In [42]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn


In [43]:
# index_col=0 makes the first CSV column the row index instead of a data column
df = pd.read_csv(
    "dataset/telescope_data.csv",
    index_col=0,
)

In [44]:
# show the first rows to sanity-check that the file loaded as expected
df.head()


Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [45]:
# number of rows per class label ("g" and "h")
class_counts = df["class"].value_counts()
g_count, h_count = class_counts["g"], class_counts["h"]

g_count, h_count

(12332, 6688)

In [46]:
# Balance the classes by undersampling: keep every "h" row and draw the same
# number of "g" rows at random. The draw is seeded so that the balanced set
# (and every metric computed from it) is the same on each Restart & Run All.
is_g = df["class"] == "g"
g_df = df[is_g].sample(n=h_count, random_state=42)
h_df = df[df["class"] == "h"]

# concat the g and h dataframes
balanced_df = pd.concat([g_df, h_df])

# size of the balanced dataframe
balanced_df.shape

(13376, 11)

In [47]:
# Split the balanced data into disjoint train / validation / test sets.
#
# Each subset is sampled ONCE and the complement is obtained by dropping
# exactly those sampled rows. Calling .sample() a second time inside .drop()
# would draw a different random subset, so the "held-out" rows would overlap
# the training rows (data leakage) and some rows would end up in no split.
# random_state pins the draw so the split is reproducible.

# 70% train+validation, 30% test
train_val_df = balanced_df.sample(frac=0.7, random_state=42)
test_df = balanced_df.drop(train_val_df.index)

# of the 70%: 80% training, 20% validation
train_df = train_val_df.sample(frac=0.8, random_state=42)
val_df = train_val_df.drop(train_df.index)


In [48]:
# separate the "class" target column from the feature columns in every split
train_features = train_df.drop(columns="class")
train_labels = train_df["class"]

val_features = val_df.drop(columns="class")
val_labels = val_df["class"]

test_features = test_df.drop(columns="class")
test_labels = test_df["class"]


In [49]:
# Encode the string labels as integers: "g" -> 0, "h" -> 1.
#
# The encoding is derived from the untouched "class" column of each split
# rather than from the *_labels variables themselves, so re-running this cell
# is safe. Mapping already-encoded 0/1 labels through this dict would turn
# every label into NaN.
class_to_index = {"g": 0, "h": 1}
train_labels = train_df["class"].map(class_to_index)
val_labels = val_df["class"].map(class_to_index)
test_labels = test_df["class"].map(class_to_index)

In [50]:
# feed-forward network with 2 hidden layers
class Model(nn.Module):
    """Three stacked linear layers, each followed by a sigmoid.

    Args:
        input_size: number of input features per sample.
        h1_size: width of the first hidden layer.
        h2_size: width of the second hidden layer.
        output_size: number of output units.

    Because the last layer is also passed through a sigmoid, every output
    value lies strictly between 0 and 1.
    """

    def __init__(self, input_size, h1_size, h2_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, h1_size)
        self.linear2 = nn.Linear(h1_size, h2_size)
        self.linear3 = nn.Linear(h2_size, output_size)

    def forward(self, x):
        """Apply linear1 -> linear2 -> linear3, with a sigmoid after each."""
        for layer in (self.linear1, self.linear2, self.linear3):
            x = torch.sigmoid(layer(x))
        return x

In [51]:
# hyperparameters

# network shape
input_size = 10  # one input per feature column (every column except "class")
h1_size = 5
h2_size = 5
output_size = 1  # a single output unit for the two-class problem

# optimisation settings
num_epochs = 100
batch_size = 128
learning_rate = 1e-4

In [52]:
from torch.utils.data import Dataset, DataLoader


class TelescopeDataset(Dataset):
    """Pairs up two parallel indexable containers as a map-style dataset.

    Args:
        features: indexable container; ``features[i]`` is sample ``i``.
        labels: indexable container; ``labels[i]`` is the label of sample ``i``.
            Its length defines the dataset length.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels
        self.n_samples = len(labels)

    def __len__(self):
        """Number of samples, taken from ``labels`` at construction time."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.features[index]
        target = self.labels[index]
        return sample, target


# wrap the underlying arrays of each split so that DataLoader can batch them
train_dataset = TelescopeDataset(
    features=train_features.values,
    labels=train_labels.values,
)
val_dataset = TelescopeDataset(
    features=val_features.values,
    labels=val_labels.values,
)

# mini-batch iterators; both reshuffle their samples on every pass
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [53]:
model = Model(input_size, h1_size, h2_size, output_size)

# loss and optimizer
# BCELoss takes probabilities, which matches Model.forward ending in a sigmoid
criterion = nn.BCELoss()  # binary cross entropy loss (for binary classification)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # ---- training pass ----
    # Re-enter train mode at the start of EVERY epoch: the validation pass
    # below switches the model to eval mode, and it would otherwise stay there
    # for all remaining epochs.
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # forward pass; labels are reshaped to (batch, 1) to match the output
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float().unsqueeze(1))

        # backward and optimize (gradients are cleared once per step)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 5 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}"
            )

    # ---- validation pass ----
    model.eval()
    n_correct = 0
    n_samples = 0
    # no gradients are needed for evaluation, so no optimizer calls here either
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs.float())
            # round the sigmoid output to the nearest class index (0 or 1)
            predicted = torch.round(outputs)
            n_samples += labels.size(0)
            n_correct += (predicted == labels.float().unsqueeze(1)).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f"Accuracy of the network on the {n_samples} validation samples: {acc} %")

Epoch [1/100], Step [5/59], Loss: 0.6930
Epoch [1/100], Step [10/59], Loss: 0.7139
Epoch [1/100], Step [15/59], Loss: 0.7094
Epoch [1/100], Step [20/59], Loss: 0.7043
Epoch [1/100], Step [25/59], Loss: 0.7094
Epoch [1/100], Step [30/59], Loss: 0.6966
Epoch [1/100], Step [35/59], Loss: 0.7080
Epoch [1/100], Step [40/59], Loss: 0.6914
Epoch [1/100], Step [45/59], Loss: 0.7084
Epoch [1/100], Step [50/59], Loss: 0.7049


Epoch [1/100], Step [55/59], Loss: 0.6969
Accuracy of the network on the 1873 validation samples: 51.09450080085424 %
Epoch [2/100], Step [5/59], Loss: 0.6889
Epoch [2/100], Step [10/59], Loss: 0.6956
Epoch [2/100], Step [15/59], Loss: 0.6991
Epoch [2/100], Step [20/59], Loss: 0.6921
Epoch [2/100], Step [25/59], Loss: 0.6914
Epoch [2/100], Step [30/59], Loss: 0.7085
Epoch [2/100], Step [35/59], Loss: 0.6984
Epoch [2/100], Step [40/59], Loss: 0.6922
Epoch [2/100], Step [45/59], Loss: 0.6963
Epoch [2/100], Step [50/59], Loss: 0.6982
Epoch [2/100], Step [55/59], Loss: 0.7042
Accuracy of the network on the 1873 validation samples: 51.09450080085424 %
Epoch [3/100], Step [5/59], Loss: 0.6849
Epoch [3/100], Step [10/59], Loss: 0.6917
Epoch [3/100], Step [15/59], Loss: 0.6904
Epoch [3/100], Step [20/59], Loss: 0.6949
Epoch [3/100], Step [25/59], Loss: 0.7080
Epoch [3/100], Step [30/59], Loss: 0.7035
Epoch [3/100], Step [35/59], Loss: 0.7063
Epoch [3/100], Step [40/59], Loss: 0.6918
Epoch [3/1

In [54]:
# test the model

test_dataset = TelescopeDataset(test_features.values, test_labels.values)
# accuracy does not depend on batch order, so there is no reason to shuffle
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# set eval mode explicitly instead of relying on whatever mode the training
# cell happened to leave the model in
model.eval()
n_correct = 0
n_samples = 0
# evaluation only: gradients are disabled, so the optimizer is not involved
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.float())
        # round the sigmoid output to the nearest class index (0 or 1)
        predicted = torch.round(outputs)
        n_samples += labels.size(0)
        n_correct += (predicted == labels.float().unsqueeze(1)).sum().item()

acc = 100.0 * n_correct / n_samples
print(f"Accuracy of the network on the {n_samples} test samples: {acc} %")

Accuracy of the network on the 4013 test samples: 74.63244455519562 %
