<a href="https://colab.research.google.com/github/Aakriti555/Nammi-Assignment2/blob/main/NAAMI_Task_2_Feedforward_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, recall_score, f1_score, confusion_matrix
)

In [None]:
import gdown

# Google Drive share link of the zipped task data. fuzzy=True asks gdown to
# work out the file id from this "/view"-style URL.
archive_url = "https://drive.google.com/file/d/1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo/view"
gdown.download(url=archive_url, output="file.zip", fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo
To: /content/file.zip
100%|██████████| 8.66M/8.66M [00:00<00:00, 43.8MB/s]


'file.zip'

In [None]:
!unzip "/content/file.zip" -d "/content"

Archive:  /content/file.zip
replace /content/TASK_2/blinded_test_set.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: /content/TASK_2/blinded_test_set.csv  
  inflating: /content/__MACOSX/TASK_2/._blinded_test_set.csv  
  inflating: /content/TASK_2/train_set.csv  
  inflating: /content/__MACOSX/TASK_2/._train_set.csv  
  inflating: /content/TASK_2/test_set.csv  
  inflating: /content/__MACOSX/TASK_2/._test_set.csv  


In [None]:
# Read the three task splits (train, test, blinded test) into DataFrames
train_df, test_df, blinded_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/TASK_2/train_set.csv",
        "/content/TASK_2/test_set.csv",
        "/content/TASK_2/blinded_test_set.csv",
    )
)

In [None]:
# Keep only the rows whose CLASS label is present
label_columns = ["CLASS"]
train_df = train_df.dropna(subset=label_columns)
test_df = test_df.dropna(subset=label_columns)

In [None]:
# Remove the ID column from every split, in place. After this cell the ID
# values are no longer available on any of the three frames.
for frame in (train_df, test_df, blinded_test_df):
    frame.drop(columns=["ID"], inplace=True)

In [None]:
# Treat +/-inf as missing, then drop every row that still contains a NaN.
# Only the two labelled splits are filtered; the blinded split is untouched.
for frame in (train_df, test_df):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    frame.dropna(inplace=True)

In [None]:
# Split each labelled frame into its CLASS target (y) and the remaining
# feature columns (X)
y_train = train_df["CLASS"]
X_train = train_df.drop(columns="CLASS")

y_test = test_df["CLASS"]
X_test = test_df.drop(columns="CLASS")

In [None]:
# Align columns (in case of mismatched one-hot encodings): the "left" join
# keeps X_train's column set, and columns X_test lacks are filled with 0.
aligned_train, aligned_test = X_train.align(X_test, join="left", axis=1, fill_value=0)
X_train, X_test = aligned_train, aligned_test


In [None]:
# Standardize features: statistics are learned from the training features only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Convert the training data to float32 PyTorch tensors; the labels become a
# single column of shape (n_samples, 1)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)

In [None]:
# Same conversion for the test split: float32 features and an (n_samples, 1)
# label column
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

In [None]:
# Define Neural Network


class FeedforwardNN(nn.Module):
    """Fully connected binary classifier: input_dim -> 64 -> 32 -> 1.

    Both hidden layers are followed by ReLU; the single output unit is passed
    through a sigmoid, so the network returns values between 0 and 1.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = []
        in_features = input_dim
        # Hidden layers, each followed by a ReLU activation
        for out_features in (64, 32):
            stack.append(nn.Linear(in_features, out_features))
            stack.append(nn.ReLU())
            in_features = out_features
        # Output head: one unit squashed by a sigmoid for binary classification
        stack.append(nn.Linear(in_features, 1))
        stack.append(nn.Sigmoid())
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run x (last dimension must equal input_dim) through the layer stack."""
        return self.net(x)

# Seed PyTorch's RNG before building the network so the random weight
# initialisation is the same on every Restart & Run All. 42 matches the
# random_state already used for KFold in this notebook.
torch.manual_seed(42)
# The input width is the number of feature columns in the training tensor
model = FeedforwardNN(X_train_tensor.shape[1])

In [None]:
# Cross-validation setup: 3 shuffled folds with a fixed seed.
# NOTE(review): `kf` is not referenced by any later cell visible here.
n_folds = 3
kf = KFold(random_state=42, shuffle=True, n_splits=n_folds)

In [None]:
# Per-fold accumulators: test-set predictions, blinded-set predictions, metrics
cv_predictions_test, cv_predictions_blinded, cv_metrics = [], [], []

In [None]:
# Train the model with full-batch updates: every epoch runs one forward and
# backward pass over the whole training tensor and takes a single Adam step.

epochs = 100
criterion = nn.BCELoss()  # Binary Cross-Entropy on the sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()  # nothing inside the loop leaves training mode
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Report the loss on every 10th epoch
    if epoch % 10 == 9:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 10/100, Loss: 0.4216
Epoch 20/100, Loss: 0.2131
Epoch 30/100, Loss: 0.0728
Epoch 40/100, Loss: 0.0179
Epoch 50/100, Loss: 0.0051
Epoch 60/100, Loss: 0.0022
Epoch 70/100, Loss: 0.0013
Epoch 80/100, Loss: 0.0010
Epoch 90/100, Loss: 0.0008
Epoch 100/100, Loss: 0.0007


In [None]:
# Inference on the test split; gradients are disabled for the forward pass
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)

y_pred_probs = test_outputs.numpy()
# Outputs strictly above 0.5 are labelled 1, everything else 0
y_pred_labels = (y_pred_probs > 0.5).astype(int)

In [None]:
# Flatten the (n, 1) column arrays into 1-D vectors for the metric functions
y_true, y_pred, y_proba = (
    column.flatten()
    for column in (y_test_tensor.numpy(), y_pred_labels, y_pred_probs)
)

In [None]:
# Metrics on the test split (class 1 is the positive class)
accuracy = accuracy_score(y_true, y_pred)
auroc = roc_auc_score(y_true, y_proba)  # uses the probabilities, not the 0/1 labels
sensitivity = recall_score(y_true, y_pred)  # recall of the positive class
f1 = f1_score(y_true, y_pred)

# Pin labels=[0, 1] so the matrix is always 2x2. Without it, data in which only
# one class occurs yields a 1x1 matrix and the four-way unpacking below fails.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# Specificity is the true-negative rate; it is undefined (NaN) when the split
# contains no negative samples.
negatives = tn + fp
specificity = tn / negatives if negatives > 0 else float("nan")

In [None]:
# Print results: one aligned line per metric, four decimals each
report_lines = [
    f"Accuracy:    {accuracy:.4f}",
    f"AUROC:       {auroc:.4f}",
    f"Sensitivity: {sensitivity:.4f}",
    f"Specificity: {specificity:.4f}",
    f"F1 Score:    {f1:.4f}",
]
print("\n".join(report_lines))

Accuracy:    0.6863
AUROC:       0.6554
Sensitivity: 0.5600
Specificity: 0.8077
F1 Score:    0.6364


## Export a CSV file for each dataset containing the predicted class probabilities (one column per class, plus an ID column)


In [None]:
# Predict on original test set
model.eval()
with torch.no_grad():
    # reshape(-1) always gives a 1-D array, even for a single-row input
    # (a bare squeeze() would collapse that case to a 0-d value).
    prob_class_1_test = model(X_test_tensor).numpy().reshape(-1)
    prob_class_0_test = 1 - prob_class_1_test

# Attach the real sample IDs. The ID column is dropped from test_df earlier in
# the notebook and rows are filtered out afterwards, so sequential 1..N numbers
# would not identify the source rows. test_df keeps its original row labels
# (the index is never reset), so they select the matching IDs from the CSV.
if 'ID' in test_df.columns:
    ids_test = test_df['ID'].to_numpy()
else:
    ids_test = (
        pd.read_csv("/content/TASK_2/test_set.csv", usecols=["ID"])
        .loc[test_df.index, "ID"]
        .to_numpy()
    )

# Create DataFrame: one probability column per class, plus the ID column
test_results_df = pd.DataFrame({
    "ID": ids_test,
    "class_0": prob_class_0_test,
    "class_1": prob_class_1_test
})

# Save to CSV
test_results_df.to_csv("test_predictions.csv", index=False)
print("Saved: test_predictions.csv")

Saved: test_predictions.csv


In [None]:
# Predict on original train set
model.eval()
with torch.no_grad():
    # reshape(-1) always gives a 1-D array, even for a single-row input
    # (a bare squeeze() would collapse that case to a 0-d value).
    prob_class_1_train = model(X_train_tensor).numpy().reshape(-1)
    prob_class_0_train = 1 - prob_class_1_train

# Attach the real sample IDs. The ID column is dropped from train_df earlier in
# the notebook and rows are filtered out afterwards, so sequential 1..N numbers
# would not identify the source rows. train_df keeps its original row labels
# (the index is never reset), so they select the matching IDs from the CSV.
if 'ID' in train_df.columns:
    ids_train = train_df['ID'].to_numpy()
else:
    ids_train = (
        pd.read_csv("/content/TASK_2/train_set.csv", usecols=["ID"])
        .loc[train_df.index, "ID"]
        .to_numpy()
    )

# Create DataFrame: one probability column per class, plus the ID column
train_results_df = pd.DataFrame({
    "ID": ids_train,
    "class_0": prob_class_0_train,
    "class_1": prob_class_1_train
})

# Save to CSV
train_results_df.to_csv("train_predictions.csv", index=False)
print("Saved: train_predictions.csv")

Saved: train_predictions.csv


In [None]:
# Preview the first rows of the blinded split (its ID column was dropped in an
# earlier cell, so only feature columns remain)
blinded_test_df.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_3229,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238
0,13249.25,13323.0,5322.087891,0.40169,0.019253,0.131701,1.965488,0.50878,0.965488,7.621183,...,453.349939,453.349939,1646.0,1.0,162.029162,0.098438,0.0,30.580378,3.888605,0.098438
1,60593.666667,60804.0,21327.521484,0.351976,0.010976,0.042804,2.858719,0.349807,1.858719,38.462982,...,492.250478,492.250478,7853.0,1.0,961.759455,0.12247,0.0,26.690038,3.695084,0.12247
2,51978.833333,52193.0,19574.339844,0.376583,0.010708,0.040742,2.906154,0.344097,1.906154,25.820516,...,482.387417,482.387417,6644.0,1.0,763.046057,0.114847,0.0,30.037774,3.804517,0.114847
3,47737.416667,47943.0,17247.173828,0.361293,0.011891,0.050236,2.710158,0.368982,1.710158,62.531559,...,475.620243,475.620243,6017.0,1.0,718.741732,0.119452,0.0,27.964103,3.69986,0.119452
4,33029.458333,33261.0,15901.136719,0.481423,0.009294,0.030688,3.19406,0.313081,2.19406,11.310782,...,417.949466,417.949466,4116.0,1.0,314.568513,0.076426,0.0,31.80214,4.078748,0.076426


In [None]:
# One-hot encode and align columns
blinded_df_encoded = pd.get_dummies(blinded_test_df)

# Put the columns in exactly the layout the scaler was fitted on: same names,
# same order, columns missing here filled with 0 and unseen ones discarded.
# This mirrors the left-join alignment applied to X_test. feature_names_in_
# only exists when the scaler was fitted on a DataFrame with string column
# names, hence the getattr guard.
fitted_columns = getattr(scaler, "feature_names_in_", None)
if fitted_columns is not None:
    blinded_df_encoded = blinded_df_encoded.reindex(columns=fitted_columns, fill_value=0)

# Scale using the same scaler used for X_train
blinded_scaled = scaler.transform(blinded_df_encoded)

# Convert to tensor
blinded_tensor = torch.tensor(blinded_scaled, dtype=torch.float32)

In [None]:
# Forward pass on the blinded samples with gradients disabled
model.eval()
with torch.no_grad():
    predicted_probs = model(blinded_tensor)

# Binary classification: outputs strictly above 0.5 become class 1
predicted_classes = predicted_probs.gt(0.5).int()


In [None]:
# Drop the size-1 dimension and show the predicted labels as a NumPy array
blinded_labels = predicted_classes.squeeze().numpy()
print("Predicted CLASS labels:")
print(blinded_labels)


Predicted CLASS labels:
[0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0]


In [None]:
model.eval()
with torch.no_grad():
    blinded_outputs = model(blinded_tensor)

# The sigmoid output is taken as the class 1 probability; in the binary case
# the class 0 probability is its complement
prob_class_1 = blinded_outputs.squeeze().numpy()
prob_class_0 = 1 - prob_class_1


In [None]:
# Attach the real sample IDs. The ID column is dropped from blinded_test_df
# earlier in the notebook, so sequential 1..N numbers would be made-up
# identifiers. The frame keeps its original row labels (the index is never
# reset), so they select the matching IDs from the source CSV.
if 'ID' in blinded_test_df.columns:
    ids = blinded_test_df['ID'].to_numpy()
else:
    ids = (
        pd.read_csv("/content/TASK_2/blinded_test_set.csv", usecols=["ID"])
        .loc[blinded_test_df.index, "ID"]
        .to_numpy()
    )

# Create DataFrame with class probabilities: one column per class, plus ID
results_df = pd.DataFrame({
    "ID": ids,
    "class_0": prob_class_0,
    "class_1": prob_class_1
})

In [None]:
# Write the blinded-set probabilities to disk without the DataFrame index
blinded_csv_path = "blinded_predictions.csv"
results_df.to_csv(blinded_csv_path, index=False)
print("Saved predictions to blinded_predictions.csv")


Saved predictions to blinded_predictions.csv
