In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [None]:
# Load the Fashion-MNIST training CSV into a DataFrame.
# NOTE(review): the path is an absolute Colab location — confirm the file is fully uploaded before running.
df = pd.read_csv('/content/fashion-mnist_train.csv')

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Seed PyTorch's global random number generator.
torch.manual_seed(42)

<torch._C.Generator at 0x7af3b5498950>

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Features are every column after the first; the target is the first column.
x = df.iloc[:,1:]
y = df.iloc[:,0]

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Divide by 255.0 — assumes raw pixel intensities lie in 0..255, giving features in [0, 1] (TODO confirm).
x_train = x_train/255.0
x_test = x_test/255.0

In [None]:
# Number of feature columns; the same expression is passed to the network as its input dimension.
x_train.shape[1]

784

In [None]:
#custom dataset class
class Custumdataset(Dataset):
  """In-memory dataset that pairs a float32 feature tensor with a long label tensor."""

  def __init__(self,features,labels):
    """Convert `features` and `labels` (objects exposing `.values`) to tensors once, up front."""
    feature_array = features.values
    label_array = labels.values
    self.features = torch.tensor(feature_array, dtype=torch.float32)
    self.labels = torch.tensor(label_array, dtype=torch.long)

  def __len__(self):
    """Return the number of samples (rows of the feature tensor)."""
    return self.features.size(0)

  def __getitem__(self,idx):
    """Return the `(features, label)` pair stored at position `idx`."""
    sample = self.features[idx]
    target = self.labels[idx]
    return sample, target

In [None]:
# Wrap the held-out split as a tensor-backed dataset.
test_dataset = Custumdataset(x_test,y_test)

In [None]:
# Wrap the training split as a tensor-backed dataset.
train_dataset = Custumdataset(x_train,y_train)

In [None]:
#nn class
class NeuralNetwork(nn.Module):
  """Fully connected network with a configurable number of identical hidden blocks.

  Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. A final Linear
  layer maps the last hidden width to `output_dim`; no activation follows it.
  """

  def __init__(self,input_dim,output_dim,num_hidden_layers,num_neuron_layer,dropout_rate):
    """Assemble the layer stack.

    input_dim: width of the input vectors.
    output_dim: width of the final Linear layer's output.
    num_hidden_layers: how many hidden blocks to stack.
    num_neuron_layer: width of every hidden block.
    dropout_rate: probability passed to each Dropout layer.
    """
    super().__init__()
    modules = []
    in_features = input_dim

    # hidden blocks; only the first one sees the raw input width
    for _ in range(num_hidden_layers):
      modules.extend([
        nn.Linear(in_features,num_neuron_layer),
        nn.BatchNorm1d(num_neuron_layer),
        nn.ReLU(),
        nn.Dropout(dropout_rate),
      ])
      in_features = num_neuron_layer

    # output layer
    modules.append(nn.Linear(in_features,output_dim))

    self.model = nn.Sequential(*modules)


  def forward(self,x):
    """Run `x` through the sequential stack and return its output."""
    return self.model(x)

In [None]:
def objective(trial):
  """Train one network with Optuna-sampled hyperparameters and return its accuracy.

  Args:
    trial: Optuna trial whose `suggest_*` methods supply the hyperparameters.

  Returns:
    float: share of the samples in `test_dataset` that the trained model
    classifies correctly.

  Relies on the notebook-level names `train_dataset`, `test_dataset`,
  `x_train`, `device` and `NeuralNetwork`.

  NOTE(review): the score is computed on `test_dataset`, so the selected
  hyperparameters are tuned against that split; use a separate validation
  split if an unbiased final estimate is needed.
  """
  #search space (parameter names and ranges define the study's results)
  num_neuron_layer = trial.suggest_int(name="num_neuron_layer",low=8,high=256,step=8)

  num_hidden_layers = trial.suggest_int("num_hidden_layers",1,5)
  learning_rate = trial.suggest_float("learning_rate",1e-5,1e-1,log=True)
  dropout_rate = trial.suggest_float("dropout_rate",0.1,0.5)
  batch_size = trial.suggest_categorical("batch_size",[32,64,128])
  epochs = trial.suggest_int(name="epochs",low=10,high=70,step=5)

  optimizer_type = trial.suggest_categorical("optimizer_type",["Adam","SGD","RMSprop"])
  weight_decay = trial.suggest_float("weight_decay",1e-5,1e-1, log =True)

  #creating dataloader
  train_loader = DataLoader(train_dataset,batch_size,shuffle=True,pin_memory = True)
  test_loader = DataLoader(test_dataset,batch_size,shuffle=False,pin_memory = True)

  #model
  model = NeuralNetwork(x_train.shape[1],10,num_hidden_layers,num_neuron_layer,dropout_rate)
  model.to(device)

  #loss function
  loss = nn.CrossEntropyLoss()

  #optimizer: every name offered to the sampler maps to the torch.optim class of the same name
  optimizer_classes = {"Adam": optim.Adam, "SGD": optim.SGD, "RMSprop": optim.RMSprop}
  optimizer = optimizer_classes[optimizer_type](
    model.parameters(),lr=learning_rate,weight_decay = weight_decay
  )

  #training loop
  model.train()
  diverged = False
  for epoch in range(epochs):
    for batch_features, batch_labels in train_loader:

      #moving features and labels to the device before the forward pass
      batch_features = batch_features.to(device)
      batch_labels = batch_labels.to(device)

      #forward pass
      y_pred = model(batch_features)

      #a non-finite output cannot recover, so stop the whole training run
      if not torch.isfinite(y_pred).all():
        print("NaN in model output!")
        diverged = True
        break

      #loss calculation
      loss_value = loss(y_pred,batch_labels)

      #clear stale gradients, backpropagate, then update weights and biases
      optimizer.zero_grad()
      loss_value.backward()
      optimizer.step()

    if diverged:
      break

  #setting model into evaluation mode (disables dropout, uses running batch-norm statistics)
  model.eval()

  #evaluating performance
  total = 0
  correct = 0
  with torch.no_grad(): # no gradients are needed for evaluation
    for batch_features, batch_labels in test_loader:

      #moving features and labels to the device before the forward pass
      batch_features = batch_features.to(device)
      batch_labels = batch_labels.to(device)

      #predicted class = index of the largest output score
      predicted = model(batch_features).argmax(dim=1)
      total += batch_labels.size(0)
      correct += (predicted == batch_labels).sum().item()

  accuracy = correct/total
  return accuracy



In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/413.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [None]:
# Imported here rather than in the first cell because the package is installed by the preceding cell.
import optuna

# Maximize the accuracy returned by `objective`. The sampler is seeded with the same
# value used for torch so its random draws are repeatable across runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2026-01-22 10:47:20,436] A new study created in memory with name: no-name-f92f1fb6-ebe4-49df-be51-f4bfcda7615d


In [None]:
# Run 10 trials; each call to `objective` trains a fresh model and returns its accuracy.
study.optimize(objective,n_trials=10)

[I 2026-01-22 10:55:45,298] Trial 2 finished with value: 0.8830833333333333 and parameters: {'num_neuron_layer': 224, 'num_hidden_layers': 3, 'learning_rate': 0.0008129891939378528, 'dropout_rate': 0.2062541094461969, 'batch_size': 64, 'epochs': 50, 'optimizer_type': 'RMSprop', 'weight_decay': 0.014201575092847692}. Best is trial 2 with value: 0.8830833333333333.
[I 2026-01-22 10:59:34,856] Trial 3 finished with value: 0.7350833333333333 and parameters: {'num_neuron_layer': 248, 'num_hidden_layers': 4, 'learning_rate': 1.3483817813978128e-05, 'dropout_rate': 0.17054426347670965, 'batch_size': 32, 'epochs': 40, 'optimizer_type': 'SGD', 'weight_decay': 0.001526555915035431}. Best is trial 2 with value: 0.8830833333333333.
[I 2026-01-22 11:02:38,327] Trial 4 finished with value: 0.8888333333333334 and parameters: {'num_neuron_layer': 184, 'num_hidden_layers': 4, 'learning_rate': 1.9568708166708735e-05, 'dropout_rate': 0.2776953524336488, 'batch_size': 64, 'epochs': 50, 'optimizer_type': '

In [None]:
# Highest accuracy returned by any trial in the study.
study.best_value

0.892

In [None]:
# Hyperparameters of the trial that achieved `study.best_value`.
study.best_params

{'num_neuron_layer': 168,
 'num_hidden_layers': 5,
 'learning_rate': 8.562301026234446e-05,
 'dropout_rate': 0.11003627658368131,
 'batch_size': 64,
 'epochs': 15,
 'optimizer_type': 'Adam',
 'weight_decay': 0.00020336051240314303}

# Task
Check `x_train` and `x_test` for NaN values to identify and handle data corruption that might be causing the model's loss to become NaN.

## Check for NaNs in data

### Subtask:
Inspect `x_train` and `x_test` for any NaN values that might be causing the loss to become NaN.


**Reasoning**:
To identify whether NaN values are present in `x_train` and `x_test`, I will call `.isna().sum().sum()` on each DataFrame. This gives the total number of NaN values in each frame, which helps diagnose what is causing the loss to become NaN.



In [None]:
# Total number of NaN cells in each split (first sum is per column, second sum adds the columns up).
print("NaN values in x_train:\n", x_train.isna().sum().sum())
print("NaN values in x_test:\n", x_test.isna().sum().sum())

NaN values in x_train:
 198
NaN values in x_test:
 0


**Reasoning**:
Since `x_train` contains NaN values, I will fill them with 0 to ensure the data is clean and suitable for model training. This will help prevent issues like the NaN loss observed during training.



In [None]:
# Replace every NaN feature with 0 and confirm that none remain.
# NOTE(review): NaNs in pixel columns may indicate a truncated or partially uploaded CSV —
# confirm the source file is complete rather than relying on this fill.
x_train = x_train.fillna(0)
print("NaN values in x_train after filling:", x_train.isna().sum().sum())

NaN values in x_train after filling: 0


## Re-evaluate model training

### Subtask:
Retrain the model after handling NaN values and observe if the loss stabilizes.


**Reasoning**:
Since `x_train` has been cleaned, I need to re-instantiate the `Custumdataset` for both training and testing datasets using the updated `x_train` and `x_test` to ensure the datasets reflect the changes.



In [None]:
# Rebuild both datasets: Custumdataset copies the frames into tensors at construction time,
# so datasets created before the NaN fill still hold the old values.
train_dataset = Custumdataset(x_train,y_train)
test_dataset = Custumdataset(x_test,y_test)
print("Custumdataset re-instantiated for training and testing.")

Custumdataset re-instantiated for training and testing.


**Reasoning**:
After re-instantiating the datasets, I will re-create the DataLoaders, re-initialize the model, loss function, and optimizer, and then rerun the training and evaluation loops to observe the impact of NaN handling on the model's performance and loss stability.



In [None]:
# Retrain on the cleaned data with the best hyperparameters found by the study, then evaluate.
# Every hyperparameter is read from `study.best_params`, so this cell does not depend on
# variables that exist only inside `objective` or on leftover kernel state.
best_params = study.best_params

train_loader = DataLoader(train_dataset,batch_size=best_params["batch_size"],shuffle=True,pin_memory = True)
test_loader = DataLoader(test_dataset,batch_size=best_params["batch_size"],shuffle=False,pin_memory = True)
print("DataLoaders re-created.")

#model initialization (10 output classes, matching the value used in `objective`)
model = NeuralNetwork(
  x_train.shape[1],
  10,
  best_params["num_hidden_layers"],
  best_params["num_neuron_layer"],
  best_params["dropout_rate"],
)
model = model.to(device)
print("Model re-initialized and moved to device.")

#loss function
loss = nn.CrossEntropyLoss()
print("Loss function re-initialized.")

#optimizer: the stored name ("Adam", "SGD" or "RMSprop") is also the torch.optim class name
optimizer_class = getattr(optim, best_params["optimizer_type"])
optimizer = optimizer_class(
  model.parameters(),
  lr=best_params["learning_rate"],
  weight_decay=best_params["weight_decay"],
)
print("Optimizer re-initialized.")

#training loop
retrain_epochs = best_params["epochs"]
print("Starting retraining...")
model.train()
for epoch in range(retrain_epochs):
  batch_loss = 0.0
  for batch_features, batch_labels in train_loader:

    #moving features and labels to the device before the forward pass
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)

    #forward pass
    y_pred = model(batch_features)

    #loss calculation
    loss_value = loss(y_pred,batch_labels)

    #clear stale gradients, backpropagate, then update weights and biases
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()
    batch_loss += loss_value.item()

  #mean training loss over the batches of this epoch
  avg_loss = batch_loss/len(train_loader)
  if (epoch + 1) % 10 == 0 or epoch == 0:
    print(f"epoch = {epoch+1} and loss = {avg_loss:.4f}")
print("Retraining complete.")

#setting model into evaluation mode
model.eval()
print("Model set to evaluation mode.")

#evaluating performance
total = 0
correct = 0
with torch.no_grad(): # no gradients are needed for evaluation
  for batch_features, batch_labels in test_loader:

    #moving features and labels to the device before the forward pass
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)

    output = model(batch_features)

    #predicted class = index of the largest output score
    _,predicted = torch.max(output,1)
    total += batch_labels.size(0)
    correct += (predicted == batch_labels).sum().item()

accuracy = correct/total
print(f"accuracy = {accuracy:.4f}")


DataLoaders re-created.
Model re-initialized and moved to device.
Loss function re-initialized.
Optimizer re-initialized.
Starting retraining...
epoch = 1 and loss = 1.2565
epoch = 10 and loss = 0.3659
epoch = 20 and loss = 0.2303
epoch = 30 and loss = 0.1544
epoch = 40 and loss = 0.0851
epoch = 50 and loss = 0.0441
epoch = 60 and loss = 0.0513
epoch = 70 and loss = 0.0102
epoch = 80 and loss = 0.0724
epoch = 90 and loss = 0.0050
epoch = 100 and loss = 0.0010
Retraining complete.
Model set to evaluation mode.
accuracy = 0.8205


In [None]:
# Display the (rows, columns) shape of the training features.
x_train.shape

(2648, 784)