<a href="https://colab.research.google.com/github/Dansah2/Sloan-Digital-Sky-Survey---DR18/blob/main/Train_Model_Sloan_Digital_Sky_Survey_DR18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sloan Digital Sky Survey - DR18

This dataset consists of 100,000 observations from Data Release (DR) 18 of the Sloan Digital Sky Survey (SDSS). Each observation is described by 42 features and 1 class column that classifies the observation as one of:

- a STAR
- a GALAXY
- a QSO (Quasi-Stellar Object), also known as a quasar

Kaggle Dataset Download API Command:

kaggle datasets download -d diraf0/sloan-digital-sky-survey-dr18

# Project Outline:
1) Download the dataset

2) Explore/Analyze the data

3) Preprocess and organize the data for ML training

4) Set appropriate weights

5) Create and Train model

## Download / Read the Dataset
1) Install required libraries

2) Import required libraries

3) Upload the data from Google Drive

### Install required libraries

In [None]:
!pip install -q -U kaggle
!pip install -q -U scikit-learn
!pip install -q -U numpy
!pip install -q -U torchmetrics

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.26.0 which is incompatible.
tensorflow 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 1.26.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### Import required libraries

In [None]:
# handling data
import numpy as np
import pandas as pd

# graphing data
pd.options.plotting.backend = "plotly"
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# downloading data
from google.colab import drive

# splitting data
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

# training the data
import torch
import torch.nn.functional as F
from torch import nn
import torchmetrics
from torchmetrics import Accuracy
from torch.utils.data import DataLoader, WeightedRandomSampler

# evaluation metrics
from sklearn.metrics import precision_score, recall_score

### Upload the data from Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read from it
# and the trained model can be saved back to it later in the notebook
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_function(csv_file):
    """Load a CSV file into a pandas DataFrame.

    Args:
        csv_file: location of the CSV; forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    loaded_frame = pd.read_csv(csv_file)
    return loaded_frame

# Read the training CSV from the mounted Google Drive folder into a DataFrame
raw_data = read_function('/content/drive/My Drive/Sloan_Sky_Survey/train_df.csv')


## Create and Train model
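1) Set Device Agnostic Code

2) Split the Data into Training and Testing sets

3) Create the Model

4) Obtain the Class Weights

5) Define the Focal Loss Function

6) Define the Accuracy Function / Set Hyperparameters

7) Train the Model

8) Save the model to Google Drive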

### Set Device Agnostic Code

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Echo the selected device as the cell output
device

'cpu'

### Split the Data into Training and Testing sets

In [None]:
def split_return_tensor(data_frame, target, seed=None):
  """Turn a DataFrame into train / validation / test tensor datasets.

  The label column is converted to an int64 tensor and every other column to a
  float32 tensor. The resulting ``TensorDataset`` is randomly split 80% / 20%,
  and the 20% hold-out is split again in half into validation and test subsets.

  Args:
    data_frame: pandas DataFrame containing the feature columns and the label column.
    target: name of the label column in ``data_frame``.
    seed: optional integer. When given, both random splits are driven by a
      dedicated ``torch.Generator`` seeded with this value, which makes the
      split reproducible. When ``None`` (the default) the global torch RNG is
      used, exactly as before.

  Returns:
    A tuple ``(train_dataset, valid_dataset, test_dataset)`` of dataset subsets
    yielding ``(features, label)`` pairs.
  """
  # separate the label column from the feature columns
  y = data_frame[target]
  X = data_frame.drop(columns=target)

  # labels as int64 (what cross-entropy style losses index with)
  y = torch.tensor(y.values)
  y = y.type(torch.LongTensor)

  # features as float32
  X = torch.tensor(X.values)
  X = X.type(torch.FloatTensor)

  # pair every feature row with its label
  tensor_data = torch.utils.data.TensorDataset(X, y)

  # only pass a generator when a seed was requested, so the default
  # (unseeded) behaviour stays untouched
  split_kwargs = {}
  if seed is not None:
    split_kwargs['generator'] = torch.Generator().manual_seed(seed)

  # 80% training, remaining 20% held out
  train_size = int(0.8 * len(tensor_data))
  hold_size = len(tensor_data) - train_size
  train_dataset, hold_dataset = random_split(tensor_data, [train_size, hold_size], **split_kwargs)

  # split the hold-out in half: validation and test
  valid_size = int(0.5 * len(hold_dataset))
  test_size = len(hold_dataset) - valid_size
  valid_dataset, test_dataset = random_split(hold_dataset, [valid_size, test_size], **split_kwargs)

  return train_dataset, valid_dataset, test_dataset

# Split the raw frame into 80% train / 10% validation / 10% test datasets, using 'e_class' as the label column
train_tensor, valid_tensor, test_tensor = split_return_tensor(raw_data, 'e_class')

### Create the Model

In [None]:
class Model(nn.Module):
  """Small fully connected classifier.

  Two hidden blocks of Linear -> BatchNorm1d -> ReLU -> Dropout, followed by a
  final Linear layer that emits one raw score (logit) per output class.

  Args:
    inputs: number of input features.
    outputs: number of output classes.
    dropout_prob: dropout probability used after each hidden block.
    hidden: width of both hidden layers (default 8).
  """

  def __init__(self, inputs, outputs, dropout_prob, hidden=8):
    super().__init__()

    # layers are created in the order they are applied
    layers = [
        # first hidden block
        nn.Linear(in_features=inputs, out_features=hidden),
        nn.BatchNorm1d(hidden),
        nn.ReLU(),
        nn.Dropout(p=dropout_prob),
        # second hidden block
        nn.Linear(in_features=hidden, out_features=hidden),
        nn.BatchNorm1d(hidden),
        nn.ReLU(),
        nn.Dropout(p=dropout_prob),
        # output layer: one logit per class
        nn.Linear(in_features=hidden, out_features=outputs),
    ]
    self.linear_layer_stack = nn.Sequential(*layers)

  def forward(self, x):
    """Run a batch of feature rows through the stack and return the logits."""
    logits = self.linear_layer_stack(x)
    return logits

# one input per feature column: every column of the frame except the label
input_num = len(raw_data.columns) - 1

# one output logit per class
output_num = 3

# build the network with 20% dropout
model = Model(input_num, output_num, dropout_prob=0.2)

# display the layer layout as the cell output
model

Model(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=36, out_features=8, bias=True)
    (1): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=8, out_features=8, bias=True)
    (5): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=8, out_features=3, bias=True)
  )
)

### Obtain the Class Weights


In [None]:
def get_class_weights(data_frame, target, random_state=None):
  """Compute inverse-frequency class weights from a random 80% sample of the labels.

  The label column is shuffled and 20% of it is held out; the weight of class
  ``i`` is ``1 / (count_i / total)`` where the counts come from the remaining
  80%. Note that this split is drawn independently of the one made by
  ``split_return_tensor``, so the weights describe a sample of the data rather
  than the exact training subset.

  Args:
    data_frame: pandas DataFrame containing the label column.
    target: name of the label column; labels must be non-negative integers
      (they are counted with ``np.bincount``).
    random_state: optional seed forwarded to ``train_test_split`` so the sample
      (and therefore the weights) can be reproduced. Defaults to ``None``
      (unseeded, as before).

  Returns:
    A list of floats, one weight per class id from 0 up to the largest label seen.

  Raises:
    ValueError: if some class id has no samples in the 80% sample, which would
      otherwise yield an infinite weight.
  """
  # only the labels are needed to count class frequencies
  y = data_frame[target]

  # keep 80% of the labels, mirroring the 80/20 proportion used for training
  y_train, _ = train_test_split(y,
                                test_size=0.2,
                                shuffle=True,
                                random_state=random_state)

  class_counts = np.bincount(y_train)
  total_samples = len(y_train)

  # a zero count would divide by zero below and produce an infinite weight
  if (class_counts == 0).any():
    raise ValueError("every class must have at least one sample to compute its weight.")

  # inverse of each class's relative frequency
  class_weights = 1 / (class_counts / total_samples)

  return class_weights.tolist()


# Inverse-frequency weight per class, computed from a random 80% sample of the 'e_class' labels
class_weights = get_class_weights(raw_data, 'e_class')

In [None]:
# Convert the class weights to a float32 tensor on the selected device.
# Tensor.to() returns a new tensor instead of moving the existing one, so its
# result has to be assigned back; as_tensor also keeps this cell safe to re-run
# once class_weights is already a tensor.
class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(device)
class_weights

tensor([1.9115, 9.5900, 2.6840])

### Define the Focal Loss Function

In [None]:
class FocalLoss(nn.Module):
  """Multi-class focal loss computed from raw logits.

  For every sample the cross-entropy ``ce`` is turned into the probability of
  the true class ``pt = exp(-ce)``, and the loss is
  ``alpha[target] * (1 - pt) ** gamma * ce``, averaged over the batch.

  Args:
    alpha: optional 1-D tensor of per-class weights, indexed by class id. Must
      not contain zeros. When ``None`` (the default) no class weighting is applied.
    gamma: focusing exponent applied to ``(1 - pt)`` (default 2).

  Raises:
    ValueError: if ``alpha`` is given but is not a tensor or contains a zero.
  """

  def __init__(self, alpha=None, gamma=2):
      super(FocalLoss, self).__init__()

      # Check if alpha is provided and if it's a tensor with non-zero values
      if alpha is not None:
          if not isinstance(alpha, torch.Tensor) or (alpha == 0).any():
              raise ValueError("alpha should be a tensor with non-zero values for all classes.")

      # Registered as a buffer so that loss_fn.to(device) moves the weights along
      # with the module; non-persistent so the module's state_dict stays empty.
      self.register_buffer('alpha', alpha, persistent=False)
      self.gamma = gamma

  def forward(self, inputs, targets):
      """Return the mean focal loss for a batch.

      Args:
        inputs: logits of shape (batch, num_classes).
        targets: int64 class ids of shape (batch,).
      """
      # per-sample cross-entropy, clamped to keep exp() and the product finite
      ce_loss = F.cross_entropy(inputs, targets, reduction='none')
      ce_loss = torch.clamp(ce_loss, min=1e-10, max=1e10)

      # probability assigned to the true class
      pt = torch.exp(-ce_loss) + 1e-10

      # down-weight well-classified samples
      modulator = (1 - pt) ** self.gamma

      # apply the per-class weights only when they were provided
      if self.alpha is not None:
          modulator = self.alpha[targets] * modulator

      loss = (modulator * ce_loss).mean()
      return loss

# Focal loss using the per-class weights as alpha and a focusing exponent gamma of 2
loss_fn = FocalLoss(alpha=class_weights, gamma=2)

### Define the Accuracy Function / Set Hyperparameters

In [None]:
# learning rate used by the optimizer
LR = 0.0011

# Adam optimizer over every parameter of the model
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# multiclass accuracy metric for the three classes
accuracy = Accuracy(num_classes=3, task='multiclass')

### Train the Model

In [None]:
# create the training method
def train_model(model, tensor_train_data, tensor_valid_data, loss_fn, accuracy_fn, optimizer,
                device: torch.device = device, epochs: int = 101, batch_size: int = 250):
  """Train ``model``, evaluate it after every epoch, and plot the learning curves.

  Each epoch runs one pass over the training data (forward, loss, backward,
  optimizer step) followed by one pass over the validation data in inference
  mode. Per-epoch loss and accuracy are recorded for both splits; macro
  precision and recall of the final epoch's validation predictions are printed
  after training, and the recorded curves are shown in a two-row plotly figure.

  Args:
    model: the ``nn.Module`` to train; it is moved to ``device``.
    tensor_train_data: dataset yielding ``(features, label)`` pairs for training.
    tensor_valid_data: dataset yielding ``(features, label)`` pairs for validation.
    loss_fn: loss module called as ``loss_fn(logits, labels)``; moved to ``device``.
    accuracy_fn: metric module called as ``accuracy_fn(pred_labels, labels)`` and
      returning a scalar tensor; moved to ``device``.
    optimizer: optimizer already bound to ``model``'s parameters.
    device: device on which the model, loss, metric and every batch are placed.
    epochs: number of training epochs (default 101).
    batch_size: batch size of both data loaders (default 250).

  Returns:
    The trained model.

  Raises:
    ValueError: if ``epochs`` is smaller than 1.
  """
  if epochs < 1:
    raise ValueError("epochs must be at least 1.")

  # put the model, accuracy function and loss function on the desired device
  model = model.to(device)
  accuracy = accuracy_fn.to(device)
  loss_fn = loss_fn.to(device)

  # create the training and validation loaders
  train_loader = torch.utils.data.DataLoader(tensor_train_data, batch_size=batch_size, shuffle=True)
  valid_loader = torch.utils.data.DataLoader(tensor_valid_data, batch_size=batch_size)

  train_losses = []
  valid_losses = []
  train_accuracies = []
  valid_accuracies = []

  # loop through the data
  for epoch in range(epochs):

    ### Training

    # set model to train
    model.train()

    # running sums, weighted by batch size so a short last batch does not skew the means
    train_loss_sum = 0.0
    train_correct = 0
    train_seen = 0

    # Iterate over the DataLoader for training data
    for X_train, y_train in train_loader:
      # put the data on the correct device
      X_train = X_train.to(device)
      y_train = y_train.to(device)

      # forward pass
      y_logits = model(X_train)

      # calculate the loss
      loss = loss_fn(y_logits, y_train)

      # optimizer zero_grad
      optimizer.zero_grad()

      # back propagation
      loss.backward()

      # optimizer step
      optimizer.step()

      # the arg-max of the logits is the predicted label (softmax would not change it);
      # these predictions come from the model in train mode, i.e. with dropout active
      y_pred = y_logits.argmax(dim=1)

      batch_n = y_train.size(0)
      train_loss_sum += loss.item() * batch_n
      train_correct += (y_pred == y_train).sum().item()
      train_seen += batch_n

    epoch_train_loss = train_loss_sum / train_seen
    epoch_train_accuracy = train_correct / train_seen

    ### Validation

    # put model into eval and inference mode
    model.eval()
    with torch.inference_mode():
      # validation metrics
      total_accuracy = 0.0
      total_loss = 0.0
      valid_seen = 0
      all_valid_preds = []
      all_valid_labels = []

      for X_valid, y_valid in valid_loader:
        # validation batches must live on the same device as the model
        X_valid = X_valid.to(device)
        y_valid = y_valid.to(device)

        # create the valid logits
        valid_logits = model(X_valid)

        # convert logits into prediction labels
        valid_preds = valid_logits.argmax(dim=1)

        # calculate the loss
        valid_loss = loss_fn(valid_logits, y_valid)

        # calculate the accuracy
        valid_acc = accuracy(valid_preds, y_valid)

        # total accuracy and loss, weighted by batch size
        batch_n = y_valid.size(0)
        total_accuracy += valid_acc.item() * batch_n
        total_loss += valid_loss.item() * batch_n
        valid_seen += batch_n

        # keep predictions and labels for precision / recall
        all_valid_preds.extend(valid_preds.cpu().numpy())
        all_valid_labels.extend(y_valid.cpu().numpy())

      # epoch accuracy and loss
      epoch_accuracy = total_accuracy / valid_seen
      epoch_loss = total_loss / valid_seen

    train_losses.append(epoch_train_loss)
    valid_losses.append(epoch_loss)
    train_accuracies.append(epoch_train_accuracy)
    valid_accuracies.append(epoch_accuracy)

    if epoch % 10 == 0:
      # epoch_accuracy is a fraction in [0, 1]; scale it to match the % sign
      print(f'Epoch {epoch} accuracy: {epoch_accuracy * 100:.2f}%')

  # Calculate precision and recall from the last epoch's validation predictions
  precision = precision_score(all_valid_labels, all_valid_preds, average='macro', zero_division=1.0)
  recall = recall_score(all_valid_labels, all_valid_preds, average='macro', zero_division=1.0)

  print(f'\nValidation Precision: {precision:.2f}')
  print(f'Validation Recall: {recall:.2f}')

  # Create subplots for training and validation metrics
  fig = make_subplots(rows=2, cols=1, subplot_titles=("Training Metics", "Validation Metrics"))

  epoch_axis = list(range(epochs))

  # Add training loss and accuracy traces to the first figure
  fig.add_trace(go.Scatter(x=epoch_axis, y=train_losses, mode='lines', name='Training Loss'), row=1, col=1)
  fig.add_trace(go.Scatter(x=epoch_axis, y=train_accuracies, mode='lines', name='Training Accuracy'), row=1, col=1)

  # Add validation loss and accuracy traces to the second figure
  fig.add_trace(go.Scatter(x=epoch_axis, y=valid_losses, mode='lines', name='Validation Loss'), row=2, col=1)
  fig.add_trace(go.Scatter(x=epoch_axis, y=valid_accuracies, mode='lines', name='Validation Accuracy'), row=2, col=1)

  # Update layout for both figures
  fig.update_xaxes(title_text="Epochs", row=1, col=1)
  fig.update_xaxes(title_text="Epochs", row=2, col=1)
  fig.update_yaxes(title_text="Training Loss", row=1, col=1)
  fig.update_yaxes(title_text="Validation Loss", row=2, col=1)

  # Show the two plots
  fig.show()

  return model

# Train on the training split and evaluate on the validation split after every epoch;
# the returned (trained) model is bound back to `model`
model = train_model(model=model,
                    tensor_train_data=train_tensor,
                    tensor_valid_data=valid_tensor,
                    loss_fn=loss_fn,
                    accuracy_fn=accuracy,
                    optimizer=optimizer)

Epoch 0 accuracy: 0.79%
Epoch 10 accuracy: 0.79%
Epoch 20 accuracy: 0.79%
Epoch 30 accuracy: 0.79%
Epoch 40 accuracy: 0.79%
Epoch 50 accuracy: 0.78%
Epoch 60 accuracy: 0.79%
Epoch 70 accuracy: 0.75%
Epoch 80 accuracy: 0.79%
Epoch 90 accuracy: 0.78%
Epoch 100 accuracy: 0.79%

Validation Precision: 0.53
Validation Recall: 0.57


### Save the model to Google Drive

In [None]:
# Save only the model's state_dict (its parameters and buffers) to the mounted Google Drive folder
torch.save(model.state_dict(), '/content/drive/My Drive/Sloan_Sky_Survey/Sloan_Model.pth')