In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from torch import nn
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl
from torchmetrics.classification import F1Score, MulticlassCohenKappa

In [68]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "synthetic_student_performance.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "miadul/student-performance-dataset",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)



In [69]:
pl.seed_everything(42)

Seed set to 42


42

In [70]:
df.set_index('StudentID', inplace=True)
df

Unnamed: 0_level_0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001,17,1,1,3,16,13,0,2,0,0,0,0,3.158425,1
1002,18,1,2,0,17,28,0,2,1,1,1,0,2.794655,2
1003,15,1,0,1,13,18,1,4,0,0,0,1,3.774847,0
1004,17,1,0,3,6,4,0,3,1,0,0,0,3.040574,1
1005,17,0,1,2,6,20,0,4,0,0,0,0,3.382310,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5996,15,0,3,2,12,22,0,2,0,1,1,0,4.000000,0
5997,16,0,1,2,1,2,0,2,0,1,0,0,2.533415,2
5998,17,1,1,3,18,18,1,0,1,0,0,1,4.000000,0
5999,16,0,2,2,15,0,0,2,1,1,1,0,4.000000,0


In [71]:
X = df.drop('GradeClass', axis=1)
y = df['GradeClass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [72]:
num = [col for col in X_train.columns if df[col].unique().size > 5]
cat = [col for col in X_train.columns if col not in num]

In [73]:
num_pipe = Pipeline([
  ('imputer', SimpleImputer(strategy='median')),
  ('transform', PowerTransformer(method='yeo-johnson')),
  ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
  ('imputer', SimpleImputer(strategy='most_frequent')),
  ('encoder', OneHotEncoder())
])
preprocessor = ColumnTransformer([
  ('num', num_pipe, num),
  ('cat', cat_pipe, cat)
])

In [74]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [75]:
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

In [76]:
train_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

In [77]:
feature, target = next(iter(train_loader))
feature.shape, target.shape

(torch.Size([64, 33]), torch.Size([64]))

In [78]:
class NN(nn.Module):
    def __init__(self, input_size) -> None:
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(16, 8),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(8, 5)
        )

    def forward(self, X):
        return self.fc(X)

In [79]:
class PLNN(LightningModule):
    def __init__(self, input_size, num_classes=5, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters() 
        self.model = NN(input_size=input_size)
        self.criterion = nn.CrossEntropyLoss()
        self.kappa = MulticlassCohenKappa(num_classes=num_classes, weights='linear')
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs) 
        loss = self.criterion(outputs, labels)
        acc = self.kappa(outputs, labels)
        
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        self.log('train_kappa', acc, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        acc = self.kappa(outputs, labels)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_kappa', acc, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.learning_rate)
        return optimizer

In [80]:
model = NN(X_train.shape[1])

In [81]:
if torch.cuda.is_available():
    accelerator_type = 'gpu'
    devices_to_use = 1
else:
    accelerator_type = 'cpu'
    devices_to_use = 'auto'

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints/',
    filename='stp-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min',
)
lr_monitor_callback = LearningRateMonitor(logging_interval='epoch')

trainer1 = pl.Trainer(
    max_epochs=300,
    callbacks=[checkpoint_callback, early_stopping, lr_monitor_callback],
    logger=TensorBoardLogger("tb_logs", name="simple_model_experiment"),
    accelerator=accelerator_type,
    devices=devices_to_use,
    log_every_n_steps=10,
    deterministic=True
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [82]:
model = PLNN(input_size=X_train.shape[1], num_classes=5, learning_rate=1e-3)
trainer1.fit(model, train_loader, test_loader)


  | Name      | Type                 | Params | Mode 
-----------------------------------------------------------
0 | model     | NN                   | 15.9 K | train
1 | criterion | CrossEntropyLoss     | 0      | train
2 | kappa     | MulticlassCohenKappa | 0      | train
-----------------------------------------------------------
15.9 K    Trainable params
0         Non-trainable params
15.9 K    Total params
0.064     Total estimated model params size (MB)
25        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [83]:
trainer1.validate(model, test_loader, ckpt_path='best')

Restoring states from the checkpoint path at C:\Users\dsapu\Project\Python\repo\dlp\StudentP\checkpoints\stp-epoch=33-val_loss=0.13.ckpt
Loaded model weights from the checkpoint at C:\Users\dsapu\Project\Python\repo\dlp\StudentP\checkpoints\stp-epoch=33-val_loss=0.13.ckpt


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.13271014392375946, 'val_kappa': 0.9571194648742676}]