In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/graduate-admissions/Admission_Predict.csv
/kaggle/input/graduate-admissions/Admission_Predict_Ver1.1.csv


In [2]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import torch         # PyTorch main package
import torch.nn as nn  # Neural network modules
import torch.optim as optim  # Optimization algorithms
from torch.utils.data import DataLoader, TensorDataset  # For data handling
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.metrics import mean_absolute_error, mean_squared_error  # Evaluation metrics

In [3]:
'''
DATA PREPROCESSING SECTION

Why we do this:
- Real-world data is often messy and needs cleaning
- Neural networks work better with normalized/standardized data
- Proper data splitting helps evaluate model performance accurately
'''

# Load dataset from CSV file
df = pd.read_csv('/kaggle/input/graduate-admissions/Admission_Predict_Ver1.1.csv')

In [4]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [5]:
df.shape

(500, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB


In [7]:
df.duplicated().sum()

0

In [8]:
# Clean column names (remove spaces and trailing characters)
# This helps prevent errors when accessing columns
df.columns = df.columns.str.strip().str.replace(' ', '_')

In [9]:
# Drop irrelevant column (Serial Number doesn't affect admission chances)
# Always remove non-predictive features
df = df.drop('Serial_No.', axis=1)

In [10]:
# Separate features (input variables) and target (output variable)
# X contains all factors affecting admission chance
# y contains the actual admission probabilities we want to predict
X = df.drop('Chance_of_Admit', axis=1).values  # Features matrix
y = df['Chance_of_Admit'].values.reshape(-1, 1)  # Target variable

In [11]:
# Split data into training (80%) and test (20%) sets
# We keep test set separate to evaluate model performance on unseen data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # random_state for reproducibility
)

In [12]:
# Standardize features (mean=0, std=1)
# Neural networks require scaled data for optimal performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit and transform training data
X_test = scaler.transform(X_test)        # Transform test data using training fit

In [21]:
X_train

array([[ 0.38998634,  0.6024183 , -0.09829757, ...,  0.56498381,
         0.4150183 ,  0.89543386],
       [-0.06640493,  0.6024183 ,  0.7754586 , ...,  1.65149114,
        -0.06785154, -1.11677706],
       [-1.25302222, -0.87691722, -0.09829757, ..., -0.52152352,
        -0.13445427, -1.11677706],
       ...,
       [-1.34430047, -1.37002906, -1.8458099 , ..., -1.60803084,
        -2.2157898 , -1.11677706],
       [-0.7053527 , -0.38380538, -0.97205374, ...,  0.56498381,
        -1.49981038, -1.11677706],
       [-0.24896144, -0.21943477, -0.97205374, ...,  0.02173015,
        -0.55072138, -1.11677706]])

In [13]:
# Convert numpy arrays to PyTorch tensors
# PyTorch works with tensors instead of regular arrays
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [14]:
# Create DataLoader for batch training
# Batches help with:
# 1. Memory efficiency (don't load all data at once)
# 2. Better gradient updates
# 3. Random shuffling reduces overfitting
batch_size = 32  # Number of samples per batch (common to use powers of 2)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

'''
NEURAL NETWORK ARCHITECTURE

Why this structure:
- Fully connected layers (Linear layers) are good for tabular data
- ReLU activation adds non-linearity (neural networks need this to learn complex patterns)
- Dropout prevents overfitting (randomly turns off neurons during training)
- Three layers: Input -> Hidden -> Hidden -> Output (common starting point)
'''

In [16]:
class AdmissionPredictor(nn.Module):
    def __init__(self, input_dim):
        super(AdmissionPredictor, self).__init__()
        # Layer 1: Input layer to first hidden layer
        self.fc1 = nn.Linear(input_dim, 64)  # 64 neurons in first hidden layer
        # Layer 2: Hidden layer to another hidden layer
        self.fc2 = nn.Linear(64, 32)         # 32 neurons in second hidden layer
        # Layer 3: Final hidden layer to output
        self.fc3 = nn.Linear(32, 1)          # 1 output neuron (regression task)
        
        # Activation function (introduces non-linearity)
        self.relu = nn.ReLU()  # Simple and effective activation function
        
        # Regularization (prevents model from memorizing training data)
        self.dropout = nn.Dropout(0.2)  # Randomly disable 20% of neurons during training
        
    def forward(self, x):
        # Forward pass through network
        x = self.relu(self.fc1(x))  # Activation after first layer
        x = self.dropout(x)         # Apply dropout
        x = self.relu(self.fc2(x))  # Activation after second layer
        x = self.dropout(x)         # Apply dropout
        x = self.fc3(x)             # Final output (no activation for regression)
        return x

# Initialize model with proper input dimension
input_dim = X_train.shape[1]  # Number of features (7 in this dataset)
model = AdmissionPredictor(input_dim)


'''
LOSS FUNCTION AND OPTIMIZER

Why MSE and Adam:
- MSE (Mean Squared Error) is common for regression problems
- Adam optimizer combines benefits of two other optimizers (AdaGrad and RMSProp)
- Learning rate (0.001) is a good starting point for many problems
'''

In [17]:
criterion = nn.MSELoss()  # Measures average squared difference
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adaptive learning rate



TRAINING LOOP

Key concepts:
- Epochs: Complete passes through the training data
- Batch processing: Updates weights after each batch (not whole dataset)
- Backpropagation: Calculates gradients of loss w.r.t. parameters
- Gradient descent: Updates weights to minimize loss


In [18]:
num_epochs = 200  # Number of complete passes through dataset
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        # Forward pass: Compute predictions
        outputs = model(inputs)
        
        # Calculate loss (difference between predictions and true values)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()        # Compute gradients through backpropagation
        optimizer.step()       # Update weights using gradients
        
    # Print progress every 20 epochs
    if (epoch+1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [20/200], Loss: 0.0289
Epoch [40/200], Loss: 0.0142
Epoch [60/200], Loss: 0.0081
Epoch [80/200], Loss: 0.0072
Epoch [100/200], Loss: 0.0059
Epoch [120/200], Loss: 0.0058
Epoch [140/200], Loss: 0.0056
Epoch [160/200], Loss: 0.0038
Epoch [180/200], Loss: 0.0054
Epoch [200/200], Loss: 0.0060


'''
EVALUATION

Why we use both MAE and MSE:
- MAE (Mean Absolute Error) is easier to interpret (average error)
- MSE (Mean Squared Error) penalizes larger errors more heavily
- Both help understand model performance in different ways
'''

In [19]:
model.eval()  # Set model to evaluation mode (disables dropout)
with torch.no_grad():  # Disable gradient calculation for efficiency
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor)
    mae = mean_absolute_error(y_test, test_outputs.numpy())
    mse = mean_squared_error(y_test, test_outputs.numpy())
    
print(f'\nTest MSE: {test_loss.item():.4f}')
print(f'Test MAE: {mae:.4f}')
print(f'Test MSE: {mse:.4f}')

# Example prediction to show actual vs predicted
example_data = X_test_tensor[0:1]  # First test sample
prediction = model(example_data)
print(f'\nExample prediction:')
print(f'Actual: {y_test[0][0]:.4f}')
print(f'Predicted: {prediction.item():.4f}')


Test MSE: 0.0040
Test MAE: 0.0453
Test MSE: 0.0040

Example prediction:
Actual: 0.9300
Predicted: 0.9032
