In [5]:
import pandas as pd

file_path = '/mnt/c/Users/atrivedi/Downloads/archive/MBA.csv'
df = pd.read_csv(file_path) # Read csv into pandas dataframe
df['admission'] = df['admission'].fillna(0)  # Assume NaN means 'Rejected' and fill with 0
df['race'] = df['race'].fillna('Unknown')  # Fill missing race with 'Unknown'

print(df.head(50))

    application_id  gender  international   gpa       major      race   gmat  \
0                1  Female          False  3.30    Business     Asian  620.0   
1                2    Male          False  3.28  Humanities     Black  680.0   
2                3  Female           True  3.30    Business   Unknown  710.0   
3                4    Male          False  3.47        STEM     Black  690.0   
4                5    Male          False  3.35        STEM  Hispanic  590.0   
5                6    Male          False  3.18    Business     White  610.0   
6                7  Female          False  2.93        STEM     Other  590.0   
7                8    Male           True  3.02    Business   Unknown  630.0   
8                9    Male          False  3.24    Business     White  590.0   
9               10    Male          False  3.27  Humanities     Asian  690.0   
10              11    Male          False  3.05  Humanities     White  580.0   
11              12    Male           Tru

In [30]:
# Import necessary libraries and modules

import pandas as pd # Import pandas library to load kaggle data into a pandas dataframe
import torch # Import pytorch for building and training the model
import torch.nn as nn # Import neural network module from PyTorch
import torch.optim as optimizer # Import optimization algorithms from Pytorch for backward pass
from sklearn.model_selection import train_test_split # Import test-train split feature
from sklearn.preprocessing import StandardScaler # Import feature scaling feature
from sklearn.metrics import accuracy_score, precision_score, recall_score # Import metrics
import matplotlib.pyplot as plt # Import matplotlib for graphing

# Step 1: Load and Preprocess the data

# Load the dataset
file_path = '/mnt/c/Users/atrivedi/Downloads/archive/MBA.csv' # Local file path to dataset
df = pd.read_csv(file_path) # Read csv into pandas dataframe
df['admission'] = df['admission'].fillna(0)  # Assume NaN means 'Rejected' and fill with 0
df['race'] = df['race'].fillna('Unknown')  # Fill missing race with 'Unknown'


# Preprocess the dataset
df = pd.get_dummies(df, columns=['gender', 'international','race','major', 'work_industry'], drop_first=True) # Convert categorical (strings) variables into dummy (binary) variables so the neural net can work with it, not admission though
df['admission'] = df['admission'].apply(lambda x: 1 if x == 'Admit' else 0) # Convert 'Admit' to 1 and others to 0 for binary classification
X = df.drop('admission', axis=1) # Drop the admission column, so the features and the outputs aren't paired together
y = df['admission'] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Split the data into training and test sets --> 80-20 split
scaler = StandardScaler() # Create an instance of the StandardScaler class to scale our values
X_train = scaler.fit_transform(X_train) # Scale the training data
X_test = scaler.fit_transform(X_test) # Scale the testing data
X_train_tensor = torch.FloatTensor(X_train)  # Convert training features to PyTorch tensor
X_test_tensor = torch.FloatTensor(X_test)  # Convert test features to PyTorch tensor
y_train_tensor = torch.LongTensor(y_train.values)  # Convert training labels to LongTensor (for classification)
y_test_tensor = torch.LongTensor(y_test.values)  # Convert test labels to LongTensor

# Define MLP Model
class MLP(nn.Module): # Define a class for the MLP model, inheriting from the nn.Module framework
    def __init__(self, input_size): # Initialize the model with the input size
        super(MLP, self).__init__() # Allows MLP class to initialize any properties defined in nn.Module
        self.fc1 = nn.Linear(input_size, 128) # First layer with input size (based on columns of the df) and 128 neurons (output)
        self.fc2 = nn.Linear(128, 64) # Second layer with 128 input neurons and 64 output neurons
        self.fc3 = nn.Linear(64, 2) # Output layer with 64 input neurons and 2 output classes
        self.relu = nn.ReLU() # Initializes the ReLU activaton function to introduce non-linearity into the model
    def forward(self, x): # Define the forward pass 
        x = self.relu(self.fc1(x)) # Apply the first layer and ReLU activation
        x = self.relu(self.fc2(x)) # Apply the second layer and ReLU activation
        x = self.fc3(x) # Pass through the output layer
        return x # Return the final output

# Step 3: Instantiate the model

input_size = X_train.shape[1] # Get the number of input features (columns of the df)
model = MLP(input_size) # Create an instance of the MLP model

# Step 4: Set up the loss function and optimizer

criterion = nn.CrossEntropyLoss() # Define the loss function
optimizer = optimizer.Adam(model.parameters(), lr=0.001)  # Set up the Adam optimizer with a learning rate of 0.001 (this is useful for the backward pass)

# Step 5: Train the model
num_epochs = 1000 # Determine how many epochs of the model we want
for epoch in range(num_epochs): # Loop over each epoch
    model.train # set the model to training mode
    optimizer.zero_grad()  # Clear gradients from the previous step
    outputs = model(X_train_tensor)  # Perform a forward pass to get model predictions
    loss = criterion(outputs, y_train_tensor)  # Compute the loss using the predictions and true labels
    loss.backward()  # Backpropagate the loss to compute gradients
    optimizer.step()  # Update model parameters based on computed gradients

# Step 6: Evaluate the model
model.eval() # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation for evaluation
    y_pred = model(X_test_tensor)  # Get model predictions on the test set
    _, predicted = torch.max(y_pred.data, 1)  # Get the predicted class index (0 or 1)
    accuracy = accuracy_score(y_test_tensor, predicted)  # Calculate accuracy of predictions
    print(f'Accuracy: {accuracy:.4f}')  # Print the accuracy


Accuracy: 0.8273
