### 1. Load and inspect data

In [32]:
import pandas as pd

# load csv file using pandas
data_path = 'data.csv'
data = pd.read_csv(data_path)           # data is a dataframe/2d tabular representation

print(data)

    x_0  x_1  x_2  y
0  1.00    0    0  0
1  0.00    0    5  0
2  1.00    1    3  1
3  0.00    1    1  0
4  0.00    1    1  1
5  0.00    1    1  0
6  3.71    0    1  1
7  1.10    0    1  0
8  1.00    0    0  1
9  1.00    1    1  0


In [33]:
# check number of data points (rows) and number of features (columns except target 'y')

print(data.shape)  # returns tuple (rows, columns)

(10, 4)


- Number of rows: 10
- Number of columns: 4
<br><br>
- Number of features: 3 (since 'y' is a column in this case)
- Number of data points: rows x features 
    - 3 x 10 : 30 data points
<br><br>

Features: measurable properties/attributes we can use to predict


### Range of features

- By knowing the range of each feature, we can apply proper normalization (e.g., min-max scaling or standardization) to ensure all features contribute proportionately during training
    - For ex., if the range of one feature is 10 times larger than that of another, then during loss minimization, the gradients associated with the larger-scaled feature will likely be larger. This disproportion can cause the optimization process to overemphasize that feature, even though that feature might not actually be too influential in the prediction, potentially skewing weight updates and adversely affecting the overall training process

    

In [34]:
# determine range of each feature

# range: max - min

features_columns = [col for col in data if col != 'y']
feature_ranges = {}
for feature in features_columns:
    min_val = data[feature].min()
    max_val = data[feature].max()
    feature_ranges[feature] = float(max_val - min_val)

print("range of features: ")
for feature in feature_ranges.items():
    print(feature)

range of features: 
('x_0', 3.71)
('x_1', 1.0)
('x_2', 5.0)


### Model and package selection

- Because the target column consists of 0s and 1s, this is likely a binary classification problem (predicting y from x features)
    - Use a feedforward neural network
- Use  pytorch for defining the model, training, evals
- Use the scikit-learn package to split the data

In [35]:
# -------------------------
# prepare the data for pytorch
# -------------------------

import torch
import numpy as np

# separate features and target, converting them to numpy arrays with type float32
features_values = data[features_columns].values
target_values = data['y'].values 

# convert the numpy arrays to pytorch tensors
tensor_features = torch.Tensor(features_values)
tensor_target = torch.Tensor(target_values)

### Split the data (80% train, 20% test)

In [36]:
from sklearn.model_selection import train_test_split


# split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(features_values, target_values, test_size=0.2, random_state=42) 

tensor_x_train = torch.Tensor(x_train)
tensor_x_test = torch.Tensor(x_test)
tensor_y_train = torch.Tensor(y_train)
tensor_y_test = torch.Tensor(y_test)

### Normalization

In [37]:
from sklearn.preprocessing import StandardScaler

# initialize the scaler and fit it only on the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

### Create pytorch datasets and dataloaders for the train and test sets

TODO:
- Why? also know exactly what that code is doing
- what is batch size? why 2?
- why set shuffle to true for train and false for test?

In [38]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
test_dataset = TensorDataset(tensor_x_test, tensor_y_test)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)        
test_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=False)      

### Build the neural network

TODO:
- how to determine number of hidden layers? and number of neurons they each take? 
- why relu?

In [39]:
import torch.nn as nn

class NeuralNetwork(nn.Module):
    def __init__(self, input_features=3):
        super().__init__()

        # number of features = number of input neurons

        # Input layer (3 features) 
            # -> Hidden layer1 (10 neurons) 
            # -> Hidden layer2 (5 neurons) 
            # -> output (2 neurons, 1 output value)

        self.model = nn.Sequential(
            nn.Linear(input_features, 10),   # first hidden layer with 10 neurons
            nn.ReLU(),                       # activation function RELU: max(0, x) 
            nn.Linear(10, 5),                # second hidden layer with 5 neurons
            nn.ReLU(),
            nn.Linear(5, 1),                 # output layer
            nn.Sigmoid()                     # final activation for binary classification (probabilities between 0 and 1)
        )  
        
    def forward(self, x): 
        return self.model(x)


- why BCEloss?
- why adam optimizer?

In [40]:
import torch.optim as optim

# intialize the model, loss function, and optimizer

input_dim = len(features_columns)
model = NeuralNetwork(input_dim)
loss_criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

### Train method to train the model & print training accuracy

- what is epoch? how to determine its value?
- what is batch?
- what is .zero_grad()?
- determine what EACH line of code is doing in this

In [43]:
def train_model(model, dataloader, loss_criterion, optimizer, num_epochs=50):
    model.train()   # set to training mode


    for epoch in range(num_epochs):
        total_loss = 0
        total += 1
        for batch_features, batch_labels in dataloader:
            optimizer.zero_grad()                           # reset gradients
            y_pred = model(batch_features)                  # forward pass using batch data
            loss = loss_criterion(y_pred.squeeze(), batch_labels)  # compute loss using batch data
            loss.backward()                                 # backpropagation
            optimizer.step()                                # update weights
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)

        model.eval() # set to eval mode
        correct = 0
        total = 0
        with torch.no_grad():
            for features, labels in test_dataloader:
                outputs = model(features)
                preds = outputs
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        accuracy = correct /total


        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")


train_model(model, train_dataloader, loss_criterion, optimizer, num_epochs=50)

UnboundLocalError: cannot access local variable 'total' where it is not associated with a value

- How can this be done better? That improves accuracy? (add those changes as comments)
    - bigger scale/more layers/neurons
    - dropout... (LEARN WHAT THIS IS)
    - 

- If this was time series how would you take that into account?

In [44]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# -------------------------
# load and inspect the data
# -------------------------
data_path = 'data.csv'
data = pd.read_csv(data_path)  # load csv file into a dataframe

# print the dataframe
print(data)

# print the number of data points (rows) and number of features (columns)
print("data shape (rows, columns):", data.shape)

# assume that the target column is named 'y'
features_columns = [col for col in data.columns if col != 'y']

# calculate the range (max - min) for each feature
feature_ranges = {}
for feature in features_columns:
    min_val = data[feature].min()
    max_val = data[feature].max()
    feature_ranges[feature] = float(max_val - min_val)

print("range of features:")
for feature, rng in feature_ranges.items():
    print(f"{feature}: {rng}")

# -------------------------
# prepare the data for pytorch
# -------------------------
# separate features and target and convert to numpy arrays
features_values = data[features_columns].values.astype(np.float32)
target_values = data['y'].values.astype(np.float32)

# split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    features_values, target_values, test_size=0.2, random_state=42
)

# convert numpy arrays to pytorch tensors
tensor_x_train = torch.tensor(x_train)
tensor_x_test = torch.tensor(x_test)
tensor_y_train = torch.tensor(y_train)
tensor_y_test = torch.tensor(y_test)

# create pytorch datasets and dataloaders
train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
test_dataset = TensorDataset(tensor_x_test, tensor_y_test)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# fix: use test_dataset for the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# -------------------------
# build the initial neural network model
# -------------------------
class NeuralNetwork(nn.Module):
    def __init__(self, input_features):
        super().__init__()
        # simple feed-forward network:
        # input -> hidden1 (10 neurons) -> hidden2 (5 neurons) -> output (1 neuron)
        self.model = nn.Sequential(
            nn.Linear(input_features, 10),  # first hidden layer
            nn.ReLU(),                      # activation function
            nn.Linear(10, 5),               # second hidden layer
            nn.ReLU(),                      # activation function
            nn.Linear(5, 1),                # output layer
            nn.Sigmoid()                    # sigmoid for binary classification
        )

    def forward(self, x):
        return self.model(x)

input_dim = len(features_columns)
model = NeuralNetwork(input_dim)

# define loss function and optimizer
loss_criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# -------------------------
# training function that prints loss and accuracy after each epoch
# -------------------------
def train_model(model, train_loader, test_loader, loss_fn, optimzr, num_epochs=50):
    for epoch in range(num_epochs):
        model.train()  # set model to training mode
        total_loss = 0
        
        # training loop
        for batch_features, batch_labels in train_loader:
            optimzr.zero_grad()  # reset gradients
            predictions = model(batch_features)  # forward pass
            loss = loss_fn(predictions.squeeze(), batch_labels)
            loss.backward()  # backpropagation
            optimzr.step()   # update weights
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        
        # evaluation on test set for accuracy
        model.eval()  # set to evaluation mode
        correct = 0
        total = 0
        with torch.no_grad():
            for features, labels in test_loader:
                outputs = model(features)
                # convert probabilities to binary predictions
                preds = (outputs.squeeze() >= 0.5).float()
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        accuracy = correct / total
        
        print(f"epoch {epoch+1}/{num_epochs}, average loss: {avg_loss:.4f}, test accuracy: {accuracy:.4f}")

print("training initial model...")
train_model(model, train_dataloader, test_dataloader, loss_criterion, optimizer, num_epochs=50)

# -------------------------
# build an improved model (example: adding dropout to reduce overfitting)
# -------------------------
class ImprovedNeuralNetwork(nn.Module):
    def __init__(self, input_features):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_features, 16),   # increase neurons in first layer
            nn.ReLU(),
            nn.Dropout(0.2),                 # dropout to prevent overfitting
            nn.Linear(16, 8),                # hidden layer with dropout effect
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(8, 1),                 # output layer
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# initialize improved model, loss function, and optimizer
improved_model = ImprovedNeuralNetwork(input_dim)
improved_optimizer = optim.Adam(improved_model.parameters(), lr=0.01)

print("\ntraining improved model...")
train_model(improved_model, train_dataloader, test_dataloader, loss_criterion, improved_optimizer, num_epochs=50)

# -------------------------
# handling time series data
# -------------------------
# if the data were a time series, we would need to account for the sequential order.
# potential approaches include:
#
# - using recurrent neural networks (rnn, lstm, or gru) to capture temporal dependencies.
# - designing the input so that it includes lagged observations (e.g., sliding windows)
# - using transformer-based models that are designed for sequential data.
#
# for example, an lstm-based model might look like:
#
# class LSTMModel(nn.Module):
#     def __init__(self, input_features, hidden_size, num_layers):
#         super().__init__()
#         self.lstm = nn.LSTM(input_features, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, 1)
#         self.sigmoid = nn.Sigmoid()
#
#     def forward(self, x):
#         # assume x has shape (batch, sequence_length, input_features)
#         lstm_out, _ = self.lstm(x)
#         # take output of last time step
#         last_output = lstm_out[:, -1, :]
#         out = self.fc(last_output)
#         return self.sigmoid(out)
#
# the training procedure would then need to handle sequential batches accordingly.


    x_0  x_1  x_2  y
0  1.00    0    0  0
1  0.00    0    5  0
2  1.00    1    3  1
3  0.00    1    1  0
4  0.00    1    1  1
5  0.00    1    1  0
6  3.71    0    1  1
7  1.10    0    1  0
8  1.00    0    0  1
9  1.00    1    1  0
data shape (rows, columns): (10, 4)
range of features:
x_0: 3.71
x_1: 1.0
x_2: 5.0
training initial model...
epoch 1/50, average loss: 0.6708, test accuracy: 0.5000
epoch 2/50, average loss: 0.6662, test accuracy: 0.5000
epoch 3/50, average loss: 0.6603, test accuracy: 0.5000
epoch 4/50, average loss: 0.6571, test accuracy: 0.5000
epoch 5/50, average loss: 0.6499, test accuracy: 0.5000
epoch 6/50, average loss: 0.6445, test accuracy: 0.5000
epoch 7/50, average loss: 0.6406, test accuracy: 0.5000
epoch 8/50, average loss: 0.6368, test accuracy: 0.5000
epoch 9/50, average loss: 0.6305, test accuracy: 0.5000
epoch 10/50, average loss: 0.6237, test accuracy: 0.5000
epoch 11/50, average loss: 0.6185, test accuracy: 0.5000
epoch 12/50, average loss: 0.6096, test ac

In [52]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# Load and preprocess
data = pd.read_csv('data.csv')
data = data.fillna(data.median())
X = data.drop('y', axis=1).values.astype(np.float32)
y = data['y'].values.astype(np.float32)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

# Create dataloaders
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_dataset = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Initialize
model = NeuralNetwork(X.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train
for epoch in range(20):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch).squeeze()
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        train_loss = loss.item()
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')
        
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch).squeeze()
            predicted = (y_pred >= 0.5).float()
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
        accuracy = 100 * correct/total
        print(f'Epoch {epoch+1}, Accuracy: {accuracy:.2f}%')

print(f'\nFinal accuracy: {accuracy:.2f}%')

Epoch 1, Train Loss: 0.6779
Epoch 1, Accuracy: 50.00%
Epoch 2, Train Loss: 0.6871
Epoch 2, Accuracy: 50.00%
Epoch 3, Train Loss: 0.6810
Epoch 3, Accuracy: 50.00%
Epoch 4, Train Loss: 0.6795
Epoch 4, Accuracy: 50.00%
Epoch 5, Train Loss: 0.6774
Epoch 5, Accuracy: 50.00%
Epoch 6, Train Loss: 0.6798
Epoch 6, Accuracy: 50.00%
Epoch 7, Train Loss: 0.6736
Epoch 7, Accuracy: 50.00%
Epoch 8, Train Loss: 0.6749
Epoch 8, Accuracy: 50.00%
Epoch 9, Train Loss: 0.6718
Epoch 9, Accuracy: 50.00%
Epoch 10, Train Loss: 0.6687
Epoch 10, Accuracy: 50.00%
Epoch 11, Train Loss: 0.6581
Epoch 11, Accuracy: 50.00%
Epoch 12, Train Loss: 0.6681
Epoch 12, Accuracy: 50.00%
Epoch 13, Train Loss: 0.6690
Epoch 13, Accuracy: 50.00%
Epoch 14, Train Loss: 0.6666
Epoch 14, Accuracy: 50.00%
Epoch 15, Train Loss: 0.6575
Epoch 15, Accuracy: 50.00%
Epoch 16, Train Loss: 0.6571
Epoch 16, Accuracy: 50.00%
Epoch 17, Train Loss: 0.6607
Epoch 17, Accuracy: 50.00%
Epoch 18, Train Loss: 0.6524
Epoch 18, Accuracy: 50.00%
Epoch 19, 

In [None]:
print(data['y'].value_counts())  # Verify class distribution