In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
#split df_full into df for training/testing and new_df for evaluation
df_full = pd.read_csv('random subset of data files/NP_ENG_EOC_FULL.csv')
df = df_full.iloc[0:8000]
new_df = df_full.iloc[8001:9238]
new_df = new_df.reset_index(drop=True)

In [3]:
#Reorganize columns df
columns = list(df.columns)
columns.remove('EOC')
columns.append('EOC')

# Reassign the DataFrame to follow the new column order
df = df[columns]

In [4]:
# Assuming 'df' is your pandas DataFrame with the data
# Step 1: Preprocessing
# Separate features and labels
X = df[['num_passed', 'engaged', 'length']].values  # Features
y = df['EOC'].values  # Labels (EOC accuracy)
y_tensor = torch.tensor(y, dtype=torch.float32)

In [5]:
# Split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [7]:
# Step 2: Model Definition
class RegressionModel(nn.Module):
    def __init__(self):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(3, 50)  # 3 input features
        self.fc2 = nn.Linear(50, 1)  # Output 1 value (EOC accuracy)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [8]:
# Instantiate the model
model = RegressionModel()

# Step 3: Training
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

In [9]:
# Training loop
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs.squeeze(), y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.5349588990211487
Epoch 101, Loss: 0.0530269518494606
Epoch 201, Loss: 0.034562185406684875
Epoch 301, Loss: 0.029647935181856155
Epoch 401, Loss: 0.02824144996702671
Epoch 501, Loss: 0.02759462781250477
Epoch 601, Loss: 0.027220698073506355
Epoch 701, Loss: 0.02695312909781933
Epoch 801, Loss: 0.02673872746527195
Epoch 901, Loss: 0.02654481679201126


In [10]:
# Step 4: Evaluation
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Turn off gradients for validation
    predicted = model(X_test_tensor)
    loss = criterion(predicted.squeeze(), y_test_tensor)
print(f'Test Loss: {loss.item()}')

Test Loss: 0.027308469638228416


In [11]:
# Step 4.5: Evaluate on new_df

#reassign columns of new_df
columns = list(new_df.columns)
columns.remove('EOC')
columns.append('EOC')

# Reassign the DataFrame to follow the new column order
new_df = new_df[columns]
#new_df

In [12]:
# Step 5: Prediction
# Let's say you have new data in 'new_data' DataFrame
new_X = new_df[['num_passed', 'engaged', 'length']].values
new_X = scaler.transform(new_X)  # Normalize the new data
new_X_tensor = torch.tensor(new_X, dtype=torch.float32)

new_Y = new_df[['EOC']].values
new_Y_tensor = torch.tensor(new_Y, dtype=torch.float32)

model.eval()  # Make sure the model is in eval mode
with torch.no_grad():  # Turn off gradients for prediction
    predictions = model(new_X_tensor)
    loss2 = criterion(predictions.squeeze(), new_Y_tensor)
print(f'Test Loss 2: {loss2.item()}')

Test Loss 2: 0.05086226388812065


  return F.mse_loss(input, target, reduction=self.reduction)


In [13]:
#Visualize the predicted vs actual values

# Convert the tensor to a NumPy array and then to a pandas Series
np_preds = pd.Series(predictions[:,0].numpy())
new_df['Pred_EOC'] = np_preds

new_df

Unnamed: 0.1,Unnamed: 0,student_id,chapter_number,num_passed,engaged,length,EOC,Pred_EOC
0,8001,dcac4b3f-adfe-4ce6-be41-163e05f7703e,2.0,26,8646928.0,98,0.950617,0.737392
1,8002,dcac4b3f-adfe-4ce6-be41-163e05f7703e,3.0,29,11188197.0,99,0.938776,0.756085
2,8003,dcac4b3f-adfe-4ce6-be41-163e05f7703e,5.0,12,9955761.0,84,0.868852,0.729665
3,8004,dcac4b3f-adfe-4ce6-be41-163e05f7703e,6.0,19,10342172.0,75,0.907216,0.672347
4,8005,dcac4b3f-adfe-4ce6-be41-163e05f7703e,9.0,13,3481849.0,64,0.760000,0.535543
...,...,...,...,...,...,...,...,...
1232,9233,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,5.0,12,9989833.0,48,0.641791,0.665848
1233,9234,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,6.0,19,16650582.0,56,0.500000,0.664800
1234,9235,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,9.0,13,14206097.0,91,0.611650,0.697592
1235,9236,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,10.0,12,8793743.0,40,0.469697,0.636404


In [14]:
#calculate the percentage error: (predicted - actual) / actual
new_df_no0s = new_df[new_df['EOC'] != 0]

new_df_no0s['perc_error'] = (   abs(new_df_no0s['Pred_EOC'] - new_df_no0s['EOC'])   ) / new_df_no0s['EOC']
df = new_df_no0s

In [15]:
df['perc_error'].mean()

0.24752948805150898

### Future improvements
1. Tune Hyperparameters (k fold cross validation)

For each combination of hyperparameters:

Split your dataset into k folds.
For each fold:
Set aside one fold as the validation set, and combine the remaining k-1 folds into the training set.
Train the model on the training set.
Evaluate the model on the validation set.
Calculate the average performance across all k folds.
   
3. More input dimensions
4. Different model (that is better for predicting this kind of data, not a CNN)

If you want a model that is robust, can handle non-linear patterns, and you have enough data, Random Forests might be a good starting point.
If you believe the relationship in the data is highly non-linear and complex and you have the computational resources to fine-tune the model, an SVM with a non-linear kernel might be appropriate.
If your dataset is not too large, the features have meaningful proximity relations, and computational efficiency is not a concern, you could consider KNN Regression.

It's generally a good idea to try multiple models and use cross-validation to compare their performance based on a metric like RMSE (Root Mean Square Error) or MAE (Mean Absolute Error). The model that shows the best performance on a validation set (or via cross-validation) is often chosen as the best model for that specific dataset and task.



In [1]:
#Cross Validation ChatGPT code
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import Subset, DataLoader
from sklearn.metrics import mean_squared_error

# Assuming X and y are your features and labels as numpy arrays
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define hyperparameter grid (this is a simple example)
learning_rates = [0.001, 0.01, 0.1]
num_epochs_options = [100, 200]

best_hyperparams = None
best_validation_score = float('inf')

for lr in learning_rates:
    for epochs in num_epochs_options:
        validation_scores = []
        for train_index, val_index in kf.split(X):
            # Split data
            X_train_fold, X_val_fold = X[train_index], X[val_index]
            y_train_fold, y_val_fold = y[train_index], y[val_index]

            # Convert to PyTorch datasets, then to DataLoader if necessary
            # For simplicity, this part is omitted, but you would convert
            # the subsets to DataLoader objects here.

            # Initialize model, criterion, and optimizer with current hyperparams
            model = RegressionModel()  # Your model
            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # Train the model on this fold
            # (Training loop is omitted for brevity)

            # Evaluate the model on the validation fold
            # (Evaluation code is omitted for brevity)
            
            # Compute validation score - e.g., mean squared error
            validation_score = mean_squared_error(y_val_true, y_val_pred)
            validation_scores.append(validation_score)

        # Compute the average validation score for this hyperparameter combination
        avg_validation_score = np.mean(validation_scores)
        if avg_validation_score < best_validation_score:
            best_validation_score = avg_validation_score
            best_hyperparams = {'lr': lr, 'epochs': epochs}

print(f"Best Hyperparameters: {best_hyperparams}")

NameError: name 'X' is not defined