In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
# Load the full dataset once, then partition it by row position: the first
# 6000 rows are used for model fitting, every remaining row is a hold-out set.
df_full = pd.read_csv('random subset of data files/NP_ENG_EOC_FULL.csv')
df = df_full.iloc[0:6000]
# `iloc` end bounds are exclusive, so `df` stops at position 5999 and the
# hold-out slice must begin at 6000 (starting at 6001 silently dropped a row).
# The open-ended slice also avoids hard-coding the total row count.
# NOTE(review): df_test is not referenced by the later cells visible in this
# notebook -- confirm whether it was meant to be the evaluation set.
df_test = df_full.iloc[6000:]
assert len(df) + len(df_test) == len(df_full), "train/hold-out slices must cover every row"

columns = list(df.columns)
columns.remove('EOC')
columns.append('EOC')

# Reassign the DataFrame so the target column 'EOC' comes last
df = df[columns]
df

Unnamed: 0.1,Unnamed: 0,student_id,chapter_number,num_passed,engaged,length,EOC
0,0,001824fb-a2fd-431d-aef6-7a1250d97a62,1.0,24,5663330.0,131,0.921053
1,1,001824fb-a2fd-431d-aef6-7a1250d97a62,2.0,26,30902762.0,91,0.839080
2,2,001824fb-a2fd-431d-aef6-7a1250d97a62,3.0,29,34127489.0,65,0.769231
3,3,001824fb-a2fd-431d-aef6-7a1250d97a62,5.0,12,15971898.0,71,0.777778
4,4,001824fb-a2fd-431d-aef6-7a1250d97a62,6.0,19,17611781.0,73,0.697917
...,...,...,...,...,...,...,...
5995,5995,a3ac1e2d-7dc6-4ff1-a8df-bb6ff0f52e67,12.0,14,6230812.0,40,0.560000
5996,5996,a41d13e7-0e8b-402c-be3c-26313c7d28ce,1.0,24,5341326.0,47,0.945946
5997,5997,a41d13e7-0e8b-402c-be3c-26313c7d28ce,2.0,26,11551124.0,88,0.879518
5998,5998,a41d13e7-0e8b-402c-be3c-26313c7d28ce,3.0,29,10541477.0,115,0.816327


In [3]:
# Step 1: Preprocessing
# `df` is the DataFrame prepared in the loading cell.
# Feature matrix: the two predictor columns, as a NumPy array.
X = df.loc[:, ['num_passed', 'engaged']].values
# Target vector: the EOC accuracy column, as a NumPy array.
y = df.loc[:, 'EOC'].values
# float32 tensor built from the target values
y_tensor = torch.tensor(y, dtype=torch.float32)

In [4]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only,
# and the fitted transform is then applied to the test rows.
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [5]:
# Wrap each NumPy split in a float32 PyTorch tensor
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, y_train, X_test, y_test)
)

In [6]:
# Step 2: Model Definition
class RegressionModel(nn.Module):
    """Feed-forward regressor with one ReLU hidden layer.

    The defaults reproduce the configuration used in this notebook
    (2 input features, 50 hidden units, 1 output value).

    Args:
        in_features: Number of input features per sample.
        hidden_size: Number of units in the hidden layer.
        out_features: Number of values predicted per sample.
    """

    def __init__(self, in_features=2, hidden_size=50, out_features=1):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)   # input -> hidden
        self.fc2 = nn.Linear(hidden_size, out_features)  # hidden -> output

    def forward(self, x):
        """Map a (..., in_features) tensor to (..., out_features) predictions."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

In [7]:
# Seed PyTorch's RNG before the model is built so the random weight
# initialisation is repeatable across kernel restarts (the data split is
# already seeded; the model initialisation previously was not).
torch.manual_seed(42)

# Instantiate the model
model = RegressionModel()

# Step 3: Training
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = optim.Adam(model.parameters(), lr=0.01)  # Adam optimizer

In [8]:
# Training loop: every epoch is one full-batch update on the training tensors.
# Switch to training mode explicitly so re-running this cell after the
# evaluation cell (which calls model.eval()) does not train in eval mode.
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    # Drop only the trailing output axis: (N, 1) -> (N,). A bare .squeeze()
    # would also collapse the batch axis when N == 1.
    loss = criterion(outputs.squeeze(-1), y_train_tensor)
    loss.backward()
    optimizer.step()

    # Report every 100th epoch; the loss shown was computed before this
    # epoch's optimizer step.
    if epoch % 100 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.8044472932815552
Epoch 101, Loss: 0.030375290662050247
Epoch 201, Loss: 0.02834305353462696
Epoch 301, Loss: 0.027546482160687447
Epoch 401, Loss: 0.027247285470366478
Epoch 501, Loss: 0.02707507833838463
Epoch 601, Loss: 0.026948878541588783
Epoch 701, Loss: 0.02682209573686123
Epoch 801, Loss: 0.02670600824058056
Epoch 901, Loss: 0.02662145346403122


In [9]:
# Step 4: Evaluation on the held-out split
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No gradients are needed for scoring
    predicted = model(X_test_tensor)
    # Drop only the trailing output axis so the prediction shape matches the
    # 1-D target even for a single-row test set.
    loss = criterion(predicted.squeeze(-1), y_test_tensor)
print(f'Test Loss: {loss.item()}')

Test Loss: 0.02559182234108448


In [10]:
# Load the rows to be scored by the trained model.
# Alternative input tried previously: 'Random subset of data files/NP_ENG_EOC.csv'
# NOTE(review): the directory name here starts with a capital 'R', while the
# first read of this file in the notebook uses a lowercase 'r'. On a
# case-sensitive filesystem only one spelling can resolve -- confirm which.
new_df = pd.read_csv('Random subset of data files/NP_ENG_EOC_FULL.csv')

# Move the target column 'EOC' to the end of the column order
columns = list(new_df.columns)
columns.append(columns.pop(columns.index('EOC')))
new_df = new_df[columns]

In [11]:
# Labels (EOC accuracy) for the rows being scored.
# Kept under `new_y` instead of `y` so the training labels defined earlier are
# not overwritten; otherwise re-running the train/test split cell would pair
# the training feature matrix `X` with this differently sized label array.
new_y = new_df['EOC'].values
y_tensor = torch.tensor(new_y, dtype=torch.float32)

In [12]:
# Step 5: Prediction
# Score every row of `new_df` with the trained model.
new_X = new_df[['num_passed', 'engaged']].values
new_X = scaler.transform(new_X)  # Reuse the scaler fitted on the training split
new_X_tensor = torch.tensor(new_X, dtype=torch.float32)

model.eval()  # Make sure the model is in eval mode
with torch.no_grad():  # Turn off gradients for prediction
    predictions = model(new_X_tensor)
    # Drop only the trailing output axis so the shape matches the 1-D labels.
    loss2 = criterion(predictions.squeeze(-1), y_tensor)
# NOTE(review): `new_df` appears to be read from the same file the training
# rows were sliced from, so this loss likely includes rows the model was
# fitted on -- confirm whether a held-out subset was intended here.
print(f'Test Loss 2: {loss2.item()}')

Test Loss 2: 0.02587650902569294


In [13]:
# Attach the predictions to `new_df` so they sit next to the actual EOC values.

# Take the single output column of the prediction tensor as a 1-D NumPy array.
# Assigning the raw array is positional; wrapping it in a fresh pandas Series
# would instead align on index labels and produce NaN for any row of `new_df`
# whose index label is not in 0..n-1.
np_preds = predictions[:, 0].numpy()

new_df['Pred_EOC'] = np_preds

In [14]:
# Absolute percentage error per row: |predicted - actual| / actual.
# Rows whose actual EOC equals 0 are excluded to avoid dividing by zero.
# `.copy()` makes the filtered frame independent of `new_df`, so adding a
# column below does not raise pandas' SettingWithCopyWarning.
new_df_no0s = new_df[new_df['EOC'] != 0].copy()

new_df_no0s['perc_error'] = (
    (new_df_no0s['Pred_EOC'] - new_df_no0s['EOC']).abs() / new_df_no0s['EOC']
)
new_df_no0s

Unnamed: 0.1,Unnamed: 0,student_id,chapter_number,num_passed,engaged,length,EOC,Pred_EOC,perc_error
0,0,001824fb-a2fd-431d-aef6-7a1250d97a62,1.0,24,5663330.0,131,0.921053,0.784048,0.148748
1,1,001824fb-a2fd-431d-aef6-7a1250d97a62,2.0,26,30902762.0,91,0.839080,0.673861,0.196905
2,2,001824fb-a2fd-431d-aef6-7a1250d97a62,3.0,29,34127489.0,65,0.769231,0.590062,0.232919
3,3,001824fb-a2fd-431d-aef6-7a1250d97a62,5.0,12,15971898.0,71,0.777778,0.722797,0.070689
4,4,001824fb-a2fd-431d-aef6-7a1250d97a62,6.0,19,17611781.0,73,0.697917,0.666611,0.044856
...,...,...,...,...,...,...,...,...,...
9234,9234,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,6.0,19,16650582.0,56,0.500000,0.663463,0.326925
9235,9235,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,9.0,13,14206097.0,91,0.611650,0.712343,0.164624
9236,9236,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,10.0,12,8793743.0,40,0.469697,0.669946,0.426336
9237,9237,ff6ec9fe-de1d-4b45-8136-59465d9c85ab,11.0,16,10646661.0,54,0.518519,0.675364,0.302488


In [15]:
# Mean of the per-row percentage errors
new_df_no0s.loc[:, 'perc_error'].mean()

0.2573226272473809