In [17]:
import pandas as pd

# Load the data from the uploaded CSV file
file_path = 'feature_nans_removed.csv'
data = pd.read_csv(file_path)

# Remove the unused 'Unnamed: 0' column and ensure no NaN values in 'mood'
data = data.drop(columns=['Unnamed: 0']).dropna(subset=['mood'])


In [18]:
from sklearn.model_selection import train_test_split
import numpy as np

# Define a function to create variable-length sequences
def create_variable_sequences(df, max_steps):
    X, y = [], []
    for i in range(len(df)):
        # Calculate start index for the variable-length sequence
        start_ix = max(0, i - max_steps + 1)
        # Gather input and output parts of the pattern
        seq_x, seq_y = df.iloc[start_ix:i+1, 2:].values, df.iloc[i, 3]  # mood is at index 3
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X, dtype=object), np.array(y)

# Creating variable-length sequences using up to a 7-day history
max_steps = 7
X_var, y_var = create_variable_sequences(data, max_steps)

# Splitting the dataset into training and testing sets for variable-length sequences
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(X_var, y_var, test_size=0.2, random_state=42)

# Check the shapes and types of the resulting arrays
(X_train_var.shape, type(X_train_var[0]), X_test_var.shape, type(X_test_var[0])), (y_train_var.shape, y_test_var.shape)


(((1014,), numpy.ndarray, (254,), numpy.ndarray), ((1014,), (254,)))

In [19]:
# Converting daily mood scores to integer classes
data['mood'] = data['mood'].round().astype(int)

# Updating the mood targets in the variable-length sequences
def update_targets_with_classes(X, y, df):
    # Map mood scores to integer classes
    mood_classes = df['mood'].round().astype(int).values
    
    # Ensure only the labels corresponding to the sequences are used
    y_updated = []
    for i, seq in enumerate(X):
        # Get the index of the last item in the sequence for the label
        label_index = len(seq) - 1 + i
        if label_index < len(mood_classes):
            y_updated.append(mood_classes[label_index])
    return X, np.array(y_updated)

# Apply the updated targets to the previously created variable-length sequences
X_train_var, y_train_var_updated = update_targets_with_classes(X_train_var, y_train_var, data)
X_test_var, y_test_var_updated = update_targets_with_classes(X_test_var, y_test_var, data)

# Verify the updated target shapes and ensure they match the number of sequences
print(f"Updated y_train_var_updated shape: {y_train_var_updated.shape}")
print(f"Updated y_test_var_updated shape: {y_test_var_updated.shape}")

# Function to replace NaNs with zero in each sequence
def replace_nans(X):
    return [np.nan_to_num(np.array(seq, dtype=float), nan=0.0) for seq in X]

# Replace NaNs in X_train_var and X_test_var
X_train_var = replace_nans(X_train_var)
X_test_var = replace_nans(X_test_var)

# Verify the updated target shapes and unique classes
y_train_var_updated.shape, y_test_var_updated.shape, np.unique(y_train_var_updated), np.unique(y_test_var_updated)


Updated y_train_var_updated shape: (1014,)
Updated y_test_var_updated shape: (254,)


((1014,), (254,), array([3, 4, 5, 6, 7, 8, 9]), array([4, 5, 6, 7, 8, 9]))

In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Setting the number of LSTM units
num_lstm_units = 100

# Define the RNN Model
class MoodRNN(nn.Module):
    def __init__(self):
        super(MoodRNN, self).__init__()
        self.lstm = nn.LSTM(input_size=12, hidden_size=num_lstm_units, batch_first=True)
        self.classifier = nn.Linear(num_lstm_units, 7)  # Output layer for 7 classes (moods 3 to 9)

    def forward(self, x):
        # Forward pass through LSTM layer
        lstm_out, _ = self.lstm(x)
        # Only take the output from the final timestep
        last_time_step = lstm_out[:, -1, :]
        # Pass the last timestep output to classifier to get the mood prediction
        mood_prediction = self.classifier(last_time_step)
        return mood_prediction


print(f"X_train_var shape: {np.array([len(x) for x in X_train_var]).shape}")
print(f"y_train_var_updated shape: {y_train_var_updated.shape}")

print(f"X_test_var shape: {np.array([len(x) for x in X_test_var]).shape}")
print(f"y_test_var_updated shape: {y_test_var_updated.shape}")


# Function to create data loaders with padding
def create_padded_loader(X, y, batch_size):
    # Pad sequences to the same length
    X_padded = nn.utils.rnn.pad_sequence([torch.tensor(x, dtype=torch.float32) for x in X], batch_first=True)
    y_tensor = torch.tensor(y, dtype=torch.long) - 3  # Subtract 3 to map mood 3-9 to 0-6
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create data loaders
batch_size = 32
train_loader = create_padded_loader(X_train_var, y_train_var_updated, batch_size)
test_loader = create_padded_loader(X_test_var, y_test_var_updated, batch_size)

# Example of training loop setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MoodRNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Testing loop
model.eval()
with torch.no_grad():
    correct, total = 0, 0
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
    print(f'Accuracy: {100 * correct / total}%')


X_train_var shape: (1014,)
y_train_var_updated shape: (1014,)
X_test_var shape: (254,)
y_test_var_updated shape: (254,)
Epoch 1, Loss: 1.3717297315597534
Epoch 2, Loss: 0.9705755114555359
Epoch 3, Loss: 1.0675801038742065
Epoch 4, Loss: 1.2877821922302246
Epoch 5, Loss: 1.4800549745559692
Epoch 6, Loss: 1.0224374532699585
Epoch 7, Loss: 1.710258960723877
Epoch 8, Loss: 1.305217981338501
Epoch 9, Loss: 1.160085678100586
Epoch 10, Loss: 1.3078067302703857
Accuracy: 46.8503937007874%


In [22]:
# Check for NaNs in each sequence of X_train_var
nan_in_X_train_var = any(np.isnan(np.array(seq, dtype=float)).any() for seq in X_train_var)
print("NaN in X_train_var:", nan_in_X_train_var)

# Since y_train_var_updated should be a standard numpy array, checking for NaNs directly should work
nan_in_y_train_var = np.isnan(y_train_var_updated).any()
print("NaN in y_train_var:", nan_in_y_train_var)


NaN in X_train_var: False
NaN in y_train_var: False
