In [126]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Load the data
data = pd.read_csv('feature_nans_removed.csv')
data = data.drop(columns=['Unnamed: 0']).dropna(subset=['mood'])

# Function to categorize mood into 5 bins
def categorize_mood(mood_scores):
    bins = [2.5, 4.5, 6.5, 7.5, 8.5, 10] # Adjust bins?
    return pd.cut(mood_scores, bins=bins, labels=[0, 1, 2, 3, 4], right=False)

# Apply categorization
data['mood'] = categorize_mood(data['mood'])

data = data.dropna()


In [127]:
from sklearn.model_selection import train_test_split
import numpy as np

# Define a function to create variable-length sequences
def create_variable_sequences(df, max_steps):
    X, y = [], []
    for i in range(len(df)):
        # Calculate start index for the variable-length sequence
        start_ix = max(0, i - max_steps + 1)
        # Gather input and output parts of the pattern
        seq_x, seq_y = df.iloc[start_ix:i+1, 2:].values, df.iloc[i, 3]  # mood is at index 3
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X, dtype=object), np.array(y)

# Creating variable-length sequences using up to a 7-day history
max_steps = 7
X_var, y_var = create_variable_sequences(data, max_steps)

# Splitting the dataset into training and testing sets for variable-length sequences
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(X_var, y_var, test_size=0.2, random_state=42)

# Function to remove sequences containing NaNs
def remove_sequences_with_nans(sequences):
    return [seq for seq in sequences if not np.isnan(np.array(seq)).any()]

# Remove problematic sequences
X_train_var_clean = remove_sequences_with_nans(X_train_var)
y_train_var_clean = [y_train_var[i] for i in range(len(X_train_var)) if i not in [17]] 

X_test_var_clean = remove_sequences_with_nans(X_test_var)
y_test_var_clean = [y_test_var[i] for i in range(len(X_test_var)) if i not in [5]] 

In [128]:
# Ensure the update_targets_with_classes function uses the existing categorical labels directly
def update_targets_with_classes(X, y, df):
    # Mood classes are already categorized as integers (labels 0 to 4)
    mood_classes = df['mood'].values
    
    # Ensure only the labels corresponding to the sequences are used
    y_updated = []
    for i, seq in enumerate(X):
        # Get the index of the last item in the sequence for the label
        label_index = len(seq) - 1 + i
        if label_index < len(mood_classes):
            y_updated.append(mood_classes[label_index])
    return X, np.array(y_updated)

# Apply the updated targets to the previously created variable-length sequences
X_train_var, y_train_var_updated = update_targets_with_classes(X_train_var, y_train_var, data)
X_test_var, y_test_var_updated = update_targets_with_classes(X_test_var, y_test_var, data)

# Print updated shapes of targets and check for consistency and correct mappings
print(f"Updated y_train_var_updated shape: {y_train_var_updated.shape}")
print(f"Updated y_test_var_updated shape: {y_test_var_updated.shape}")

# Function to replace NaNs with zero in each sequence, which is already defined, so applying it here
X_train_var_clean = replace_nans(X_train_var)
X_test_var_clean = replace_nans(X_test_var)

# Check the shapes and unique classes in the updated target arrays to ensure correct application
print(f"Training labels shape and unique classes: {y_train_var_updated.shape}, {np.unique(y_train_var_updated)}")
print(f"Testing labels shape and unique classes: {y_test_var_updated.shape}, {np.unique(y_test_var_updated)}")


Updated y_train_var_updated shape: (918,)
Updated y_test_var_updated shape: (230,)
Training labels shape and unique classes: (918,), [0 1 2 3 4]
Testing labels shape and unique classes: (230,), [0 1 2 3 4]


In [129]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Setting the number of LSTM units
num_lstm_units = 100

# Define the RNN Model
class MoodRNN(nn.Module):
    def __init__(self):
        super(MoodRNN, self).__init__()
        self.lstm = nn.LSTM(input_size=12, hidden_size=num_lstm_units, batch_first=True)
        self.classifier = nn.Linear(num_lstm_units, 5)  # Output layer for 5 classes

    def forward(self, x):
        # Forward pass through LSTM layer
        lstm_out, _ = self.lstm(x)
        # Only take the output from the final timestep
        last_time_step = lstm_out[:, -1, :]
        # Pass the last timestep output to classifier to get the mood prediction
        mood_prediction = self.classifier(last_time_step)
        return mood_prediction

# Print array shapes to confirm correct setup
print(f"X_train_var shape: {np.array([len(x) for x in X_train_var]).shape}")
print(f"y_train_var_updated shape: {y_train_var_updated.shape}")
print(f"X_test_var shape: {np.array([len(x) for x in X_test_var]).shape}")
print(f"y_test_var_updated shape: {y_test_var_updated.shape}")

# Function to create data loaders with padding
def create_padded_loader(X, y, batch_size):
    # Pad sequences to the same length
    X_padded = nn.utils.rnn.pad_sequence([torch.tensor(x, dtype=torch.float32) for x in X], batch_first=True)
    y_tensor = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create data loaders
batch_size = 32
train_loader = create_padded_loader(X_train_var, y_train_var_updated, batch_size)
test_loader = create_padded_loader(X_test_var, y_test_var_updated, batch_size)

# Example of training loop setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MoodRNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Re-initialize model to reset any corrupted weights
model = MoodRNN().to(device)

# Adding weight initialization
for name, param in model.named_parameters():
    if 'bias' in name:
        nn.init.constant_(param, 0.0)
    elif 'weight' in name:
        nn.init.xavier_uniform_(param)

# Training loop with additional diagnostics
model.train()
for epoch in range(10):
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        if torch.isnan(loss):
            print(f"NaN loss detected at epoch {epoch+1}")
            break
        loss.backward()
        optimizer.step()
    if torch.isnan(loss):
        break
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Evaluate model again after adjustments
model.eval()
all_predictions = []
all_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Calculate precision, recall, and F1-score
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_predictions, target_names=[f'Mood {i}' for i in range(5)]))


X_train_var shape: (918,)
y_train_var_updated shape: (918,)
X_test_var shape: (230,)
y_test_var_updated shape: (230,)
Epoch 1, Loss: 1.401260256767273
Epoch 2, Loss: 1.3616002798080444
Epoch 3, Loss: 1.587344765663147
Epoch 4, Loss: 1.4762881994247437
Epoch 5, Loss: 1.3855551481246948
Epoch 6, Loss: 1.4724159240722656
Epoch 7, Loss: 1.4539262056350708
Epoch 8, Loss: 1.5380122661590576
Epoch 9, Loss: 1.4402896165847778
Epoch 10, Loss: 1.2989252805709839
              precision    recall  f1-score   support

      Mood 0       0.00      0.00      0.00         4
      Mood 1       0.00      0.00      0.00        35
      Mood 2       0.51      0.70      0.59       122
      Mood 3       0.17      0.01      0.03        68
      Mood 4       0.00      0.00      0.00         1

    accuracy                           0.37       230
   macro avg       0.13      0.14      0.12       230
weighted avg       0.32      0.37      0.32       230



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
