In [28]:
import pandas as pd

# Load the data from the uploaded CSV file
file_path = 'feature_nans_removed.csv'
data = pd.read_csv(file_path)

# Remove the unused 'Unnamed: 0' column and ensure no NaN values in 'mood'
data = data.drop(columns=['Unnamed: 0']).dropna(subset=['mood'])


In [29]:
from sklearn.model_selection import train_test_split
import numpy as np

# Define a function to create variable-length sequences
def create_variable_sequences(df, max_steps):
    X, y = [], []
    for i in range(len(df)):
        # Calculate start index for the variable-length sequence
        start_ix = max(0, i - max_steps + 1)
        # Gather input and output parts of the pattern
        seq_x, seq_y = df.iloc[start_ix:i+1, 2:].values, df.iloc[i, 3]  # mood is at index 3
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X, dtype=object), np.array(y)

# Creating variable-length sequences using up to a 7-day history
max_steps = 7
X_var, y_var = create_variable_sequences(data, max_steps)

# Splitting the dataset into training and testing sets for variable-length sequences
X_train_var, X_test_var, y_train_var, y_test_var = train_test_split(X_var, y_var, test_size=0.2, random_state=42)

# Check the shapes and types of the resulting arrays
(X_train_var.shape, type(X_train_var[0]), X_test_var.shape, type(X_test_var[0])), (y_train_var.shape, y_test_var.shape)


(((1014,), numpy.ndarray, (254,), numpy.ndarray), ((1014,), (254,)))

In [30]:
# Converting daily mood scores directly to float
data['mood'] = data['mood'].astype(float)

# Updating the mood targets in the variable-length sequences for regression
def update_targets_for_regression(X, y, df):
    # Mood scores as continuous values
    mood_scores = df['mood'].values
    
    # Ensure only the labels corresponding to the sequences are used
    y_updated = []
    for i, seq in enumerate(X):
        # Get the index of the last item in the sequence for the label
        label_index = len(seq) - 1 + i
        if label_index < len(mood_scores):
            y_updated.append(mood_scores[label_index])
    return X, np.array(y_updated)

# Apply the updated targets to the variable-length sequences
X_train_var, y_train_var_updated = update_targets_for_regression(X_train_var, y_train_var, data)
X_test_var, y_test_var_updated = update_targets_for_regression(X_test_var, y_test_var, data)


# Verify the updated target shapes and ensure they match the number of sequences
print(f"Updated y_train_var_updated shape: {y_train_var_updated.shape}")
print(f"Updated y_test_var_updated shape: {y_test_var_updated.shape}")

# Function to replace NaNs with zero in each sequence
def replace_nans(X):
    return [np.nan_to_num(np.array(seq, dtype=float), nan=0.0) for seq in X]

# Replace NaNs in X_train_var and X_test_var
X_train_var = replace_nans(X_train_var)
X_test_var = replace_nans(X_test_var)

# Verify the updated target shapes and unique classes
y_train_var_updated.shape, y_test_var_updated.shape, np.unique(y_train_var_updated), np.unique(y_test_var_updated)


Updated y_train_var_updated shape: (1014,)
Updated y_test_var_updated shape: (254,)


((1014,),
 (254,),
 array([3.        , 3.33333333, 3.5       , 3.75      , 3.8       ,
        4.33333333, 4.4       , 4.5       , 4.66666667, 4.75      ,
        4.8       , 5.        , 5.2       , 5.25      , 5.33333333,
        5.4       , 5.5       , 5.6       , 5.66666667, 5.75      ,
        5.8       , 6.        , 6.16666667, 6.2       , 6.25      ,
        6.33333333, 6.4       , 6.5       , 6.6       , 6.66666667,
        6.75      , 6.8       , 6.83333333, 7.        , 7.16666667,
        7.2       , 7.25      , 7.33333333, 7.4       , 7.5       ,
        7.6       , 7.66666667, 7.75      , 7.8       , 7.83333333,
        8.        , 8.16666667, 8.2       , 8.25      , 8.33333333,
        8.4       , 8.5       , 8.6       , 8.75      , 8.8       ,
        9.        , 9.33333333]),
 array([3.5       , 3.75      , 4.33333333, 4.4       , 4.5       ,
        5.        , 5.25      , 5.33333333, 5.4       , 5.5       ,
        5.66666667, 5.75      , 5.8       , 6.        , 6.2    

In [31]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Setting the number of LSTM units
num_lstm_units = 100

class MoodRNN(nn.Module):
    def __init__(self):
        super(MoodRNN, self).__init__()
        self.lstm = nn.LSTM(input_size=12, hidden_size=num_lstm_units, batch_first=True)
        self.regressor = nn.Linear(num_lstm_units, 1)  # Output layer for regression

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_time_step = lstm_out[:, -1, :]
        mood_prediction = self.regressor(last_time_step)
        return mood_prediction

# Function to create data loaders with padding for regression
def create_padded_loader(X, y, batch_size):
    X_padded = nn.utils.rnn.pad_sequence([torch.tensor(x, dtype=torch.float32) for x in X], batch_first=True)
    y_tensor = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create data loaders
batch_size = 32
train_loader = create_padded_loader(X_train_var, y_train_var_updated, batch_size)
test_loader = create_padded_loader(X_test_var, y_test_var_updated, batch_size)

# Setup device and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MoodRNN().to(device)
criterion_mse = nn.MSELoss()
criterion_mae = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion_mse(outputs, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Testing loop for regression
model.eval()
total_loss_mse = 0
total_loss_mae = 0
count = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch).squeeze()  
        loss_mse = criterion_mse(outputs, y_batch) 
        loss_mae = criterion_mae(outputs, y_batch)  
        total_loss_mse += loss_mse.item() * y_batch.size(0) 
        total_loss_mae += loss_mae.item() * y_batch.size(0) 
        count += y_batch.size(0)

    mean_loss_mse = total_loss_mse / count
    mean_loss_mae = total_loss_mae / count
    print(f'Mean Squared Error: {mean_loss_mse}')
    print(f'Root Mean Squared Error: {mean_loss_mse ** 0.5}')
    print(f'Mean Absolute Error: {mean_loss_mae}')


Epoch 1, Loss: 25.95891571044922
Epoch 2, Loss: 10.23878002166748
Epoch 3, Loss: 2.7501864433288574
Epoch 4, Loss: 0.6570164561271667
Epoch 5, Loss: 0.42089998722076416
Epoch 6, Loss: 0.9466493725776672
Epoch 7, Loss: 0.6528249979019165
Epoch 8, Loss: 1.11165452003479
Epoch 9, Loss: 0.5875440239906311
Epoch 10, Loss: 0.678849995136261
Mean Squared Error: 0.7788128411676002
Root Mean Squared Error: 0.8825037343646769
Mean Absolute Error: 0.6469647720104127
