In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Load the engineered dataset and order the rows by subject id, then by day.
data = pd.read_csv('../data/preprocessed/engineered_data.csv').sort_values(by=['id', 'day'])

# Disabled step: discretise mood to integer values in the range 1-10.
# data['mood'] = data['mood'].round().astype(int)
# data['mood'] = data['mood'].clip(lower=1, upper=10)

# Every row is grouped by its subject id so that a given id ends up entirely
# in either the train set or the test set, never in both.
groups = data['id']

# One shuffled split with test_size=0.2 applied at group level; seeded for
# reproducibility.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Only a single split is generated, so take it directly from the generator.
# The indices are positional, hence .iloc.
train_idx, test_idx = next(gss.split(data, groups=groups))
train_set, test_set = data.iloc[train_idx], data.iloc[test_idx]

# Persist both partitions next to the engineered data.
train_set.to_csv('../data/preprocessed/train_set.csv', index=False)
test_set.to_csv('../data/preprocessed/test_set.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

filepath = '../data/preprocessed/new_engineered_data.csv'
data = pd.read_csv(filepath)

# Parse 'day' so that date comparisons operate on datetimes, not strings.
data['day'] = pd.to_datetime(data['day'])

# Split on whole dates: every row of a given day lands in exactly one partition.
# NOTE(review): the dates are shuffled randomly, not split chronologically, so
# test days may precede training days. If temporal leakage is the concern,
# confirm that a random date split is what is intended here.
unique_dates = data['day'].unique()
train_dates, test_dates = train_test_split(unique_dates, test_size=0.2, random_state=42)
train_data = data[data['day'].isin(train_dates)]
test_data = data[data['day'].isin(test_dates)]

# Separate features and target; every column except 'mood' is a feature.
feature_columns = [col for col in train_data.columns if col != 'mood']

# Take explicit copies: these frames are assigned into below, and writing to a
# selection of a filtered frame triggers SettingWithCopyWarning in pandas.
train_features = train_data[feature_columns].copy()
test_features = test_data[feature_columns].copy()

train_target = train_data['mood']
test_target = test_data['mood']

# Standardise all feature columns except the identifier/time columns
# 'id', 'day' and 'time'.
columns_to_normalize = [col for col in train_features.columns if col not in ['id', 'day', 'time']]
scaler = StandardScaler()
# Fit on the training partition only, then reuse the fitted scaler on the test
# partition so that no test statistics influence the scaling.
train_features[columns_to_normalize] = scaler.fit_transform(train_features[columns_to_normalize])
test_features[columns_to_normalize] = scaler.transform(test_features[columns_to_normalize])

# Re-attach the (unscaled) target to the scaled features. Indices are reset on
# both sides so the column-wise concat aligns rows by position.
train_final = pd.concat([train_features.reset_index(drop=True), train_target.reset_index(drop=True)], axis=1)
test_final = pd.concat([test_features.reset_index(drop=True), test_target.reset_index(drop=True)], axis=1)

# Written to the current working directory.
train_final.to_csv('train_final.csv', index=False)
test_final.to_csv('test_final.csv', index=False)
