In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the engineered feature table produced by the preprocessing step.
filepath = '../data/preprocessed/engineered_data.csv'
data_cleaned = pd.read_csv(filepath)

# Parse 'day' as datetime and order rows chronologically, so the positional
# split below puts the earliest rows in the training set.
data_cleaned['day'] = pd.to_datetime(data_cleaned['day'])
data_cleaned.sort_values(by='day', inplace=True)

# Row position of the 80:20 boundary. NOTE: this is a row index, not a
# calendar date, so rows that share the boundary date may end up on both
# sides of the split.
cutoff = int(len(data_cleaned) * 0.8)

# Split into training and testing sets, then order each by participant and
# date. The non-inplace sort_values returns a new DataFrame, so neither subset
# is modified in place while still being an iloc slice of data_cleaned (the
# pattern that raises SettingWithCopyWarning on pre-copy-on-write pandas).
train_data = data_cleaned.iloc[:cutoff].sort_values(by=['id', 'day'])
test_data = data_cleaned.iloc[cutoff:].sort_values(by=['id', 'day'])

# Persist both subsets without the DataFrame index column.
train_data.to_csv('../data/preprocessed/train_classification.csv', index=False)
test_data.to_csv('../data/preprocessed/test_classification.csv', index=False)

# Scatter mood against date, one colour per subset, to eyeball where the
# train/test boundary falls.
plt.figure(figsize=(14, 6))
for subset, legend_name, colour in (
    (train_data, 'Train Data', 'blue'),
    (test_data, 'Test Data', 'red'),
):
    plt.scatter(subset['day'], subset['mood'], label=legend_name, color=colour, alpha=0.5)

# Title and axis labels.
plt.title('Train-Test Split', fontsize=20)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Mood Score', fontsize=18)

# Tick-label sizes, legend, and render.
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=16)
plt.show()