In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

FILE_PATH = "../data/raw/restructured_data.xlsx"
OUTPUT_PATH = "../output/training_data.npy"
SEQ_LEN = 30

FEATURE_COLS = [
    'total_electricity_kwh', 'total_water_m3', 'total_natural_gas_kwh',
    'occupied_rooms', 'total_guests', 'occupancy_rate',
    'arrival_pax', 'departure_pax',
    'feels_like_avg', 'temp_avg'
]

def create_cyclical_features(df, col_name, max_val):
    df[col_name + '_sin'] = np.sin(2 * np.pi * df[col_name] / max_val)
    df[col_name + '_cos'] = np.cos(2 * np.pi * df[col_name] / max_val)
    return df

def preprocess():
    if not os.path.exists(FILE_PATH):
        print(f"FILE PATH NOT FOUND")
        return

    all_sheets = pd.read_excel(FILE_PATH, sheet_name=None)
    
    all_sequences = []
    
    for sheet_name, df in all_sheets.items():
        print(f"processing {sheet_name} | shape: {df.shape}")
   
        #Missing occupancy data, excluded from training
        if sheet_name == "Otel4":
            continue
            
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True)
   
        df[FEATURE_COLS] = df[FEATURE_COLS].interpolate(method='linear').fillna(method='bfill')

        df['day_of_week'] = df['date'].dt.dayofweek
        df['month'] = df['date'].dt.month
        
        df = create_cyclical_features(df, 'day_of_week', 6)
        df = create_cyclical_features(df, 'month', 12)

        selected_cols = FEATURE_COLS + ['day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos']
        data_values = df[selected_cols].values
        
        scaler = MinMaxScaler()
        data_scaled = scaler.fit_transform(data_values)
  
        for i in range(len(data_scaled) - SEQ_LEN):
            all_sequences.append(data_scaled[i : i + SEQ_LEN])

    final_data = np.array(all_sequences)
    
    np.random.shuffle(final_data)
    
    if not os.path.exists("../output"):
        os.makedirs("../output")
    
    np.save(OUTPUT_PATH, final_data)
    
    print("\n---COMPLETED---\n")
    print(f"Training Data Shape: {final_data.shape}")
    print(f"Samples (Number of Examples): {final_data.shape[0]}")
    print(f"Time Steps (Window Size): {final_data.shape[1]}")
    print(f"Features (Number of Features): {final_data.shape[2]}")
    print(f"Saved to: {OUTPUT_PATH}")

if __name__ == "__main__":
    preprocess()

processing Otel1 | shape: (835, 14)
processing Otel2 | shape: (689, 14)
processing Otel3 | shape: (882, 14)
processing Otel4 | shape: (836, 14)

---COMPLETED---

Training Data Shape: (2316, 30, 14)
Samples (Number of Examples): 2316
Time Steps (Window Size): 30
Features (Number of Features): 14
Saved to: ../output/training_data.npy


  df[FEATURE_COLS] = df[FEATURE_COLS].interpolate(method='linear').fillna(method='bfill')
  df[FEATURE_COLS] = df[FEATURE_COLS].interpolate(method='linear').fillna(method='bfill')
  df[FEATURE_COLS] = df[FEATURE_COLS].interpolate(method='linear').fillna(method='bfill')
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
