In [None]:
import pandas as pd

def clean_bin_data(
    raw_df,
    bin_col="Bin ID",
    time_col="timestamp",
    level_col="Fullness",
    reason_col="reason",
    skip_reason="NOT_READY",
):
    """Return a cleaned, feature-enriched copy of the raw bin readings.

    Steps, in order:
      1. Parse ``time_col`` as day-first dates; unparseable values become NaT.
      2. Drop rows whose timestamp is missing or could not be parsed.
      3. Sort by ``bin_col`` then ``time_col``.
      4. Drop fully duplicated rows.
      5. Drop rows whose ``reason_col`` equals ``skip_reason``.
      6. Add ``year``, ``month``, ``day``, ``day_of_week`` (Monday == 0) and
         ``fullness_change`` (per-bin difference of ``level_col`` between
         consecutive remaining rows; 0 for the first row of each bin).

    Fullness values themselves are deliberately left unscaled.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Raw readings. Must contain ``bin_col``, ``time_col``, ``level_col``
        and ``reason_col``. It is not modified.
    bin_col, time_col, level_col, reason_col : str
        Column names; the defaults match the layout this notebook loads.
    skip_reason : str
        Value of ``reason_col`` marking rows to exclude.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows (original index labels kept)
        with the five derived columns appended.

    Raises
    ------
    KeyError
        If any of the required columns is absent from ``raw_df``.
    """
    required = [bin_col, time_col, level_col, reason_col]
    missing = [col for col in required if col not in raw_df.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")

    # Work on a copy so the caller's raw frame stays available and unchanged.
    cleaned = raw_df.copy()

    # errors='coerce' turns unparseable timestamps into NaT instead of
    # raising; those rows are removed by the dropna that follows.
    cleaned[time_col] = pd.to_datetime(
        cleaned[time_col], dayfirst=True, errors="coerce"
    )
    cleaned = cleaned.dropna(subset=[time_col])

    # Order readings chronologically within each bin; diff() below relies on it.
    cleaned = cleaned.sort_values(by=[bin_col, time_col])
    cleaned = cleaned.drop_duplicates()

    # The filter runs before diff(), so fullness_change is measured between
    # consecutive *kept* readings of a bin.
    cleaned = cleaned[cleaned[reason_col] != skip_reason]

    stamps = cleaned[time_col].dt
    return cleaned.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        day_of_week=stamps.dayofweek,
        # Trend indicator only; the first reading of each bin has no
        # predecessor, so its NaN difference is replaced with 0.
        fullness_change=cleaned.groupby(bin_col)[level_col].diff().fillna(0),
    )


# In a notebook kernel __name__ is "__main__", so this section runs exactly as
# a plain cell would; the guard only keeps the file I/O from firing when the
# function above is imported from elsewhere (e.g. by a test).
if __name__ == "__main__":
    # Load dataset
    file_path = "data.csv"  # Replace with the actual file path
    raw_df = pd.read_csv(file_path)

    df = clean_bin_data(raw_df)

    # Cleaning drops rows silently (bad timestamps, duplicates, NOT_READY),
    # so surface how much data survived.
    print(f"Rows kept after cleaning: {len(df)} of {len(raw_df)}")

    # Save the cleaned dataset
    cleaned_file_path = "cleaned_bin_data_no_scaling.csv"
    df.to_csv(cleaned_file_path, index=False)

    # Display success message
    print(f"Cleaned dataset saved as: {cleaned_file_path}")

    # Display the first few rows
    print(df.head())


Cleaned dataset saved as: cleaned_bin_data_no_scaling.csv
       Bin ID  Fullness  fullnessThreshold  timestamp    reason  year  month  \
64    1510830         6                  6 2018-06-26  FULLNESS  2018      6   
160   1510830         8                  6 2018-06-29  FULLNESS  2018      6   
192   1510830         6                  6 2018-06-30  FULLNESS  2018      6   
424   1510830         6                  6 2018-07-06  FULLNESS  2018      7   
1336  1510830         6                  6 2018-07-11  FULLNESS  2018      7   

      day  day_of_week  fullness_change  
64     26            1              0.0  
160    29            4              2.0  
192    30            5             -2.0  
424     6            4              0.0  
1336   11            2              0.0  


: 