In [1]:
import graphviz
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import tree
from fpdf import FPDF
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import requests
from tqdm import tqdm
import datetime
import pickle

# Endpoints of the local backend that serves the litter and weather records
trashUrl = "http://localhost:8080/api/TrashItems/dummy"
weerUrl = "http://localhost:8080/api/Weather/"

# Read every litter record straight from the API into a DataFrame
trash = pd.read_json(trashUrl)

# Parse the timestamp once, then derive one calendar column per component
trash['timestamp'] = pd.to_datetime(trash['timestamp'])
stamp_parts = trash['timestamp'].dt
for column, attribute in (
    ('datum', 'date'),
    ('hour', 'hour'),
    ('min', 'minute'),
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('weekday', 'weekday'),
):
    trash[column] = getattr(stamp_parts, attribute)

# Quick overview of what was loaded
print("Data shape:", trash.shape)
print("Columns:", trash.columns.tolist())
if 'litterType' in trash.columns:
    litter_overview = trash['litterType'].unique()
else:
    litter_overview = "Geen litterType kolom"
print("Afval categorieën:", litter_overview)
trash.head()

Data shape: (19359, 12)
Columns: ['id', 'litterType', 'latitude', 'longitude', 'timestamp', 'datum', 'hour', 'min', 'year', 'month', 'day', 'weekday']
Afval categorieën: ['Papier' 'Plastic' 'Organisch' 'Glas']


Unnamed: 0,id,litterType,latitude,longitude,timestamp,datum,hour,min,year,month,day,weekday
0,6601c257-2556-4ae0-ac12-00075d62effc,Papier,51.589,4.775,2025-04-18 05:34:41,2025-04-18,5,34,2025,4,18,4
1,5b232aac-b2c9-4658-bfc0-000c6301404a,Plastic,51.589,4.775,2025-05-12 22:45:50,2025-05-12,22,45,2025,5,12,0
2,0081fa5b-f045-46cc-87e6-000cfc341d4f,Organisch,51.589,4.775,2025-06-06 22:01:32,2025-06-06,22,1,2025,6,6,4
3,ba89ef15-057f-486a-b41e-000daa1463b4,Plastic,51.589,4.775,2025-05-05 18:17:40,2025-05-05,18,17,2025,5,5,0
4,a950faaf-f429-412b-9701-000e00632d17,Plastic,51.589,4.775,2025-04-25 07:41:36,2025-04-25,7,41,2025,4,25,4


In [2]:
def get_all_weather_data():
    """Fetch all weather records from the weather API as per-date columns.

    Sends a GET request to the module-level ``weerUrl``. On a 200 response the
    JSON payload is loaded into a DataFrame, a ``datum`` column (calendar date)
    is derived from the ``timestamp`` / ``Timestamp`` field, and the temperature
    and description fields are renamed to ``temperatuur`` and
    ``weersverwachting``.

    Returns:
        pandas.DataFrame with the columns ``datum``, ``temperatuur`` and
        ``weersverwachting``. An empty DataFrame is returned when the request
        fails, the status is not 200, no timestamp column is present, or the
        payload cannot be converted; the reason is printed in each case.
    """
    weather_url = f"{weerUrl}"

    print(f"Trying to get all weather data from URL: {weather_url}")

    # The payload may use camelCase or PascalCase field names; both are accepted.
    column_variants = (
        ('timestamp', {
            'temperature': 'temperatuur',
            'weatherDescription': 'weersverwachting'
        }),
        ('Timestamp', {
            'Temperature': 'temperatuur',
            'WeatherDescription': 'weersverwachting'
        }),
    )

    try:
        # Without a timeout an unresponsive API would block the notebook forever.
        response = requests.get(weather_url, timeout=30)
        print(f"Response status: {response.status_code}")

        if response.status_code != 200:
            print(f"API call failed with status {response.status_code}")
            print(f"Response text: {response.text}")
            return pd.DataFrame()

        data = response.json()
        print(f"Retrieved {len(data) if isinstance(data, list) else 1} weather records")

        # Convert to DataFrame
        weather_df = pd.DataFrame(data)

        # Derive the date and normalise the column names for the first variant found
        for timestamp_col, renames in column_variants:
            if timestamp_col in weather_df.columns:
                weather_df['datum'] = pd.to_datetime(weather_df[timestamp_col]).dt.date
                weather_df = weather_df.rename(columns=renames)
                break

        if 'datum' not in weather_df.columns:
            print("Geen datum kolom gevonden in weather data")
            return pd.DataFrame()

        # Select only the columns the rest of the notebook needs
        return weather_df[['datum', 'temperatuur', 'weersverwachting']]
    except Exception as e:
        # Deliberate best-effort: the caller falls back to dummy weather values
        # when an empty frame comes back, so report the problem and continue.
        print(f"Fout bij ophalen weerdata: {e}")

    return pd.DataFrame()

# Fetch the weather records and attach them to every litter record by date.
weather_df = get_all_weather_data()

# Drop weather columns left over from an earlier run of this cell; merging onto
# them would create temperatuur_x/_y columns and break the lookups below.
trash = trash.drop(columns=['temperatuur', 'weersverwachting'], errors='ignore')

if not weather_df.empty:
    print("Weather data sample:")
    print(weather_df.head())

    print("\nVoor merge:")
    print(f"Trash data shape: {trash.shape}")
    print(f"Weather data shape: {weather_df.shape}")

    # Ensure compatible date types on the join key
    trash['datum'] = pd.to_datetime(trash['datum']).dt.date if trash['datum'].dtype == 'object' else trash['datum']

    # Keep a single weather row per date: several rows for one date would
    # duplicate the matching litter rows in the left merge and inflate the
    # daily counts computed later.
    weather_per_day = weather_df.drop_duplicates(subset='datum', keep='first')

    # Left merge so litter records without weather data are kept (with NaN)
    trash = pd.merge(trash, weather_per_day, on='datum', how='left')

    print("\nNa merge:")
    print(f"Trash data shape: {trash.shape}")
    print(f"Missing temperatuur: {trash['temperatuur'].isna().sum()}")
    print(f"Missing weersverwachting: {trash['weersverwachting'].isna().sum()}")

    print("\nSample van merged data:")
    print(trash[['datum', 'temperatuur', 'weersverwachting']].head())
else:
    print("Geen weerdata ontvangen, gebruik dummy data")
    # No weather available: fall back to constant placeholder values
    trash['temperatuur'] = 15.0
    trash['weersverwachting'] = 'Bewolkt'

Trying to get all weather data from URL: http://localhost:8080/api/Weather/
Response status: 200
Retrieved 62 weather records
Weather data sample:
        datum  temperatuur weersverwachting
0  2025-06-10    16.100000  Lichte motregen
1  2025-06-09    15.700001  Lichte motregen
2  2025-06-08    14.850000            Regen
3  2025-06-07    14.700000  Lichte motregen
4  2025-06-06    17.099998            Regen

Voor merge:
Trash data shape: (19359, 12)
Weather data shape: (62, 3)

Na merge:
Trash data shape: (19359, 14)
Missing temperatuur: 0
Missing weersverwachting: 0

Sample van merged data:
        datum  temperatuur      weersverwachting
0  2025-04-18    13.100000  Gedeeltelijk bewolkt
1  2025-05-12    21.250000  Gedeeltelijk bewolkt
2  2025-06-06    17.099998                 Regen
3  2025-05-05    11.599999       Lichte motregen
4  2025-04-25    12.600000       Lichte motregen


In [3]:
# Build the per-day litter counts, one column per standardized category.
if 'litterType' in trash.columns:
    # Rows: dates, columns: raw litter types, values: number of items
    daily_waste_by_category = trash.groupby(['datum', 'litterType']).size().unstack(fill_value=0)

    # Map each raw label (Dutch or English spelling) to a standardized name
    category_mapping = {}
    for col in daily_waste_by_category.columns:
        col_lower = str(col).lower()  # str() keeps non-string labels from raising
        if 'plastic' in col_lower:
            category_mapping[col] = 'Plastic'
        elif 'paper' in col_lower or 'papier' in col_lower:
            category_mapping[col] = 'Papier'
        elif 'organic' in col_lower or 'organisch' in col_lower or 'bio' in col_lower:
            category_mapping[col] = 'Organisch'
        elif 'glass' in col_lower or 'glas' in col_lower:
            category_mapping[col] = 'Glas'
        else:
            category_mapping[col] = 'Overig'

    daily_waste_by_category = daily_waste_by_category.rename(columns=category_mapping)

    if len(set(category_mapping.values())) < len(category_mapping):
        # Several raw labels collapsed onto one name: sum the duplicate columns.
        # groupby(..., axis=1) is deprecated since pandas 2.1, so group the
        # transposed frame by its index and transpose back.
        daily_waste_by_category = daily_waste_by_category.T.groupby(level=0).sum().T

    # Keep exactly the four modelled categories, adding all-zero columns for
    # categories that never occur in the data.
    # NOTE(review): items mapped to 'Overig' are dropped here, so they are not
    # part of totaal_afval in this branch — confirm this is intended.
    expected_categories = ['Plastic', 'Papier', 'Organisch', 'Glas']
    daily_waste_by_category = daily_waste_by_category.reindex(
        columns=expected_categories, fill_value=0
    )
    daily_waste_by_category['totaal_afval'] = daily_waste_by_category.sum(axis=1)
else:
    # Without a litter type only the daily total is known; categories are zero
    daily_waste_by_category = trash.groupby('datum').size().to_frame('totaal_afval')
    daily_waste_by_category['Plastic'] = 0
    daily_waste_by_category['Papier'] = 0
    daily_waste_by_category['Organisch'] = 0
    daily_waste_by_category['Glas'] = 0

# Per-day aggregation rules: median position, first calendar value, mean hour
agg_dict = dict.fromkeys(['latitude', 'longitude'], 'median')
agg_dict.update(dict.fromkeys(['year', 'month', 'day', 'weekday'], 'first'))
agg_dict['hour'] = 'mean'

# Weather columns are aggregated only when the merge step provided them
for weather_col, how in (('temperatuur', 'mean'), ('weersverwachting', 'first')):
    if weather_col in trash.columns:
        agg_dict[weather_col] = how

# Collapse the item-level records into one feature row per date
daily_features = (
    trash
    .groupby('datum')
    .agg(agg_dict)
    .reset_index()
)

# Guarantee both weather columns exist and contain no gaps: a missing column is
# created with the fallback value, existing gaps are filled with it.
for weather_col, fallback in (('temperatuur', 15.0), ('weersverwachting', 'Bewolkt')):
    if weather_col not in daily_features.columns:
        daily_features[weather_col] = fallback
    elif daily_features[weather_col].isna().any():
        daily_features[weather_col] = daily_features[weather_col].fillna(fallback)

# Numeric code per weather description; descriptions in one group share a code
weather_mapping = {
    description: code
    for code, descriptions in enumerate([
        ['Zonnig'],
        ['Gedeeltelijk bewolkt'],
        ['Bewolkt'],
        ['Regenachtig', 'Regen', 'Lichte motregen'],
        ['Onweer'],
        ['Sneeuw'],
    ])
    for description in descriptions
}

# Descriptions missing from the mapping fall back to code 1
weather_codes = daily_features['weersverwachting'].map(weather_mapping)
daily_features['weersverwachting_num'] = weather_codes.fillna(1)

# Join the per-day features with the per-day litter counts
daily_data = daily_features.merge(daily_waste_by_category.reset_index(), on='datum')

# Derived calendar features: weekend flag (weekday 5 or 6) and season code
# (0 = winter, 1 = spring, 2 = summer, 3 = autumn)
daily_data['is_weekend'] = daily_data['weekday'].ge(5).astype(int)
season_by_month = {month: (month % 12) // 3 for month in range(1, 13)}
daily_data['seizoen'] = daily_data['month'].map(season_by_month)

# From here on `trash` refers to the daily dataset used for training
trash = daily_data.copy()

print("Daily data shape:", trash.shape)
trash.head()

Daily data shape: (62, 18)


Unnamed: 0,datum,latitude,longitude,year,month,day,weekday,hour,temperatuur,weersverwachting,weersverwachting_num,Plastic,Papier,Organisch,Glas,totaal_afval,is_weekend,seizoen
0,2025-04-10,51.589,4.775,2025,4,10,3,11.672489,11.75,Gedeeltelijk bewolkt,1.0,120,41,45,23,229,0,1
1,2025-04-11,51.589,4.775,2025,4,11,4,11.330317,12.150001,Gedeeltelijk bewolkt,1.0,218,99,93,32,442,0,1
2,2025-04-12,51.589,4.775,2025,4,12,5,11.578571,16.75,Gedeeltelijk bewolkt,1.0,127,64,59,30,280,1,1
3,2025-04-13,51.589,4.775,2025,4,13,6,11.626728,15.2,Regen,3.0,104,44,45,24,217,1,1
4,2025-04-14,51.589,4.775,2025,4,14,0,11.441406,14.2,Lichte motregen,3.0,135,47,50,24,256,0,1


In [15]:
# Candidate predictors for the Random Forest models
features = [
    'latitude', 'longitude', 'year', 'month', 'day', 'weekday',
    'temperatuur', 'weersverwachting_num', 'is_weekend', 'seizoen',
]

# Keep only the predictors that are actually present in the daily dataset
available_features = [name for name in features if name in trash.columns]
print("Beschikbare features:", available_features)

# One regressor per target: the daily total plus the four litter categories
targets = ['totaal_afval', 'Plastic', 'Papier', 'Organisch', 'Glas']
rf_models = {}
feature_frame = trash[available_features]

for target_name in targets:
    print(f"\n=== Training Random Forest model voor {target_name} ===")

    # 70/30 split; the fixed seed gives every target the same row partition
    X_train, X_test, y_train, y_test = train_test_split(
        feature_frame,
        trash[target_name],
        test_size=0.3,
        random_state=42,
    )

    # Shallow trees (max_depth=3), many estimators, fixed seed for repeatability
    rf = RandomForestRegressor(n_estimators=1000, max_depth=3, random_state=42)
    rf.fit(X_train, y_train)

    # Goodness of fit (R²) on both partitions to expose overfitting
    r2_train = r2_score(y_train, rf.predict(X_train))
    r2_test = r2_score(y_test, rf.predict(X_test))
    print(f"R² score on training set: {r2_train:.3f}")
    print(f"R² score on test set: {r2_test:.3f}")

    # Feature importances are not reported here; they remain available on the
    # stored model via rf.feature_importances_.

    # Keep the fitted model for the prediction helpers further down
    rf_models[target_name] = rf

print("\n=== Random Forest Models getraind ===")
print(f"Aantal modellen: {len(rf_models)}")

Beschikbare features: ['latitude', 'longitude', 'year', 'month', 'day', 'weekday', 'temperatuur', 'weersverwachting_num', 'is_weekend', 'seizoen']

=== Training Random Forest model voor totaal_afval ===
R² score on training set: 0.697
R² score on test set: 0.591

=== Training Random Forest model voor Plastic ===
R² score on training set: 0.678
R² score on test set: 0.552

=== Training Random Forest model voor Papier ===
R² score on training set: 0.677
R² score on test set: 0.268

=== Training Random Forest model voor Organisch ===
R² score on training set: 0.679
R² score on test set: 0.447

=== Training Random Forest model voor Glas ===
R² score on training set: 0.708
R² score on test set: 0.400

=== Random Forest Models getraind ===
Aantal modellen: 5


In [16]:
def predict_single_day_rf(date, latitude=51.589, longitude=4.775,
                         temperatuur=15.0, weersverwachting='Bewolkt'):
    """
    Predict the litter counts for one day with the trained Random Forest models.

    Parameters:
    - date: date/datetime object, or a string that pandas can parse
      (e.g. 'YYYY-MM-DD')
    - latitude: latitude coordinate (default: 51.589)
    - longitude: longitude coordinate (default: 4.775)
    - temperatuur: temperature in degrees Celsius (default: 15.0)
    - weersverwachting: weather description; descriptions missing from
      ``weather_mapping`` are encoded as 1 (default: 'Bewolkt')

    Returns:
    - Dictionary with one non-negative, rounded prediction per model in
      ``rf_models``, keyed by target name
    """
    # Strings are parsed; date-like objects are used as they are
    when = pd.to_datetime(date) if isinstance(date, str) else date
    day_of_week = when.weekday()

    # Single feature row; 'hour' is fixed at noon as a neutral default
    feature_row = {
        'latitude': latitude,
        'longitude': longitude,
        'year': when.year,
        'month': when.month,
        'day': when.day,
        'weekday': day_of_week,
        'hour': 12.0,
        'temperatuur': temperatuur,
        'weersverwachting_num': weather_mapping.get(weersverwachting, 1),
        # Saturday (5) and Sunday (6) count as weekend
        'is_weekend': int(day_of_week >= 5),
        # Dec-Feb -> 0 (winter), Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3
        'seizoen': (when.month % 12) // 3,
    }
    input_data = pd.DataFrame([feature_row])

    # Only the columns the models were trained on, in training order
    model_input = input_data[available_features]

    # Counts cannot be negative, so clamp each rounded prediction at zero
    return {
        target_name: max(0, round(model.predict(model_input)[0]))
        for target_name, model in rf_models.items()
    }

# Example 1: forecast tomorrow's litter for a warm, sunny day
tomorrow = datetime.date.today() + datetime.timedelta(days=1)
rf_prediction = predict_single_day_rf(date=tomorrow, temperatuur=25.0, weersverwachting='Zonnig')

print(f"Random Forest afval voorspelling voor {tomorrow}:")
for category in rf_prediction:
    print(f"  {category}: {rf_prediction[category]} items")

# Example 2: forecast for a fixed date given as a string
specific_date = "2025-06-13"
summer_rf_prediction = predict_single_day_rf(date=specific_date, temperatuur=20.0, weersverwachting='Zonnig')

print(f"\nRandom Forest afval voorspelling voor {specific_date}:")
for category in summer_rf_prediction:
    print(f"  {category}: {summer_rf_prediction[category]} items")

Random Forest afval voorspelling voor 2025-06-12:
  totaal_afval: 393 items
  Plastic: 198 items
  Papier: 78 items
  Organisch: 72 items
  Glas: 40 items

Random Forest afval voorspelling voor 2025-06-13:
  totaal_afval: 382 items
  Plastic: 189 items
  Papier: 77 items
  Organisch: 71 items
  Glas: 39 items


In [None]:
def predict_week_ahead_rf(days=7, temperatuur=16.0, weersverwachting='Bewolkt'):
    """Predict the litter counts for the coming days with the Random Forest models.

    Calls ``predict_single_day_rf`` once per day, starting tomorrow, using the
    same assumed weather for every day.

    Parameters:
    - days: number of days to predict, starting tomorrow (default: 7)
    - temperatuur: assumed temperature in degrees Celsius (default: 16.0)
    - weersverwachting: assumed weather description (default: 'Bewolkt')

    Returns:
    - DataFrame with one row per day: 'datum', 'weekdag' (weekday name as
      formatted by ``strftime('%A')``) and one column per predicted target
    """
    today = datetime.date.today()
    results = []

    for offset in range(1, days + 1):
        future_date = today + datetime.timedelta(days=offset)
        prediction = predict_single_day_rf(
            date=future_date,
            temperatuur=temperatuur,
            weersverwachting=weersverwachting
        )

        # Flatten the per-target predictions into the row for this day
        results.append({
            'datum': future_date,
            'weekdag': future_date.strftime('%A'),
            **prediction
        })

    return pd.DataFrame(results)

# Forecast the coming week with the default weather assumptions and show it
week_rf_predictions = predict_week_ahead_rf()
print(
    "\n=== Random Forest voorspellingen voor komende week ===",
    week_rf_predictions,
    sep="\n",
)

# Save Random Forest models for later use
def save_rf_models(path='rf_waste_prediction_models.pkl'):
    """Pickle the trained Random Forest models and their metadata to disk.

    The file holds a dictionary with the keys 'models' (``rf_models``),
    'features' (``available_features``) and 'weather_mapping'
    (``weather_mapping``), i.e. what ``load_rf_models`` reads back.

    Parameters:
    - path: destination file (default: 'rf_waste_prediction_models.pkl')
    """
    bundle = {
        'models': rf_models,
        'features': available_features,
        'weather_mapping': weather_mapping
    }
    with open(path, 'wb') as f:
        pickle.dump(bundle, f)
    print(f"Random Forest models saved to {path}")

def load_rf_models(path='rf_waste_prediction_models.pkl'):
    """Load the pickled Random Forest models and their metadata from disk.

    Parameters:
    - path: file to read (default: 'rf_waste_prediction_models.pkl')

    Returns:
    - Tuple ``(models, features, weather_mapping)`` taken from the 'models',
      'features' and 'weather_mapping' keys of the stored dictionary

    Raises:
    - FileNotFoundError when the file does not exist
    - KeyError when the stored dictionary lacks one of the expected keys
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that were
    # written by save_rf_models from a trusted location.
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data['models'], data['features'], data['weather_mapping']

# Persist the models currently held in rf_models (together with the feature
# list and the weather mapping) to rf_waste_prediction_models.pkl
save_rf_models()

In [5]:
# def plot_tree_classification(model, features, class_names, output_file='random_forest'):  
#     if isinstance(model, RandomForestClassifier):
#         pdf = FPDF()

#         for i, tree_model in enumerate(model.estimators_):
#             dot_data = tree.export_graphviz(tree_model, out_file=None, 
#                 feature_names=features,  
#                 class_names=class_names,  
#                 filled=True, rounded=True,  
#                 special_characters=True)  

#             # Turn into graph using graphviz
#             graph = graphviz.Source(dot_data)  

#             # Save as PNG for embedding in PDF
#             image_file = f"{output_file}_tree_{i+1}.png"
#             graph.render(filename=image_file, format='png')

#             # Add each tree image to PDF
#             pdf.add_page()
#             pdf.image(image_file + '.png', x=10, y=10, w=180)

#         # Save the complete PDF
#         pdf_output_file = f"{output_file}.pdf"
#         pdf.output(pdf_output_file)

#         print(f"All trees saved in {pdf_output_file}.")

#     else:
#         raise ValueError("The model is not a RandomForestClassifier.")                                
    
#     return graph

# feature_names = X.columns
# class_names = np.sort(np.unique(y)).astype(str)
# plot_tree_classification(rf, feature_names, class_names)