In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Calls kagglehub.login() before any download is attempted
# (presumably prompts for Kaggle credentials — confirm against the kagglehub docs).
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# NOTE: the cells below build every CSV/JSON path from ftwue_path, so this
# cell has to run first on a fresh kernel.

# Fetch the 'ftwue' competition data; the returned value is used as a path prefix.
ftwue_path = kagglehub.competition_download('ftwue')

print('Data source import complete.')


# Foot Traffic in Würzburg

### Short Kaggle Info

In [None]:
# This environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output when a version is created with "Save & Run All".
# Temporary files can also go to /kaggle/temp/, but they are not kept beyond the current session.
# Input data files are available in the read-only "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints every file found under the input directory.
import os

# Lazily yield the full path of each file, walking the tree top-down
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

## Exploratory Data Analysis (EDA)

In [None]:
# Libraries

import numpy as np
import pandas as pd

In [None]:
# Load the sample submission file from the downloaded competition data and display it.
sample_submission = pd.read_csv(f'{ftwue_path}/sample_submission.csv')
sample_submission

In [None]:
# Load the training set and preview its first two rows.
train_df = pd.read_csv(f'{ftwue_path}/train.csv')
train_df.head(2)

In [None]:
# View initial columns of the training set (summary printed by DataFrame.info)
train_df.info()

In [None]:
# Load the competition test set and preview its first two rows.
test_df = pd.read_csv(f'{ftwue_path}/test.csv')
test_df.head(2)

In [None]:
# View initial columns of the test set (summary printed by DataFrame.info)
test_df.info()

In [None]:
# Print the raw contents of the counter geo-location JSON file.
# The context manager closes the file handle as soon as the read is done;
# previously the handle was left open because the close() call was commented out.
with open(f'{ftwue_path}/counterGeoLocations.json', 'r') as json_locations:
    print(json_locations.read())

In [None]:
## Setting the features and target variables

In [None]:
# Target column: n_pedestrians
y = train_df["n_pedestrians"]
# Features: every other column of the training frame
# (the redundant chained assignment `X = X = ...` was reduced to a single binding)
X = train_df.drop(columns=["n_pedestrians"])

print(X.shape)  # (n_rows, n_feature_columns); originally annotated as (82821, 12) — not verified here
print(y.shape)  # (n_rows,) -> y target variable (n_pedestrians)

In [None]:
# Feature engineering: month, year and season derived from the date column

# Convert the date column to datetimes so the .dt accessor can be used
X['date'] = pd.to_datetime(X['date'])

# Calendar parts taken from the parsed date
X['month'] = X['date'].dt.month
X['year'] = X['date'].dt.year

# Month -> season lookup; any value not listed here falls back to 'autumn'
season_by_month = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
}
X['season'] = X['month'].map(lambda month: season_by_month.get(month, 'autumn'))

In [None]:
# Create one-hot encodings
# The listed columns are passed to pd.get_dummies and X is rebound to the result.
# NOTE(review): re-running this cell alone will likely fail, since the encoded
# source columns are no longer present in X — re-run from the X/y cell instead.

X = pd.get_dummies(X, columns=["year","day","month","streetname","season"])

In [None]:
# View columns of X after the one-hot encoding step
X.columns

**Ensuring data coverage by street**

In [None]:
# Build one chronologically ordered time series per street.
# Work on a copy so the training frame itself is not modified.
data = train_df.copy()

# Parse 'date' into real datetimes. pd.read_csv was called without date parsing,
# so the column is still text here; the plotting cell that follows sets date
# locators/formatters on the x-axis, which only give meaningful ticks for
# datetime values. Sorting below also becomes chronological instead of lexicographic.
data['date'] = pd.to_datetime(data['date'])

# Maps streetname -> DataFrame holding date, both directional counts and their sum
street_time_series_dict = {}

for streetname, group in data.groupby('streetname'):
    # Chronological order within the street
    group = group.sort_values(by='date')

    # Keep the date and the two directional counts
    time_series_df = group[['date', 'n_pedestrians_towards', 'n_pedestrians_away']].copy()

    # Combined count of both directions
    time_series_df['total_pedestrians'] = (
        time_series_df['n_pedestrians_towards'] + time_series_df['n_pedestrians_away']
    )

    street_time_series_dict[streetname] = time_series_df

# Preview the first rows of every street's series
for key, time_series in street_time_series_dict.items():
    print(f"Time series for {key}:\n", time_series.head())

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# One figure per street: total pedestrian count over time
for streetname, time_series in street_time_series_dict.items():
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.plot(
        time_series['date'],
        time_series['total_pedestrians'],
        label=f'{streetname} - Total',
        color='green',
    )

    # Title and axis labels so each figure stands on its own
    ax.set_title(f'Total Pedestrians for {streetname}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Total Number of Pedestrians')

    # Major ticks on the first day of every third month, labelled as Year-Month
    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonthday=1, interval=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Tilt the tick labels so neighbouring dates stay readable
    plt.xticks(rotation=45)

    ax.legend()

    # Tighten the layout to prevent overlap, then render the figure
    fig.tight_layout()
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold-out split plus standardisation of the continuous features.
# The sklearn imports appeared several times in this cell and pandas/numpy were
# re-imported; one copy of each sklearn import is kept, and pd comes from the
# import cell at the top of the notebook.

# Fixed seed so the shuffled split is identical on every Restart & Run All
SPLIT_SEED = 42

# Columns that get standardised; every other column is passed through unchanged.
# NOTE(review): the two directional counts look like components of the target
# n_pedestrians — confirm they are legitimate features and not target leakage.
continuous_cols = ['n_pedestrians_towards', 'n_pedestrians_away', 'temperature']

# Keep the dates aside: they are excluded from the features but re-attached below
date_column = X['date']

# NOTE(review): rows are shuffled at random although they are dated observations —
# confirm that a chronological split is not required for this task.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=['date']),
    y,
    train_size=0.75,
    shuffle=True,
    random_state=SPLIT_SEED,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train[continuous_cols])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[continuous_cols] = scaler.transform(X_train[continuous_cols])
X_test_scaled[continuous_cols] = scaler.transform(X_test[continuous_cols])

# Give the scaled features and the targets a fresh 0..n-1 index so the
# column-wise concat below lines up row by row. X_train / X_test keep their
# original index on purpose: it is used to look up the matching dates.
X_train_scaled = X_train_scaled.reset_index(drop=True)
X_test_scaled = X_test_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Re-attach date + scaled features + target (the target itself is NOT scaled)
train_data = pd.concat(
    [date_column.loc[X_train.index].reset_index(drop=True), X_train_scaled, y_train],
    axis=1,
)
test_data = pd.concat(
    [date_column.loc[X_test.index].reset_index(drop=True), X_test_scaled, y_test],
    axis=1,
)

# Print the info of the train_data
train_data.info()

### Aggregation by month and year for seasonal analysis

In [None]:
# Monthly totals of the target for the train and hold-out splits.

# Bucket every row by the first day of its calendar month
train_data['year_month'] = train_data['date'].dt.to_period('M').dt.to_timestamp()
test_data['year_month'] = test_data['date'].dt.to_period('M').dt.to_timestamp()

# Sum the target per month. The target column in train_data / test_data is
# 'n_pedestrians' (the name of y); no 'demand' column is ever created, so
# aggregating {'demand': 'sum'} fails with a KeyError. Named aggregation reads
# 'n_pedestrians' and publishes the monthly total under the name 'demand', so
# any later cell that expects a 'demand' column keeps working.
# NOTE: this rebinds train_df / test_df, which previously held the raw CSV
# frames — re-run the loading cells before re-running anything that needs them.
train_df = (
    train_data.groupby('year_month')
    .agg(demand=('n_pedestrians', 'sum'))
    .reset_index()
)
test_df = (
    test_data.groupby('year_month')
    .agg(demand=('n_pedestrians', 'sum'))
    .reset_index()
)

# Number of year-month rows in the aggregated training data (one row per month)
print(train_df['year_month'].count())


## Run some sample models

### RNN

In [None]:
# Show the columns of test_df, which the aggregation cell above rebound to the monthly totals
print(test_df.columns)
