# Data Exploration

In this notebook describe your data exploration steps.

## Install dependencies

In [1]:
%pip install pandas
%pip install plotly
%pip install nbformat

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Load data

In [22]:
# import the modules
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# connect to the database
con = sqlite3.connect("../data/data.sqlite")
 
# The following manipulations are done in SQL
# All columns from the table pedestrians are beeing selected and some renamed, but from the column 'time of measurement' only the first 10 characters are selected
# Additionally a column with row numbers is added to the table

# As the temperature and rain data are stored in two different tables, the data is combined in two subqueries
# The average of the temperature and rain data is calculated and a column with row numbers is added to the tables

# The three subqueries are joined together and the columns 'time', 'weekday', 'pedestrians', 'rain' and 'temperature' are selected

df = pd.read_sql_query('''

WITH PedestrianData AS (
    SELECT
        SUBSTR(p.[time of measurement], 1, 10) AS day,
        p.weekday AS weekday,
        p.[pedestrians count] AS pedestrians,
        ROW_NUMBER() OVER (ORDER BY p.[time of measurement]) AS row_num
    FROM pedestrians p
),
RainData AS (
    SELECT
        (r1.[Niederschlag (6 bis 6 UTC)] + r2.[Niederschlag (6 bis 6 UTC)]) / 2 AS rain,
        ROW_NUMBER() OVER (ORDER BY r1.[Niederschlag (6 bis 6 UTC)]) AS row_num
    FROM rainmoe r1, rainnue r2
),
TemperatureData AS (
    SELECT
        (t1.[Mittelwert] + t2.[Mittelwert]) / 2 AS temperature,
        ROW_NUMBER() OVER (ORDER BY t1.[Mittelwert]) AS row_num
    FROM tempmoe t1, tempnue t2
)

SELECT
    pd.day,
    pd.weekday,
    pd.pedestrians,
    rd.rain,
    td.temperature
FROM
    PedestrianData pd
JOIN
    RainData rd ON pd.row_num = rd.row_num
JOIN
    TemperatureData td ON pd.row_num = td.row_num;

''', con)

### Look at the first rows

In [24]:
df.head(20)

Unnamed: 0,day,weekday,pedestrians,rain,temperature
0,2024-01-01,Monday,9432,1.45,0.3
1,2024-01-02,Tuesday,8959,7.2,0.9
2,2024-01-03,Wednesday,10900,5.8,1.9
3,2024-01-04,Thursday,13322,0.45,0.9
4,2024-01-05,Friday,16804,0.0,-0.2
5,2024-01-06,Saturday,8917,0.0,-1.05
6,2024-01-07,Sunday,6496,0.0,-2.45
7,2024-01-08,Monday,11646,0.0,-4.5
8,2024-01-09,Tuesday,12452,0.0,-4.4
9,2024-01-10,Wednesday,11693,0.0,-4.3


### Data exploration
Print some basic information about the data. Your data exploration would continue here.

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          134 non-null    object 
 1   weekday      134 non-null    object 
 2   pedestrians  134 non-null    int64  
 3   rain         134 non-null    float64
 4   temperature  134 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.4+ KB


In [26]:
# print min and max values of the columns

print("Min Pedestrians: " + str(df['pedestrians'].min()))
print("Max Pedestrians: " + str(df['pedestrians'].max()))
print("Min Rain: " + str(df['rain'].min()))
print("Max Rain: " + str(df['rain'].max()))
print("Min Temperature: " + str(df['temperature'].min()))
print("Max Temperature: " + str(df['temperature'].max()))

Min Pedestrians: 5469
Max Pedestrians: 29545
Min Rain: 0.0
Max Rain: 7.55
Min Temperature: -5.15
Max Temperature: 7.6


### Print some figures

In [88]:
fig = px.scatter(
    df,
    x="temperature",
    y="rain",
    size="pedestrians",
    color="pedestrians",
    hover_data=["day"],
    labels={'day':'Date', 'pedestrians': 'Number of Pedestrians', 'temperature': 'Temperature in °C', 'rain': 'Rain in mm'},
    title="Rainfall and temperature in relation to the number of pedestrians."
)
fig.show()

fig = px.bar(
    df,
    x="day",
    y="pedestrians",
    color="rain",
    barmode="group",
    labels={'day':'Date', 'pedestrians': 'Number of Pedestrians', 'rain': 'Rain in mm'},
    title="Number of pedestrians per day colored with rainfall in mm."
)
fig.show()

fig = px.bar(
    df,
    x="day",
    y="pedestrians",
    color="temperature",
    barmode="group",
    labels={'day':'Date', 'pedestrians': 'Number of Pedestrians', 'temperature': 'Temperature in °C'},
    title="Number of pedestrians per day colored with temperature in °C."
)
fig.show()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (4189318993.py, line 7)

In [33]:
help(go.Bar)

Help on class Bar in module plotly.graph_objs._bar:

class Bar(plotly.basedatatypes.BaseTraceType)
 |  Bar(arg=None, alignmentgroup=None, base=None, basesrc=None, cliponaxis=None, constraintext=None, customdata=None, customdatasrc=None, dx=None, dy=None, error_x=None, error_y=None, hoverinfo=None, hoverinfosrc=None, hoverlabel=None, hovertemplate=None, hovertemplatesrc=None, hovertext=None, hovertextsrc=None, ids=None, idssrc=None, insidetextanchor=None, insidetextfont=None, legend=None, legendgroup=None, legendgrouptitle=None, legendrank=None, legendwidth=None, marker=None, meta=None, metasrc=None, name=None, offset=None, offsetgroup=None, offsetsrc=None, opacity=None, orientation=None, outsidetextfont=None, selected=None, selectedpoints=None, showlegend=None, stream=None, text=None, textangle=None, textfont=None, textposition=None, textpositionsrc=None, textsrc=None, texttemplate=None, texttemplatesrc=None, uid=None, uirevision=None, unselected=None, visible=None, width=None, widthsr

In [64]:
fig = go.Figure()

# Add surface trace. Use the dataset "df" to plot the surface
fig.add_trace(
    go.Surface(
        z=df.values,
        colorscale="Viridis"
    )
)

# Add dropdown
# fig.update_layout(
#     updatemenus=[
#         dict(
#             buttons=list([
#                 dict(
#                     args=["type", "surface"],
#                     label="3D Surface",
#                     method="restyle"
#                 ),
#                 dict(
#                     args=["type", "heatmap"],
#                     label="Heatmap",
#                     method="restyle"
#                 )
#             ]),
#         ),
#     ]
# )

fig.show()
