Clean Code

In [None]:
import knime.scripting.io as knio
import pandas as pd

# Materialise the first KNIME input port as a pandas DataFrame
input_table = knio.input_tables[0].to_pandas()

# Station-name columns: substitute the placeholder 'Unknown' for missing
# values. A column that is absent from the table is simply skipped.
for name_col in ('start_station_name', 'end_station_name'):
    if name_col in input_table.columns:
        input_table[name_col] = input_table[name_col].fillna('Unknown')

# Coordinate columns: discard every row that lacks at least one of them.
# The filter only runs when all four columns exist. Note that only missing
# values are removed here; no range validation of the coordinates is done.
coord_cols = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
if set(coord_cols).issubset(input_table.columns):
    input_table = input_table.dropna(subset=coord_cols)

# Hand the cleaned frame back to KNIME through the first output port
knio.output_tables[0] = knio.Table.from_pandas(input_table)


Remove Outliers

In [None]:
import knime.scripting.io as knio
import pandas as pd

# Materialise the first KNIME input port as a pandas DataFrame
input_table = knio.input_tables[0].to_pandas()

# Keep only trips whose duration lies between 1 minute and 24 hours
# (1440 minutes), both bounds included. Rows with a missing duration fail
# the comparison and are dropped as well. Skipped when the column is absent.
if 'trip_duration_minutes' in input_table.columns:
    duration = input_table['trip_duration_minutes']
    input_table = input_table[duration.between(1, 1440)]

# Hand the filtered frame back to KNIME through the first output port
knio.output_tables[0] = knio.Table.from_pandas(input_table)


Time of the Day

In [None]:
import knime.scripting.io as knio
import pandas as pd

# Fetch the first KNIME input port and materialise it as a pandas DataFrame
arrow_input = knio.input_tables[0]
input_table = arrow_input.to_pandas()

# Add a time of day column
def get_time_of_day(timestamp):
    """Classify a timestamp into a coarse part-of-day label.

    Parameters
    ----------
    timestamp : datetime-like or missing value
        Any object exposing an ``hour`` attribute in the range 0-23, e.g.
        ``datetime.datetime`` or ``pandas.Timestamp``. Missing values
        (``None``, ``NaN``, ``pandas.NaT``) are accepted.

    Returns
    -------
    str or None
        ``'Morning'`` for hours 6-11, ``'Afternoon'`` for hours 12-17,
        ``'Evening'`` for hours 18-23, ``'Night'`` for hours 0-5, and
        ``None`` when the timestamp is missing.
    """
    # NaT.hour is NaN and every comparison against NaN is False, so a missing
    # timestamp would otherwise fall through to the final 'Night' branch.
    # Report it as missing instead of inventing a label.
    if pd.isna(timestamp):
        return None

    hour = timestamp.hour
    if 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 24:
        return 'Evening'
    else:
        return 'Night'

# Label every trip with its part of the day, derived element-wise from the
# start timestamp. Skipped when the table has no 'started_at' column.
if 'started_at' in input_table.columns:
    start_times = input_table['started_at']
    input_table['time_of_day'] = start_times.apply(get_time_of_day)

# Hand the enriched frame back to KNIME through the first output port
knio.output_tables[0] = knio.Table.from_pandas(input_table)


Aggregate Data and Display Map

In [None]:
import knime.scripting.io as knio
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Convert the Arrow input table to a Pandas DataFrame
input_table = knio.input_tables[0].to_pandas()

# Select the first 100 trips. Each trip adds its own line trace further down,
# so this cap also bounds the number of traces in the figure.
input_table = input_table.head(100)

# Build the start and end point tables with a shared column layout.
# rename() and assign() return new frames, so the column selections taken from
# input_table are never modified in place (adding a column to such a selection
# directly is what triggers pandas' SettingWithCopyWarning).
start_points = (
    input_table[['ride_id', 'start_lat', 'start_lng', 'start_station_name']]
    .rename(columns={
        'start_lat': 'lat',
        'start_lng': 'lng',
        'start_station_name': 'station_name',
    })
    .assign(type='Start')
)
end_points = (
    input_table[['ride_id', 'end_lat', 'end_lng', 'end_station_name']]
    .rename(columns={
        'end_lat': 'lat',
        'end_lng': 'lng',
        'end_station_name': 'station_name',
    })
    .assign(type='End')
)

# Concatenate start and end points into one long table (two rows per trip)
all_points = pd.concat([start_points, end_points])

# Create a scatter mapbox plot for start and end points, coloured by type
fig = px.scatter_mapbox(
    all_points,
    lat='lat',
    lon='lng',
    color='type',
    hover_name='station_name',
    hover_data={'ride_id': True},
    zoom=10,
    height=600
)

# Add one line trace per trip, connecting its start and end coordinates.
# The trace is named after the ride so it can be identified in the legend.
for _, row in input_table.iterrows():
    fig.add_trace(go.Scattermapbox(
        mode="lines",
        lon=[row['start_lng'], row['end_lng']],
        lat=[row['start_lat'], row['end_lat']],
        line=dict(width=2, color='blue'),
        name=row['ride_id']
    ))

# Use the OpenStreetMap tiles and remove the outer figure margins
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

# Show the plot
fig.show()

# Output the (truncated) input table as the output table
knio.output_tables[0] = knio.Table.from_pandas(input_table)

Station Popularity

In [None]:
import knime.scripting.io as knio
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Convert the Arrow input table to a Pandas DataFrame
input_table = knio.input_tables[0].to_pandas()

# Count trips by station, separately for trip starts and trip ends
start_popularity = input_table['start_station_name'].value_counts().reset_index()
start_popularity.columns = ['Station Name', 'Start Trips']

end_popularity = input_table['end_station_name'].value_counts().reset_index()
end_popularity.columns = ['Station Name', 'End Trips']

# Merge and calculate total trips. The outer join keeps stations that only
# ever appear as a start or as an end; their missing count becomes 0.
popularity = pd.merge(start_popularity, end_popularity, on='Station Name', how='outer').fillna(0)
popularity['Total Trips'] = popularity['Start Trips'] + popularity['End Trips']

# Get the 10 most popular stations by total trips
most_popular_stations = popularity.sort_values(by='Total Trips', ascending=False).head(10)

# Collect every (station, lat, lng) observation from both ends of the trips
station_columns = ['Station Name', 'lat', 'lng']
start_coords = input_table[['start_station_name', 'start_lat', 'start_lng']].rename(
    columns=dict(zip(['start_station_name', 'start_lat', 'start_lng'], station_columns))
)
end_coords = input_table[['end_station_name', 'end_lat', 'end_lng']].rename(
    columns=dict(zip(['end_station_name', 'end_lat', 'end_lng'], station_columns))
)

# Collapse the observations to exactly one coordinate pair per station (the
# mean position). De-duplicating on (name, lat, lng) alone is not enough: if a
# station occurs with more than one coordinate pair, it keeps several rows and
# the left merge below would repeat that station once per pair, producing more
# than 10 output rows and duplicated markers/annotations.
coords = (
    pd.concat([start_coords, end_coords])
    .groupby('Station Name', as_index=False)[['lat', 'lng']]
    .mean()
)

# Merge coordinates with most popular stations (one row per station)
most_popular_stations = pd.merge(most_popular_stations, coords, on='Station Name', how='left')

# Create a scatter mapbox plot for the most popular stations; marker size is
# driven by the 'Total Trips' column of the same frame
fig = px.scatter_mapbox(
    most_popular_stations,
    lat='lat',
    lon='lng',
    size='Total Trips',
    hover_name='Station Name',
    hover_data={'Start Trips': True, 'End Trips': True, 'Total Trips': True},
    zoom=10,
    height=600
)

# Use the OpenStreetMap tiles and remove the outer figure margins
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

# Add station names to the side of the map: one paper-referenced annotation
# per station, spaced evenly from the top (y=1) downwards
station_names = most_popular_stations['Station Name'].tolist()
annotations = [
    dict(
        x=1.05,
        y=1 - (i / len(station_names)),
        xref='paper',
        yref='paper',
        text=station,
        showarrow=False,
        font=dict(size=12)
    ) for i, station in enumerate(station_names)
]

fig.update_layout(annotations=annotations)

# Save the plot to an HTML file (relative path, resolved against the working
# directory of the Python process)
fig.write_html("most_popular_stations_map.html")

# Output the popularity table as the output table
knio.output_tables[0] = knio.Table.from_pandas(most_popular_stations)

Attempt at Geospatial Heatmap

In [None]:
import knime.scripting.io as knio
import pandas as pd

# Materialise the first KNIME input port as a pandas DataFrame
input_table = knio.input_tables[0].to_pandas()

# The 'folium' module is not available here, so no heatmap is rendered.
# As a stand-in, the table is enriched with a simple derived column.

# Net trips per row: trips started minus trips ended
net_trips = input_table['Start Trips'].sub(input_table['End Trips'])
input_table['Net Trips'] = net_trips

# Hand the transformed frame back to KNIME through the first output port
knio.output_tables[0] = knio.Table.from_pandas(input_table)