## In this notebook, we explore how to perform anomaly detection on the New York City taxi demand time series.

In [None]:
import pandas as pd

In [None]:
# Load nyc_taxi.csv (path is relative to the current working directory).
df = pd.read_csv("nyc_taxi.csv")
# Preview the first rows to check which columns were read.
df.head()

In [None]:
def overview(df: pd.DataFrame, timestamp_col: str = None) -> None:
    """Print a quick summary of ``df``: null counts, dtypes and date range.

    Args:
        df: Frame to summarise. When ``timestamp_col`` is given, that column
            is overwritten in place with its ``pd.to_datetime`` conversion,
            so the caller's frame holds datetimes afterwards.
        timestamp_col: Name of the timestamp column. When ``None`` the date
            range section is skipped and ``df`` is left untouched.

    Entries that cannot be parsed as dates become ``NaT`` because the
    conversion uses ``errors='coerce'``.
    """
    null_counts = df.isnull().sum()
    print('Null Count:\n', null_counts, '\n')
    print('Data Types:\n', df.dtypes)

    if timestamp_col is None:
        return

    # Convert in place: the null counts above were taken before this step.
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    parsed = df[timestamp_col]
    first, last = parsed.min(), parsed.max()

    print('\nDate Range:\n\nStart:\t', first)
    print('End:\t', last)
    print('Days:\t', (last - first))

In [None]:
# Print the summary. This call also converts df['timestamp'] to datetime in
# place, which the later set_index('timestamp').resample(...) calls rely on.
overview(df, timestamp_col='timestamp')

In [None]:
pip install holoviews bokeh

In [None]:
import holoviews as hv
from holoviews import opts
# Select Bokeh as the HoloViews plotting backend.
hv.extension('bokeh')

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
def _demand_curve(rule, title, xlabel=""):
    """Return a Curve of ``df`` averaged over the pandas resample frequency ``rule``."""
    averaged = df.set_index('timestamp').resample(rule).mean()
    return hv.Curve(averaged).opts(
        opts.Curve(title=title, xlabel=xlabel, ylabel="Demand",
                   width=700, height=400, tools=['hover'], show_grid=True))


Hourly = _demand_curve('h', "New York City Taxi Demand Hourly")
Daily = _demand_curve('D', "New York City Taxi Demand Daily")
Weekly = _demand_curve('W', "New York City Taxi Demand Weekly", xlabel="Date")

# Lay the three plots out in a single column without shared axes.
(Hourly + Daily + Weekly).opts(shared_axes=False).cols(1)

#Hourly

In [None]:
# Display the frame.
df

In [None]:
# Average the readings per hour, day and week; reset_index() turns
# 'timestamp' back into an ordinary column in each result.
df_hourly, df_daily, df_weekly = (
    df.set_index('timestamp').resample(rule).mean().reset_index()
    for rule in ('h', 'D', 'W')
)

In [None]:
# Round the daily mean values to two decimal places.
df_daily["value"] = df_daily["value"].round(2)

In [None]:
# Display the daily frame.
df_daily

In [None]:
# Add the weekday name of each hourly timestamp.
df_hourly["Weekday"] = df_hourly["timestamp"].dt.day_name()

In [None]:
# Remove the 'Weekday' column again; the feature loop further down adds it back.
df_hourly.drop(columns=["Weekday"], inplace=True)

In [None]:
# Add calendar, lag and rolling-mean features to the hourly and daily frames.
for frame in [df_hourly, df_daily]:
    stamps = frame['timestamp'].dt
    frame['Weekday'] = stamps.day_name()
    frame['Hour'] = stamps.hour
    frame['Day'] = stamps.weekday + 1  # 1 = Monday ... 7 = Sunday
    frame['Month'] = stamps.month
    frame['Year'] = stamps.year
    frame['Month_day'] = stamps.day
    frame['Lag'] = frame['value'].shift(1)  # previous row's value; NaN on the first row
    frame['Rolling_Mean'] = frame['value'].rolling(7, min_periods=1).mean()
    # Drop incomplete rows from the frame itself. Assigning the result of
    # dropna() to the loop variable would only rebind that name and leave
    # df_hourly / df_daily unchanged.
    # NOTE: re-running this cell recomputes 'Lag', so the new first row is
    # dropped again on every run.
    frame.dropna(inplace=True)

In [None]:
# Distribution plot of every reading in df['value'].
overall_dist_opts = opts.Distribution(
    title="Overall Value Distribution",
    xlabel="Value", ylabel="Density",
    width=700, height=300,
    tools=['hover'], show_grid=True,
)
hv.Distribution(df['value']).opts(overall_dist_opts)

In [None]:
# Mean demand for every (hour, weekday) pair, pivoted to one column per weekday.
# 'value' is selected before aggregating so no other column is averaged.
by_weekday = df_hourly.groupby(['Hour', 'Weekday'], observed=False)['value'].mean().unstack()

# One distribution per weekday, overlaid from Monday through Sunday.
weekday_densities = [
    hv.Distribution(by_weekday[day], label=day)
    for day in ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
]
plot = hv.Overlay(weekday_densities)

# Set the title together with the other options so it applies to every
# element of the overlay rather than only to the last one.
plot.opts(opts.Distribution(title="Demand Density by Day & Hour",
                            width=800, height=300, tools=['hover'], show_grid=True,
                            ylabel="Freq", xlabel="Demand"))

In [None]:
# Total demand per weekday, shown as bars.
total_by_weekday = df_hourly[['value', 'Weekday']].groupby('Weekday').sum()
hv.Bars(total_by_weekday).opts(
    opts.Bars(
        title="New York City Taxi Demand by Day",
        xlabel="", ylabel="Demand",
        width=700, height=300,
        tools=['hover'], show_grid=True,
    )
)

In [57]:
# Weekdays in calendar order, Monday first.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Store 'Weekday' as an ordered categorical so it sorts and groups in
# calendar order instead of alphabetically.
ordered_weekdays = pd.Categorical(
    df_hourly['Weekday'], categories=weekday_order, ordered=True
)
df_hourly['Weekday'] = ordered_weekdays

In [58]:
# Mean demand per weekday, shown as bars.
mean_by_weekday = (
    df_hourly[['value', 'Weekday']]
    .groupby('Weekday', observed=False)
    .mean()
)
hv.Bars(mean_by_weekday).opts(
    opts.Bars(
        title="New York City Taxi Demand by Day",
        xlabel="", ylabel="Demand",
        width=700, height=300,
        tools=['hover'], show_grid=True,
    )
)

In [59]:
# Mean demand for each hour of the day, shown as a curve.
mean_by_hour = df_hourly[['value', 'Hour']].groupby('Hour').mean()
hv.Curve(mean_by_hour).opts(
    opts.Curve(
        title="New York City Taxi Demand Hourly",
        xlabel="Hour", ylabel="Demand",
        width=700, height=300,
        tools=['hover'], show_grid=True,
    )
)

In [64]:
# Mean demand for every (hour, weekday) pair, pivoted to one column per weekday.
# 'value' is selected before aggregating so the other columns (including the
# datetime 'timestamp' column) are not averaged and then discarded.
by_weekday = (
    df_hourly.groupby(['Hour', 'Weekday'], observed=False)['value']
    .mean()
    .unstack()
)

# Weekdays in calendar order; this fixes the order of the curves in the overlay.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One curve per weekday, each labelled with the weekday name.
curves = [hv.Curve(by_weekday[day], label=day) for day in weekday_order]

# Overlay the curves on shared axes and style them in one options call.
plot = hv.Overlay(curves).opts(
    opts.Curve(title="Average Demand by Day & Hour", width=800, height=300,
               tools=['hover'], show_grid=True, ylabel="Demand")
)

plot