In [2]:
import polars as pl
import numpy as np
import pandas as pd
import plotly.express as px

In [3]:
# User CLI input: CSV__INPUT is the CSV file name, without the ".csv" extension, that load_csv reads.
# The other CLI input, TIME__PERIOD__LENGTH, is set in a later cell next to the time-of-day analysis.
CSV__INPUT = 'reddit_vm'

In [4]:
def load_csv(CSV__INPUT: str):
    """
    Read the CSV chosen on the CLI into a Polars dataframe and auto-detect which column holds the datetime data.

    Parameters:
    - CSV__INPUT: the name of the csv file in string format (without the ".csv" extension); it is looked up in ../../sample_data/

    Returns:
    - df: a Polars dataframe made from the loaded CSV, read with Polars' automatic date parsing enabled.
    - datetime_col_name: a string value of the single datetime column name that was detected.

    Raises:
    - TypeError: if no datetime column is found, or if more than one is found (choosing between them is not yet supported).

    # TODO: List out all possible datetime formats that Polars can automatically detect for transparency to non-technical users. We can potentially code out other options that Polars does not cover. 
    """
    csv_path = f'../../sample_data/{CSV__INPUT}.csv'
    df = pl.read_csv(csv_path, try_parse_dates = True)

    # Keep only the columns whose parsed dtype is a Polars Datetime.
    candidates = [name for name, dtype in zip(df.columns, df.dtypes) if isinstance(dtype, pl.Datetime)]

    # Guard clauses: exactly one datetime column is required to continue.
    if not candidates:
        raise TypeError('No datetime columns were found.')
    if len(candidates) > 1:
        print(f'More than one datetime column found. Please choose one of the following to analyze: {candidates}')
        # TODO: Create CLI interaction where user must choose which datetime column to analyze and relabel dataset.
        raise TypeError('Multiple datetime columns were found. Choosing a column on CLI is not yet supported, please remove other datetime columns for the time being.')

    datetime_col_name = candidates[0]
    print(f'Datetime column found: {datetime_col_name}')

    # Show a short preview so the user can confirm the right file and column were picked up.
    display(df.head(5))
    return df, datetime_col_name
    
df, datetime_col_name = load_csv(CSV__INPUT)

Datetime column found: timestamp


title,score,id,url,comms_num,created,body,timestamp
str,i64,str,str,i64,f64,str,datetime[μs]
"""Health Canada approves AstraZe…",7,"""lt74vw""","""https://www.canadaforums.ca/20…",0,1614400000.0,,2021-02-27 06:33:45
"""COVID-19 in Canada: 'Vaccinati…",2,"""lsh0ij""","""https://www.canadaforums.ca/20…",1,1614300000.0,,2021-02-26 07:11:07
"""Coronavirus variants could fue…",6,"""lohlle""","""https://www.canadaforums.ca/20…",0,1613900000.0,,2021-02-21 07:50:08
"""Canadian government to extend …",1,"""lnptv8""","""https://www.canadaforums.ca/20…",0,1613800000.0,,2021-02-20 06:35:13
"""Canada: Pfizer is 'extremely c…",6,"""lkslm6""","""https://www.canadaforums.ca/20…",0,1613500000.0,,2021-02-16 11:36:28


In [8]:
def process_datetime_feature_engineering(df: pl.DataFrame, datetime_col_name: str):
    """
    Derive time-of-day helper columns (hour, minute, minute_of_day) from a datetime column.

    Parameters:
    - df: a Polars dataframe containing a column with the datetime datatype.
    - datetime_col_name: the name (str) of the datetime column to derive the new columns from

    Returns:
    - df: a Polars dataframe in which the datetime column has been converted to millisecond precision with its
      time zone removed, and to which the integer columns "hour", "minute" and "minute_of_day" have been added.

    """
    timestamp = pl.col(datetime_col_name)

    # Step 1: normalise the datetime column itself (millisecond time unit, no time zone).
    normalised = df.with_columns(
        timestamp.dt.cast_time_unit("ms").dt.replace_time_zone(None)
    )

    # Step 2: pull out hour and minute. Both are cast to 64-bit integers so the bit-size is large enough
    # for the arithmetic below (the original note says these default to i8 without casting).
    with_clock_parts = normalised.with_columns(
        timestamp.dt.hour().cast(pl.Int64).alias("hour"),
        timestamp.dt.minute().cast(pl.Int64).alias("minute"),
    )

    # Step 3: combine them into the minute marker within the day (0 for 00:00, 1439 for 23:59).
    return with_clock_parts.with_columns(
        (pl.col("hour") * 60 + pl.col("minute")).alias("minute_of_day")
    )

df = process_datetime_feature_engineering(df, datetime_col_name)
df.head()

title,score,id,url,comms_num,created,body,timestamp,hour,minute,minute_of_day
str,i64,str,str,i64,f64,str,datetime[ms],i64,i64,i64
"""Health Canada approves AstraZe…",7,"""lt74vw""","""https://www.canadaforums.ca/20…",0,1614400000.0,,2021-02-27 06:33:45,6,33,393
"""COVID-19 in Canada: 'Vaccinati…",2,"""lsh0ij""","""https://www.canadaforums.ca/20…",1,1614300000.0,,2021-02-26 07:11:07,7,11,431
"""Coronavirus variants could fue…",6,"""lohlle""","""https://www.canadaforums.ca/20…",0,1613900000.0,,2021-02-21 07:50:08,7,50,470
"""Canadian government to extend …",1,"""lnptv8""","""https://www.canadaforums.ca/20…",0,1613800000.0,,2021-02-20 06:35:13,6,35,395
"""Canada: Pfizer is 'extremely c…",6,"""lkslm6""","""https://www.canadaforums.ca/20…",0,1613500000.0,,2021-02-16 11:36:28,11,36,696


In [112]:
# The CLI lets the user choose how many minutes each time-of-day period should span.
TIME__PERIOD__LENGTH = 30
# TODO: add capacity for unique users

def create_time_period_label_dict(TIME__PERIOD__LENGTH):
    """
    A helper function that builds a dictionary mapping each time period index to its time period label.

    Parameters:
    - TIME__PERIOD__LENGTH: the number of minutes in each period; the day (1440 minutes) is stepped through in
      increments of this size starting at minute 0.

    Returns:
    - a dictionary whose keys are the period indices 0, 1, 2, ... and whose values are "HH:MM-HH:MM" labels giving
      the start and end of the period. The final period always ends at "00:00".
    """
    def _clock_label(minute_marker):
        # Turn a minute-of-day marker into a zero-padded "HH:MM" string.
        hour_part = str(minute_marker // 60).zfill(2)
        fraction_of_hour = (minute_marker / 60) % 1
        minute_part = str(int(np.round(fraction_of_hour * 60))).zfill(2)
        return hour_part + ":" + minute_part

    # Starting time of every period in the day.
    period_starts = np.array([_clock_label(marker) for marker in np.arange(0, 1440, TIME__PERIOD__LENGTH)])

    # Each period ends where the next one starts; the last one wraps around to midnight.
    period_ends = list(period_starts[1:]) + ["00:00"]

    return {
        index: begin + "-" + finish
        for index, (begin, finish) in enumerate(zip(period_starts, period_ends))
    }

def analyze_time_of_day(df, TIME__PERIOD__LENGTH=60):
    """
    Count records per time-of-day period and display the result as a Plotly Express bar graph.

    The "minute_of_day" column is integer-divided by TIME__PERIOD__LENGTH to assign every record to a period,
    the records in each period are counted, and the period index is relabelled to a "HH:MM-HH:MM" string using
    create_time_period_label_dict. The grouped data is converted to a Pandas dataframe only for plotting.

    Parameters:
    - df: a Polars dataframe containing a "minute_of_day" column (as added by process_datetime_feature_engineering).
    - TIME__PERIOD__LENGTH: the number of minutes in each period (defaults to 60). A warning is printed when it
      does not divide the 1440 minutes of a day evenly.

    Returns:
    - grouped_df: a Polars dataframe sorted by period, with a "minute_split_group" column holding the period label
      and a "count" column holding the number of records in that period. Periods without any records do not appear.

    """
    # An exact integer remainder check; a non-zero remainder means the last period of the day is shorter.
    if 1440 % TIME__PERIOD__LENGTH != 0:
        print(
            f"Warning: The number of minutes you provided ({TIME__PERIOD__LENGTH}) do not divide evenly so the created time periods will not be equal."
            "\nWe recommend numbers that are factors of 1440 such as 10, 15, 30, 45, 60, 120, etc."
        )
        # TODO: Add a CLI of 'Would you like to proceed?'

    # Group the dataframe based on TIME__PERIOD__LENGTH parameter.
    # Sorting happens while the group is still an integer index so the periods stay in chronological order.
    period_counts = (
        df.with_columns(
            (pl.col("minute_of_day") // TIME__PERIOD__LENGTH).alias("minute_split_group")
        )
        .group_by("minute_split_group")
        .agg([
            pl.col("minute_of_day").count().alias("count")
        ])
        .sort("minute_split_group")
    )

    # Relabel minute_split_group values to time periods.
    time_period_label_dict = create_time_period_label_dict(TIME__PERIOD__LENGTH)
    grouped_df = period_counts.with_columns(
        pl.col("minute_split_group").replace_strict(time_period_label_dict).alias("minute_split_group")
    )

    # Create the Plotly bar graph
    fig = px.bar(
        grouped_df.to_pandas(),
        x='minute_split_group',
        y='count',
        orientation='v',
        title=f'Record Frequency Split by {TIME__PERIOD__LENGTH} Minutes',
        labels={'minute_split_group': f'{TIME__PERIOD__LENGTH}-Minute Period Label', 'count': 'Count'},
    )

    # Show the plot
    fig.show() # TODO: Allow for graph export and/or upload to HTML

    return grouped_df

grouped_tod_df = analyze_time_of_day(df, TIME__PERIOD__LENGTH)
grouped_tod_df.head()

minute_split_group,count
str,u32
"""00:00-00:30""",26
"""00:30-01:00""",45
"""01:00-01:30""",40
"""01:30-02:00""",33
"""02:00-02:30""",38


In [9]:

# TODO LIST:
### DATE BREAKDOWN
### DAY OF WEEK BREAKDOWN