In [1]:
import polars as pl
import pandas as pd
import plotly.express as px

In [2]:
# User CLI Inputs: CSV_INPUT, MINUTE_SPLIT_INPUT

In [3]:
# Stand-in for the CLI selection: the file name (without ".csv") that load_csv reads from ../../sample_data/.
CSV_INPUT = 'reddit_vm'

def load_csv(CSV_INPUT):
    """
    Read the CSV selected in the CLI into a Polars dataframe and report which of its columns are datetimes.

    The file is read from ../../sample_data/<CSV_INPUT>.csv with Polars' automatic date parsing enabled.
    The first five rows are displayed and the full dataframe is returned.

    Raises a TypeError when no column was parsed as a datetime. When several datetime columns are found,
    they are only listed for the user; nothing is selected here.

    # TODO: List out all possible datetime formats that Polars can automatically detect for transparency to non-technical users. We can potentially code out other options that Polars does not cover. 

    """
    frame = pl.read_csv(f'../../sample_data/{CSV_INPUT}.csv', try_parse_dates = True)

    # Walk the columns in file order and keep the names whose dtype is a Datetime.
    found = []
    for name, dtype in zip(frame.columns, frame.dtypes):
        if isinstance(dtype, pl.Datetime):
            found.append(name)

    # Nothing to analyze without at least one datetime column.
    if not found:
        raise TypeError('No datetime columns were found.')

    if len(found) == 1:
        print(f'Datetime column found: {found[0]}')
    else:
        print(f'More than one datetime column found. Please choose one of the following to analyze: {found}')

    display(frame.head(5))
    return frame
    
# Load the selected CSV; load_csv also prints the detected datetime column(s) and displays the first five rows.
df = load_csv(CSV_INPUT)

Datetime column found: timestamp


title,score,id,url,comms_num,created,body,timestamp
str,i64,str,str,i64,f64,str,datetime[μs]
"""Health Canada approves AstraZe…",7,"""lt74vw""","""https://www.canadaforums.ca/20…",0,1614400000.0,,2021-02-27 06:33:45
"""COVID-19 in Canada: 'Vaccinati…",2,"""lsh0ij""","""https://www.canadaforums.ca/20…",1,1614300000.0,,2021-02-26 07:11:07
"""Coronavirus variants could fue…",6,"""lohlle""","""https://www.canadaforums.ca/20…",0,1613900000.0,,2021-02-21 07:50:08
"""Canadian government to extend …",1,"""lnptv8""","""https://www.canadaforums.ca/20…",0,1613800000.0,,2021-02-20 06:35:13
"""Canada: Pfizer is 'extremely c…",6,"""lkslm6""","""https://www.canadaforums.ca/20…",0,1613500000.0,,2021-02-16 11:36:28


In [4]:
def process_datetime_feature_engineering(df, datetime_col="timestamp"):
    """
    Add time-of-day features derived from a datetime column.

    Args:
        df: Polars dataframe containing the datetime column named by datetime_col.
        datetime_col: Name of the datetime column to derive the features from.
            Defaults to "timestamp", so existing calls that pass only df behave as before.

    Returns:
        A new Polars dataframe in which datetime_col has been cast to millisecond
        precision with its time zone removed, plus three Int64 columns:
        "hour", "minute", and "minute_of_day" (hour * 60 + minute).
    """
    moment = pl.col(datetime_col)

    # Normalize the datetime column in place (same column name): millisecond unit, no time zone.
    df = df.with_columns(
        moment.dt.cast_time_unit("ms").dt.replace_time_zone(None)
    )

    # dt.hour() / dt.minute() yield 8-bit integers; widen to Int64 so that
    # hour * 60 below cannot exceed the integer range.
    df = df.with_columns(
        moment.dt.hour().cast(pl.Int64).alias("hour"),
        moment.dt.minute().cast(pl.Int64).alias("minute"),
    )

    # Minute marker within the day, from 0 (00:00) to 1439 (23:59).
    return df.with_columns(
        (pl.col("hour") * 60 + pl.col("minute")).alias("minute_of_day")
    )

In [5]:
# Add the "hour", "minute" and "minute_of_day" columns, then preview the result.
# Note: this rebinds df, so the frame returned by load_csv is no longer reachable under that name.
df = process_datetime_feature_engineering(df)
df.head()

title,score,id,url,comms_num,created,body,timestamp,hour,minute,minute_of_day
str,i64,str,str,i64,f64,str,datetime[ms],i64,i64,i64
"""Health Canada approves AstraZe…",7,"""lt74vw""","""https://www.canadaforums.ca/20…",0,1614400000.0,,2021-02-27 06:33:45,6,33,393
"""COVID-19 in Canada: 'Vaccinati…",2,"""lsh0ij""","""https://www.canadaforums.ca/20…",1,1614300000.0,,2021-02-26 07:11:07,7,11,431
"""Coronavirus variants could fue…",6,"""lohlle""","""https://www.canadaforums.ca/20…",0,1613900000.0,,2021-02-21 07:50:08,7,50,470
"""Canadian government to extend …",1,"""lnptv8""","""https://www.canadaforums.ca/20…",0,1613800000.0,,2021-02-20 06:35:13,6,35,395
"""Canada: Pfizer is 'extremely c…",6,"""lkslm6""","""https://www.canadaforums.ca/20…",0,1613500000.0,,2021-02-16 11:36:28,11,36,696


In [13]:
# Width, in minutes, of each bucket the day is split into (a stand-in for the value the user chooses in the CLI).
MINUTE_SPLIT_INPUT = 60
# TODO: add capacity for unique users

def run_temporal_test_1(input_df, MINUTE_SPLIT_INPUT=60):
    """
    Count records per fixed-width time-of-day bucket and plot the counts as a bar chart.

    Args:
        input_df: Polars dataframe with a "minute_of_day" column (minutes since midnight).
        MINUTE_SPLIT_INPUT: Bucket width in minutes; must be greater than zero.
            Each row is assigned to bucket minute_of_day // MINUTE_SPLIT_INPUT.

    Returns:
        A Polars dataframe with one row per non-empty bucket, holding the columns
        "minute_split_group" and "count", sorted by "minute_split_group".

    Raises:
        ValueError: If MINUTE_SPLIT_INPUT is not greater than zero.
    """
    # The width comes from user input; a zero or negative width has no meaningful bucket.
    if MINUTE_SPLIT_INPUT <= 0:
        raise ValueError(
            f'MINUTE_SPLIT_INPUT must be greater than 0, got {MINUTE_SPLIT_INPUT}.'
        )

    df = input_df.with_columns(
        (pl.col("minute_of_day") // MINUTE_SPLIT_INPUT).alias("minute_split_group")
    )

    # group_by does not promise any row order, so sort once here; the chart and
    # the returned frame are then built from the same ordered data.
    result_df = df.group_by("minute_split_group").agg([
        pl.col("minute_of_day").count().alias("count")
    ]).sort("minute_split_group")

    # Create the Plotly graph
    fig = px.bar(result_df.to_pandas(),
                x='minute_split_group',
                y='count',
                orientation='v',
                title=f'Record Frequency Split by {MINUTE_SPLIT_INPUT} Minutes',
                labels={'minute_split_group': f'{MINUTE_SPLIT_INPUT}-Minute Period Label', 'count': 'Count'},
            )

    # Show the plot
    fig.show()

    return result_df

# Run the frequency breakdown (this renders the bar chart) and preview the first rows of the sorted counts.
tt1_df = run_temporal_test_1(df, MINUTE_SPLIT_INPUT)
tt1_df.head()

minute_split_group,count
i64,u32
0,71
1,73
2,88
3,75
4,52


In [9]:

# TODO LIST:
### DATE BREAKDOWN
### DAY OF WEEK BREAKDOWN