In [20]:
import os
import re
from multiprocessing import Pool

import pandas as pd


In [21]:
# blank out Name/BirthDate/Time on rows whose checked columns hold no digits
def replace_non_digit_string(data_frame):
    """Null out fields on rows that carry no numeric content.

    A row is selected when none of its ``Salary``, ``BirthDate`` and ``Time``
    values, rendered as strings, contains a digit. On those rows ``Name``,
    ``BirthDate`` and ``Time`` are set to ``pd.NA``.

    NOTE(review): the columns inspected (Salary, BirthDate, Time) differ from
    the columns cleared (Name, BirthDate, Time), so ``Salary`` is never
    cleared. The existing behaviour is kept as-is -- confirm it is intended.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns ``Name``, ``Salary``, ``BirthDate`` and
        ``Time``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected cells replaced; the input frame is left
        unmodified.
    """
    checked_columns = ['Salary', 'BirthDate', 'Time']
    cleared_columns = ['Name', 'BirthDate', 'Time']

    # Work on a copy so the caller's frame is not silently rewritten.
    cleaned = data_frame.copy()

    # Cast to str first so numeric and missing values can be regex-tested too.
    has_digit = cleaned[checked_columns].apply(
        lambda col: col.astype(str).str.contains(r'\d')
    )
    no_digit_rows = ~has_digit.any(axis=1)

    cleaned.loc[no_digit_rows, cleared_columns] = pd.NA
    return cleaned


In [22]:
# drop rows in which every cell is missing
def remove_empty_strings(data_frame):
    """Return ``data_frame`` without the rows whose values are all NA.

    Despite the name, empty strings are not treated specially: only rows in
    which every column is missing (``dropna(how='all')``) are removed.
    """
    return data_frame.dropna(how='all')


In [23]:
# drop rows that exactly repeat an earlier row
def remove_duplicates(date_frame):
    """Return ``date_frame`` with fully duplicated rows removed.

    Rows are compared across all columns and the first occurrence of each
    distinct row is kept (the ``drop_duplicates`` defaults).
    """
    return date_frame.drop_duplicates()


In [24]:
# blank out Name/BirthDate/Time on rows timed strictly between 01:00:00 and 03:00:00
def remove_by_time(data_frame):
    """Null out fields on rows whose ``Time`` falls inside the 01:00-03:00 window.

    ``Time`` is parsed with the ``%H:%M:%S`` format and stored as
    ``datetime.time`` objects; values that do not parse become missing. Rows
    whose time is strictly after 01:00:00 and strictly before 03:00:00 get
    ``Name``, ``BirthDate`` and ``Time`` set to ``pd.NA``. The rows themselves
    are kept and other columns are left untouched.

    NOTE(review): both bounds are exclusive, so exactly 01:00:00 and 03:00:00
    are not affected -- confirm that matches the requirement.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns ``Name``, ``BirthDate`` and ``Time``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    start_time = pd.to_datetime('01:00:00').time()
    end_time = pd.to_datetime('03:00:00').time()

    # Copy first: the input is often the result of a filtering call, and
    # assigning into it would modify the caller's data and can raise
    # SettingWithCopyWarning.
    cleaned = data_frame.copy()
    cleaned['Time'] = pd.to_datetime(
        cleaned['Time'], format='%H:%M:%S', errors='coerce'
    ).dt.time

    times = cleaned['Time']
    in_window = times.notna() & (times > start_time) & (times < end_time)

    cleaned.loc[in_window, ['Name', 'BirthDate', 'Time']] = pd.NA
    return cleaned


In [25]:
# Load the raw data and run it through the cleaning steps in order.
# read_csv already turns empty cells into NA, so no extra fill step is needed;
# the previous bare `df.fillna('')` call discarded its result and had no effect.
df = (
    pd.read_csv('.test_data_set.csv')
    .pipe(replace_non_digit_string)
    .pipe(remove_empty_strings)
    .pipe(remove_duplicates)
    .pipe(remove_by_time)
)


In [None]:
# per-hour metrics: number of distinct names, mean salary and median salary
def metrics_calculation(data_frame):
    """Aggregate names and salaries by the hour component of ``Time``.

    ``Time`` is parsed with the ``%H:%M:%S`` format and rows are grouped by
    the resulting hour. Rows whose time is missing do not belong to any group
    and are therefore left out of the result.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns ``Name``, ``Salary`` and ``Time``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Hour`` with the columns ``UniqueCounts`` (distinct
        ``Name`` values), ``MeanValues`` and ``MedianValues`` (of ``Salary``).
        The input frame is left unmodified.
    """
    # Derive the hour as a standalone Series instead of writing 'Time'/'Hour'
    # columns back into the caller's frame; one parse is enough to get the hour.
    hours = pd.to_datetime(data_frame['Time'], format='%H:%M:%S').dt.hour.rename('Hour')

    grouped = data_frame.groupby(hours)

    return pd.DataFrame({
        'UniqueCounts': grouped['Name'].nunique(),
        'MeanValues': grouped['Salary'].mean(),
        'MedianValues': grouped['Salary'].median(),
    })

# Build the per-hour metrics table from the cleaned frame `df`.
metrics = metrics_calculation(df)


In [None]:
# Show the per-hour metrics table as this cell's output.
metrics

112634.0