In [1]:
import os
import shutil
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Only when running inside Google Colab: mount Drive and stage the parquet
# files into the local ./datasets folder that the loading cell reads from.
if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  os.makedirs('./datasets', exist_ok=True)
  drive.mount('/content/drive')

  # NOTE(review): the destination is absolute while the folder created above is
  # relative; they match only when the working directory is /content.
  for parquet_path in (
      '/content/drive/MyDrive/DataScience/Analytics/Estudo Fraude/df_train_test.parquet',
      '/content/drive/MyDrive/DataScience/Analytics/Estudo Fraude/df_validation.parquet',
  ):
    shutil.copy(parquet_path, '/content/datasets/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Stack the train/test and validation parquet files into a single frame.
df = pl.concat([
    pl.read_parquet(parquet_path)
    for parquet_path in (
        './datasets/df_train_test.parquet',
        './datasets/df_validation.parquet',
    )
])
df.head()

id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,description,target,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
i64,datetime[ns],i64,i64,f64,str,i64,str,str,f64,str,str,str,str,i64,i64,i64,i64,str,str,f64,f64,f64,f64,f64,i64,i64,str,str,i64,str,i64,str,i64,f64,str,i64,str
7475327,2010-01-01 00:01:00,1556,2972,-77.0,"""Swipe Transaction""",59935,"""Beulah""","""ND""",58523.0,"""5499""",,"""Miscellaneous Food Stores""","""No""",30,67,1989,7,"""Female""","""594 Mountain View Street""",46.8,-100.76,23679.0,48277.0,110153.0,740,4,"""Mastercard""","""Debit (Prepaid)""",5497590243197280,"""07/2022""",306,"""YES""",2,55.0,"""05/2008""",2008,"""No"""
7475328,2010-01-01 00:02:00,561,4575,14.57,"""Swipe Transaction""",67570,"""Bettendorf""","""IA""",52722.0,"""5311""",,"""Department Stores""","""No""",48,67,1971,6,"""Male""","""604 Pine Street""",40.8,-91.12,18076.0,36853.0,112139.0,834,5,"""Mastercard""","""Credit""",5175842699412235,"""12/2024""",438,"""YES""",1,9100.0,"""09/2005""",2015,"""No"""
7475329,2010-01-01 00:02:00,1129,102,80.0,"""Swipe Transaction""",27092,"""Vista""","""CA""",92084.0,"""4829""",,"""Money Transfer""","""No""",49,65,1970,4,"""Male""","""2379 Forest Lane""",33.18,-117.29,16894.0,34449.0,36540.0,686,3,"""Mastercard""","""Debit""",5874992802287595,"""05/2020""",256,"""YES""",1,14802.0,"""01/2006""",2008,"""No"""
7475332,2010-01-01 00:06:00,848,3915,46.41,"""Swipe Transaction""",13051,"""Harwood""","""MD""",20776.0,"""5813""",,"""Drinking Places (Alcoholic Bev…","""No""",51,69,1968,5,"""Male""","""166 River Drive""",38.86,-76.6,33529.0,68362.0,96182.0,711,2,"""Visa""","""Debit""",4354185735186651,"""01/2020""",120,"""YES""",1,19113.0,"""07/2009""",2014,"""No"""
7475333,2010-01-01 00:07:00,1807,165,4.81,"""Swipe Transaction""",20519,"""Bronx""","""NY""",10464.0,"""5942""",,"""Book Stores""","""No""",47,65,1972,12,"""Female""","""14780 Plum Lane""",40.84,-73.87,25537.0,52065.0,98613.0,828,5,"""Mastercard""","""Debit (Prepaid)""",5207231566469664,"""03/2014""",198,"""YES""",1,89.0,"""01/2008""",2015,"""No"""


In [4]:
# Derive three location features from the raw merchant fields. None of the
# expressions reads a column created by another, so one `with_columns` suffices.
df = df.with_columns(
    # 'Yes' when merchant_city is "online" (case-insensitive), otherwise 'No'.
    online_sales=pl.when(pl.col('merchant_city').str.to_lowercase() == 'online')
    .then(pl.lit('Yes'))
    .otherwise(pl.lit('No')),
    # Same test, but normalises the spelling to 'Online' and keeps real city names.
    city=pl.when(pl.col('merchant_city').str.to_lowercase() == 'online')
    .then(pl.lit('Online'))
    .otherwise(pl.col('merchant_city')),
    # Two-character merchant_state values are mapped to 'United States'; any other
    # value is kept as-is, and a null merchant_state becomes 'Online'.
    country=pl.when(pl.col('merchant_state').str.len_chars() == 2)
    .then(pl.lit('United States'))
    .otherwise(pl.col('merchant_state'))
    .fill_null('Online'),
)
df.sample(5)

id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,description,target,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web,online_sales,city,country
i64,datetime[ns],i64,i64,f64,str,i64,str,str,f64,str,str,str,str,i64,i64,i64,i64,str,str,f64,f64,f64,f64,f64,i64,i64,str,str,i64,str,i64,str,i64,f64,str,i64,str,str,str,str
9058659,2011-01-22 17:35:00,977,1150,23.1,"""Swipe Transaction""",32606,"""Dallas""","""TX""",75211.0,"""7832""",,"""Motion Picture Theaters""","""No""",83,67,1936,12,"""Female""","""237 Hill Boulevard""",32.59,-96.68,13917.0,22873.0,0.0,710,5,"""Mastercard""","""Debit""",5352993424347809,"""01/2022""",551,"""YES""",2,5649.0,"""02/2008""",2016,"""No""","""No""","""Dallas""","""United States"""
8047026,2010-05-24 12:25:00,1694,3079,100.0,"""Swipe Transaction""",27092,"""Caldwell""","""NJ""",7006.0,"""4829""",,"""Money Transfer""","""No""",83,65,1936,3,"""Female""","""480 Catherine Drive""",41.09,-73.55,37213.0,74753.0,1585.0,726,5,"""Visa""","""Credit""",4325110497566622,"""12/2024""",658,"""YES""",1,16900.0,"""05/2010""",2010,"""No""","""No""","""Caldwell""","""United States"""
18322345,2016-09-03 16:14:00,885,5852,44.93,"""Chip Transaction""",65881,"""Cabool""","""MO""",65689.0,"""5311""",,"""Department Stores""","""No""",36,66,1983,12,"""Male""","""7121 Fourth Avenue""",36.67,-93.86,13895.0,28334.0,0.0,700,3,"""Discover""","""Credit""",6566463560968460,"""06/2021""",495,"""YES""",1,9000.0,"""12/2006""",2006,"""No""","""No""","""Cabool""","""United States"""
13792432,2013-12-30 12:38:00,752,2465,63.57,"""Swipe Transaction""",13646,"""Patchogue""","""NY""",11772.0,"""7538""",,"""Automotive Service Shops""","""No""",65,64,1954,6,"""Female""","""5814 Lincoln Lane""",40.82,-72.98,27376.0,64829.0,15537.0,664,3,"""Mastercard""","""Credit""",5971047131415221,"""02/2018""",191,"""NO""",1,12900.0,"""04/2004""",2010,"""No""","""No""","""Patchogue""","""United States"""
12141759,2012-12-30 13:17:00,1696,2408,62.0,"""Swipe Transaction""",59935,"""Merritt Island""","""FL""",32953.0,"""5499""",,"""Miscellaneous Food Stores""","""No""",63,65,1956,12,"""Female""","""4461 Hill Street""",28.32,-80.68,26339.0,53702.0,85160.0,606,1,"""Mastercard""","""Debit""",5412731781518758,"""05/2018""",599,"""YES""",2,12047.0,"""04/2000""",2008,"""No""","""No""","""Merritt Island""","""United States"""


In [5]:
def floor_to_15min(dt_value: datetime) -> datetime|None:
    if dt_value is None:
        return None

    seconds_since_midnight = (dt_value - dt_value.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()
    interval_seconds = 15 * 60
    floored_seconds = (seconds_since_midnight // interval_seconds) * interval_seconds
    return dt_value.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(seconds=floored_seconds)

In [6]:
# Bucket every transaction into the 15-minute window it falls in.
# `dt.truncate` is evaluated natively by Polars, so no Python function is
# called per row. The cast pins the result to microsecond precision
# (the default time unit of `pl.Datetime`) regardless of the unit of `date`.
df = df.with_columns(
    pl.col("date").dt.truncate("15m").cast(pl.Datetime("us")).alias("date_window")
)

In [7]:
def generate_columns_statistics(column_name: str) -> list:
    """Build the summary-statistic expressions for one numeric column.

    Args:
        column_name: Name of the column to summarise.

    Returns:
        Five Polars expressions (mean, median, std, skew, kurtosis, in that
        order), each aliased as ``<statistic>_<column_name>``.
    """
    values = pl.col(column_name)
    statistics = {
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'skew': values.skew(),
        'kurtosis': values.kurtosis(),
    }
    return [
        expression.alias(f'{label}_{column_name}')
        for label, expression in statistics.items()
    ]

In [8]:
# Encode the fraud label as an integer: 1 where target == 'Yes', 0 for
# everything else (including nulls).
# NOTE(review): this overwrites `target`, so re-running the cell would compare
# the already-encoded integers against 'Yes' — run it once per fresh kernel.
df = df.with_columns(
    target=pl.when(pl.col('target') == 'Yes')
    .then(pl.lit(1))
    .otherwise(pl.lit(0))
)

In [9]:
# Aggregations applied to each group: a transaction count, a fraud count, and
# the summary statistics of every numeric column listed below.
# NOTE(review): `count()` on `amount` skips null amounts — confirm that
# `amount` is never null if this is meant to be a row count.
aggregate = [
    pl.col('amount').count().alias('transactions'),
    pl.col('target').sum().alias('frauds'),
]
for column in (
    'amount',
    'current_age',
    'per_capita_income',
    'total_debt',
    'credit_score',
    'num_credit_cards',
    'credit_limit',
):
  aggregate += generate_columns_statistics(column)

In [10]:
def list_structs_to_dict(list_of_structs, column_name):
  """Turn a list of value-count records into a ``{category: proportion}`` dict.

  Each record is indexed by `column_name` (the category label) and by
  'proportion'; proportions are rounded to two decimals. Returns None when
  `list_of_structs` is None.
  """
  if list_of_structs is None:
    return None

  return {
    struct[column_name]: round(struct['proportion'], 2)
    for struct in list_of_structs
  }

def convert_categorical_to_percent(df: pl.DataFrame, column_name: str, group_by_column) -> pl.DataFrame:
  """Expand a categorical column into one proportion column per category.

  For every group of `group_by_column`, the share of each distinct value of
  `column_name` is computed (rounded to two decimals by `list_structs_to_dict`).
  Categories that do not occur in a group get 0.0.

  Args:
    df: Source frame containing both columns.
    column_name: Categorical column to expand.
    group_by_column: Name of the single column to group by; it is also
      attached to the result so the frame can be joined back.

  Returns:
    A frame sorted by `group_by_column` with one column per category, named
    ``<column_name>_<category>`` (category lower-cased, spaces replaced by
    underscores), plus the `group_by_column` column.
  """
  def to_proportion_dict(items):
    return list_structs_to_dict(items, column_name)

  # One row per group holding the normalised value counts of the category.
  df_group = (
    df.group_by(group_by_column)
    .agg(pl.col(column_name).value_counts(normalize=True))
    .sort(group_by_column)
  )

  df_fields = df_group.with_columns(
    pl.col(column_name).map_elements(
        to_proportion_dict,
        return_dtype=pl.Object
    ).alias(column_name)
  )

  # pandas builds the union of all category keys as columns; categories that
  # are missing from a group come out as NaN and are filled with 0.0.
  fields = df_fields[column_name].to_list()
  data = pd.DataFrame(fields).fillna(0.0)
  # str() keeps the rename from failing on a non-string label (e.g. a null category).
  data.columns = [f'{column_name}_{str(col).lower().replace(" ", "_")}' for col in data.columns]
  data = pl.from_pandas(data)
  # Row order matches df_fields, so the group keys can be attached positionally.
  data = data.with_columns(df_fields[group_by_column].alias(group_by_column))
  return data

In [11]:
# One row per `date_window`, carrying every expression collected in `aggregate`,
# ordered chronologically.
df_model = (
    df.group_by('date_window')
    .agg(aggregate)
    .sort('date_window')
)
df_model.head()

date_window,transactions,frauds,mean_amount,median_amount,std_amount,skew_amount,kurtosis_amount,mean_current_age,median_current_age,std_current_age,skew_current_age,kurtosis_current_age,mean_per_capita_income,median_per_capita_income,std_per_capita_income,skew_per_capita_income,kurtosis_per_capita_income,mean_total_debt,median_total_debt,std_total_debt,skew_total_debt,kurtosis_total_debt,mean_credit_score,median_credit_score,std_credit_score,skew_credit_score,kurtosis_credit_score,mean_num_credit_cards,median_num_credit_cards,std_num_credit_cards,skew_num_credit_cards,kurtosis_num_credit_cards,mean_credit_limit,median_credit_limit,std_credit_limit,skew_credit_limit,kurtosis_credit_limit
datetime[μs],u32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2010-01-01 00:00:00,6,0,15.875,20.515,52.737742,-0.760483,-0.139116,46.833333,48.5,8.841191,-1.241314,0.444023,21897.166667,20877.5,7198.264219,0.522412,-0.861983,93656.666667,103463.0,28711.635012,-1.594374,0.842529,763.5,761.0,61.29845,0.001812,-1.536333,4.0,4.5,1.264911,-0.649519,-1.125,7200.833333,4594.5,8439.396078,0.381076,-1.520432
2010-01-01 00:15:00,4,0,22.2625,21.57,22.245049,0.012927,-1.980469,49.5,50.5,16.663333,-0.134765,-1.560644,24534.0,24325.0,1301.539345,0.354425,-1.461239,60605.5,62744.5,59834.776479,-0.01997,-1.967191,724.5,732.5,24.310492,-0.830693,-0.974824,4.25,4.5,0.957427,-0.493382,-1.371901,12608.5,12360.5,13718.592311,0.010111,-1.972188
2010-01-01 00:30:00,10,0,34.644,37.73,45.188942,-0.779579,0.46044,48.6,48.0,4.993329,-0.461034,0.602038,21695.6,19148.0,6709.572916,0.674351,-0.812142,56477.3,56937.0,48226.48252,0.235514,-1.124026,700.3,720.5,63.4421,-1.464207,2.09804,3.8,4.0,1.316561,-0.541972,0.607495,19807.9,19554.5,6338.6096,0.114952,-1.629159
2010-01-01 00:45:00,6,0,15.388333,11.185,14.262798,0.910151,-0.471315,70.5,68.5,18.251027,0.262932,-1.015413,19808.166667,18760.0,3592.733245,0.927645,-0.439468,40391.666667,17407.0,51012.043302,0.59751,-1.399093,772.166667,774.5,41.421814,-0.0949,-1.491414,5.333333,5.5,1.632993,-0.279508,-1.365,17533.666667,22097.0,11039.367893,-0.713338,-1.100947
2010-01-01 01:00:00,6,0,101.018333,82.545,97.728056,0.816882,-0.48396,54.5,52.5,9.648834,0.573632,-1.012784,26342.666667,23397.5,15248.228011,1.251008,0.412107,67331.166667,55213.0,35267.904737,0.400987,-1.525858,730.333333,728.0,83.946809,0.055991,-1.513659,3.833333,4.5,1.47196,-0.519038,-1.56071,23233.333333,15679.5,20657.262681,1.339601,0.337726


In [12]:
# Attach, for each categorical column, its per-window category proportions.
for column in (
    'use_chip',
    'gender',
    'card_brand',
    'card_type',
    'has_chip',
    'online_sales',
):
  df_field = convert_categorical_to_percent(df, column, group_by_column='date_window')
  df_model = df_model.join(df_field, how='left', on='date_window')
df_model.head(10)

date_window,transactions,frauds,mean_amount,median_amount,std_amount,skew_amount,kurtosis_amount,mean_current_age,median_current_age,std_current_age,skew_current_age,kurtosis_current_age,mean_per_capita_income,median_per_capita_income,std_per_capita_income,skew_per_capita_income,kurtosis_per_capita_income,mean_total_debt,median_total_debt,std_total_debt,skew_total_debt,kurtosis_total_debt,mean_credit_score,median_credit_score,std_credit_score,skew_credit_score,kurtosis_credit_score,mean_num_credit_cards,median_num_credit_cards,std_num_credit_cards,skew_num_credit_cards,kurtosis_num_credit_cards,mean_credit_limit,median_credit_limit,std_credit_limit,skew_credit_limit,…,"description_optometrists,_optical_goods_and_eyeglasses","description_insurance_sales,_underwriting",description_laundry_services,description_shoe_stores,description_postal_services_-_government_only,description_car_washes,description_legal_services_and_attorneys,description_bus_lines,"description_doctors,_physicians",description_chiropractors,"description_accounting,_auditing,_and_bookkeeping_services","description_heating,_plumbing,_air_conditioning_contractors",description_tax_preparation_services,"description_lighting,_fixtures,_electrical_supplies",description_lawn_and_garden_supply_stores,description_upholstery_and_drapery_stores,description_automotive_body_repair_shops,description_pottery_and_ceramics,description_steel_products_manufacturing,description_heat_treating_metal_services,description_automotive_parts_and_accessories_stores,description_digital_goods_-_games,description_steelworks,description_airlines,description_fabricated_structural_metal_products,description_towing_services,"description_bolt,_nut,_screw,_rivet_manufacturing",description_miscellaneous_metal_fabrication,description_steel_drums_and_barrels,"description_computers,_computer_peripheral_equipment",description_floor_covering_stores,description_sporting_goods_stores,description_cruise_lines,description_miscellaneous_fabricated
_metal_products,description_music_stores_-_musical_instruments,description_household_appliance_stores,description_coated_and_laminated_products
datetime[μs],u32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2010-01-01 00:00:00,6,0,15.875,20.515,52.737742,-0.760483,-0.139116,46.833333,48.5,8.841191,-1.241314,0.444023,21897.166667,20877.5,7198.264219,0.522412,-0.861983,93656.666667,103463.0,28711.635012,-1.594374,0.842529,763.5,761.0,61.29845,0.001812,-1.536333,4.0,4.5,1.264911,-0.649519,-1.125,7200.833333,4594.5,8439.396078,0.381076,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 00:15:00,4,0,22.2625,21.57,22.245049,0.012927,-1.980469,49.5,50.5,16.663333,-0.134765,-1.560644,24534.0,24325.0,1301.539345,0.354425,-1.461239,60605.5,62744.5,59834.776479,-0.01997,-1.967191,724.5,732.5,24.310492,-0.830693,-0.974824,4.25,4.5,0.957427,-0.493382,-1.371901,12608.5,12360.5,13718.592311,0.010111,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 00:30:00,10,0,34.644,37.73,45.188942,-0.779579,0.46044,48.6,48.0,4.993329,-0.461034,0.602038,21695.6,19148.0,6709.572916,0.674351,-0.812142,56477.3,56937.0,48226.48252,0.235514,-1.124026,700.3,720.5,63.4421,-1.464207,2.09804,3.8,4.0,1.316561,-0.541972,0.607495,19807.9,19554.5,6338.6096,0.114952,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 00:45:00,6,0,15.388333,11.185,14.262798,0.910151,-0.471315,70.5,68.5,18.251027,0.262932,-1.015413,19808.166667,18760.0,3592.733245,0.927645,-0.439468,40391.666667,17407.0,51012.043302,0.59751,-1.399093,772.166667,774.5,41.421814,-0.0949,-1.491414,5.333333,5.5,1.632993,-0.279508,-1.365,17533.666667,22097.0,11039.367893,-0.713338,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 01:00:00,6,0,101.018333,82.545,97.728056,0.816882,-0.48396,54.5,52.5,9.648834,0.573632,-1.012784,26342.666667,23397.5,15248.228011,1.251008,0.412107,67331.166667,55213.0,35267.904737,0.400987,-1.525858,730.333333,728.0,83.946809,0.055991,-1.513659,3.833333,4.5,1.47196,-0.519038,-1.56071,23233.333333,15679.5,20657.262681,1.339601,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 01:15:00,5,0,4.61,10.34,53.67871,-0.468739,-0.482601,43.6,47.0,7.402702,-0.754216,-0.771215,23935.4,24115.0,3065.888664,0.013807,-0.528075,81437.8,65994.0,28100.092281,1.039823,-0.491354,754.0,747.0,56.178288,1.098869,-0.206714,3.0,3.0,1.0,0.0,-1.75,14693.8,15094.0,4012.619369,-0.062165,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 01:30:00,4,0,17.535,13.425,17.586155,0.425266,-1.453984,67.0,63.5,20.314199,0.318187,-1.563732,25778.5,27677.5,6119.646749,-0.905558,-0.842706,30717.5,26900.0,34917.933869,0.121262,-1.83475,747.0,743.5,52.36411,0.19568,-1.262078,6.25,6.0,0.5,1.154701,-0.666667,24618.5,22170.5,11453.384085,0.474271,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 01:45:00,7,0,51.992857,39.05,34.98547,1.019653,-0.01202,56.428571,56.0,11.544943,-0.075079,-1.292404,25930.0,23172.0,12385.902524,1.407045,0.814717,51734.285714,51741.0,42299.149521,0.614604,-0.406747,726.285714,731.0,37.187427,-0.51775,-0.245965,4.285714,4.0,1.704336,-0.040073,-1.64781,21924.571429,16409.0,25060.956213,1.683264,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 02:00:00,9,0,45.843333,6.63,68.940091,1.30041,0.377724,50.0,41.0,21.75431,1.61035,1.389957,26292.888889,25431.0,9833.970033,0.085912,-1.386506,100010.666667,75415.0,65040.225749,0.263371,-0.811614,727.555556,737.0,42.009258,-1.000527,0.042923,3.222222,3.0,1.481366,0.334335,-0.386717,19692.888889,16900.0,10802.17467,1.035411,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-01 02:15:00,6,0,35.681667,42.3,27.537275,-0.289351,-1.570798,52.833333,49.5,13.556056,1.354773,0.540975,25255.666667,22606.5,9920.893239,0.376074,-1.370472,62937.0,66372.5,55313.380099,-0.00461,-1.55354,723.833333,713.5,37.833407,1.393735,0.511141,3.5,3.0,1.378405,1.003856,-0.049861,19874.0,16996.0,12746.423702,1.212342,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Share of all transactions per merchant description, largest first;
# the cell displays only descriptions that account for at least 1%.
df_proportion = (
    df.get_column('description')
    .value_counts(normalize=True)
    .sort('proportion', descending=True)
)
df_proportion.filter(pl.col('proportion') >= 0.01)

description,proportion
str,f64
"""Grocery Stores, Supermarkets""",0.119668
"""Miscellaneous Food Stores""",0.10986
"""Service Stations""",0.107137
"""Eating Places and Restaurants""",0.075133
"""Drug Stores and Pharmacies""",0.058037
…,…
"""Utilities - Electric, Gas, Wat…",0.018213
"""Book Stores""",0.017474
"""Telecommunication Services""",0.016404
"""Lumber and Building Materials""",0.013046


In [42]:
# Per merchant description: total amount, number of transactions, the harmonic
# mean of those two figures, the average ticket, and the overall transaction
# share taken from `df_proportion`.
(
    df.group_by("description")
    .agg(
        total_amount=pl.col("amount").sum(),
        transaction_count=pl.len(),
    )
    .with_columns(
        harmonic_mean_metrics=(
            2 / (1 / pl.col("total_amount") + 1 / pl.col("transaction_count"))
        ),
        average_amount_per_transaction=(
            pl.col("total_amount") / pl.col("transaction_count")
        ),
    )
    .join(df_proportion, on='description', how='left')
    .to_pandas()
)

Unnamed: 0,description,total_amount,transaction_count,harmonic_mean_metrics,average_amount_per_transaction,proportion
0,Tax Preparation Services,458153.59,2523,5018.364435,181.590801,0.000283
1,Precious Stones and Metals,191606.12,3525,6922.643328,54.356346,0.000395
2,Pottery and Ceramics,1329395.01,1860,3714.802498,714.728500,0.000209
3,Tolls and Bridge Fees,15981764.25,451814,878784.245647,35.372441,0.050680
4,Family Clothing Stores,2680753.84,48647,95559.897354,55.106252,0.005457
...,...,...,...,...,...,...
103,Money Transfer,35572721.52,394401,780152.314515,90.194298,0.044240
104,Automotive Parts and Accessories Stores,312498.15,802,1599.894008,389.648566,0.000090
105,"Furniture, Home Furnishings, and Equipment Stores",836272.34,2600,5183.883126,321.643208,0.000292
106,Cruise Lines,415538.89,276,551.633606,1505.575688,0.000031
