In [1]:
import os
import shutil
import polars as pl
from datetime import datetime, timedelta

In [40]:
if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  os.makedirs('./datasets', exist_ok=True)
  drive.mount('/content/drive')

  shutil.copy('/content/drive/MyDrive/DataScience/Analytics/Estudo Fraude/df_train_test.parquet','/content/datasets/')
  shutil.copy('/content/drive/MyDrive/DataScience/Analytics/Estudo Fraude/df_validation.parquet','/content/datasets/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
df = pl.concat([pl.read_parquet('./datasets/df_train_test.parquet'), pl.read_parquet('./datasets/df_validation.parquet')])
df.head()

id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,description,target,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
i64,datetime[ns],i64,i64,f64,str,i64,str,str,f64,str,str,str,str,i64,i64,i64,i64,str,str,f64,f64,f64,f64,f64,i64,i64,str,str,i64,str,i64,str,i64,f64,str,i64,str
7475327,2010-01-01 00:01:00,1556,2972,-77.0,"""Swipe Transaction""",59935,"""Beulah""","""ND""",58523.0,"""5499""",,"""Miscellaneous Food Stores""","""No""",30,67,1989,7,"""Female""","""594 Mountain View Street""",46.8,-100.76,23679.0,48277.0,110153.0,740,4,"""Mastercard""","""Debit (Prepaid)""",5497590243197280,"""07/2022""",306,"""YES""",2,55.0,"""05/2008""",2008,"""No"""
7475328,2010-01-01 00:02:00,561,4575,14.57,"""Swipe Transaction""",67570,"""Bettendorf""","""IA""",52722.0,"""5311""",,"""Department Stores""","""No""",48,67,1971,6,"""Male""","""604 Pine Street""",40.8,-91.12,18076.0,36853.0,112139.0,834,5,"""Mastercard""","""Credit""",5175842699412235,"""12/2024""",438,"""YES""",1,9100.0,"""09/2005""",2015,"""No"""
7475329,2010-01-01 00:02:00,1129,102,80.0,"""Swipe Transaction""",27092,"""Vista""","""CA""",92084.0,"""4829""",,"""Money Transfer""","""No""",49,65,1970,4,"""Male""","""2379 Forest Lane""",33.18,-117.29,16894.0,34449.0,36540.0,686,3,"""Mastercard""","""Debit""",5874992802287595,"""05/2020""",256,"""YES""",1,14802.0,"""01/2006""",2008,"""No"""
7475332,2010-01-01 00:06:00,848,3915,46.41,"""Swipe Transaction""",13051,"""Harwood""","""MD""",20776.0,"""5813""",,"""Drinking Places (Alcoholic Bev…","""No""",51,69,1968,5,"""Male""","""166 River Drive""",38.86,-76.6,33529.0,68362.0,96182.0,711,2,"""Visa""","""Debit""",4354185735186651,"""01/2020""",120,"""YES""",1,19113.0,"""07/2009""",2014,"""No"""
7475333,2010-01-01 00:07:00,1807,165,4.81,"""Swipe Transaction""",20519,"""Bronx""","""NY""",10464.0,"""5942""",,"""Book Stores""","""No""",47,65,1972,12,"""Female""","""14780 Plum Lane""",40.84,-73.87,25537.0,52065.0,98613.0,828,5,"""Mastercard""","""Debit (Prepaid)""",5207231566469664,"""03/2014""",198,"""YES""",1,89.0,"""01/2008""",2015,"""No"""


In [42]:
def floor_to_15min(dt_value: datetime) -> datetime|None:
    if dt_value is None:
        return None

    seconds_since_midnight = (dt_value - dt_value.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()
    interval_seconds = 15 * 60
    floored_seconds = (seconds_since_midnight // interval_seconds) * interval_seconds
    return dt_value.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(seconds=floored_seconds)

In [43]:
df = df.with_columns(
    pl.col("date").map_elements(floor_to_15min, return_dtype=pl.Datetime).alias("date_window")
)

In [44]:
def generate_columns_statistics(column_name: str) -> list:
    return [
        pl.col(column_name).mean().alias(f'mean_{column_name}'),
        pl.col(column_name).median().alias(f'median_{column_name}'),
        pl.col(column_name).std().alias(f'std_{column_name}'),
        pl.col(column_name).skew().alias(f'skew_{column_name}'),
        pl.col(column_name).kurtosis().alias(f'kurtosis_{column_name}')
    ]

In [45]:
df = df.with_columns(
    pl.when(pl.col('target') == 'Yes').then(1).otherwise(0).alias('target')
)

In [46]:
aggregate = [pl.col('amount').count().alias('transactions')]
for column in ['amount', 'current_age', 'per_capita_income', 'yearly_income', 'total_debt', 'credit_score', 'num_credit_cards', 'credit_limit']:
  aggregate.extend(generate_columns_statistics(column))
aggregate.append(pl.col('target').sum().alias('frauds'))

In [47]:
df_model = df.group_by('date_window').agg(
    *aggregate
).sort('date_window')
df_model.head()

date_window,transactions,mean_amount,median_amount,std_amount,skew_amount,kurtosis_amount,mean_current_age,median_current_age,std_current_age,skew_current_age,kurtosis_current_age,mean_per_capita_income,median_per_capita_income,std_per_capita_income,skew_per_capita_income,kurtosis_per_capita_income,mean_yearly_income,median_yearly_income,std_yearly_income,skew_yearly_income,kurtosis_yearly_income,mean_total_debt,median_total_debt,std_total_debt,skew_total_debt,kurtosis_total_debt,mean_credit_score,median_credit_score,std_credit_score,skew_credit_score,kurtosis_credit_score,mean_num_credit_cards,median_num_credit_cards,std_num_credit_cards,skew_num_credit_cards,kurtosis_num_credit_cards,mean_credit_limit,median_credit_limit,std_credit_limit,skew_credit_limit,kurtosis_credit_limit,frauds
datetime[μs],u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2010-01-01 00:00:00,6,15.875,20.515,52.737742,-0.760483,-0.139116,46.833333,48.5,8.841191,-1.241314,0.444023,21897.166667,20877.5,7198.264219,0.522412,-0.861983,44644.5,42565.0,14677.331144,0.522292,-0.861697,93656.666667,103463.0,28711.635012,-1.594374,0.842529,763.5,761.0,61.29845,0.001812,-1.536333,4.0,4.5,1.264911,-0.649519,-1.125,7200.833333,4594.5,8439.396078,0.381076,-1.520432,0
2010-01-01 00:15:00,4,22.2625,21.57,22.245049,0.012927,-1.980469,49.5,50.5,16.663333,-0.134765,-1.560644,24534.0,24325.0,1301.539345,0.354425,-1.461239,45034.5,47909.5,9731.403205,-0.902371,-0.811235,60605.5,62744.5,59834.776479,-0.01997,-1.967191,724.5,732.5,24.310492,-0.830693,-0.974824,4.25,4.5,0.957427,-0.493382,-1.371901,12608.5,12360.5,13718.592311,0.010111,-1.972188,0
2010-01-01 00:30:00,10,34.644,37.73,45.188942,-0.779579,0.46044,48.6,48.0,4.993329,-0.461034,0.602038,21695.6,19148.0,6709.572916,0.674351,-0.812142,44235.6,39041.5,13679.761322,0.674422,-0.812128,56477.3,56937.0,48226.48252,0.235514,-1.124026,700.3,720.5,63.4421,-1.464207,2.09804,3.8,4.0,1.316561,-0.541972,0.607495,19807.9,19554.5,6338.6096,0.114952,-1.629159,0
2010-01-01 00:45:00,6,15.388333,11.185,14.262798,0.910151,-0.471315,70.5,68.5,18.251027,0.262932,-1.015413,19808.166667,18760.0,3592.733245,0.927645,-0.439468,30264.5,34182.5,11372.35866,-0.676415,-0.682009,40391.666667,17407.0,51012.043302,0.59751,-1.399093,772.166667,774.5,41.421814,-0.0949,-1.491414,5.333333,5.5,1.632993,-0.279508,-1.365,17533.666667,22097.0,11039.367893,-0.713338,-1.100947,0
2010-01-01 01:00:00,6,101.018333,82.545,97.728056,0.816882,-0.48396,54.5,52.5,9.648834,0.573632,-1.012784,26342.666667,23397.5,15248.228011,1.251008,0.412107,51599.833333,44073.0,31446.148479,1.418138,0.648661,67331.166667,55213.0,35267.904737,0.400987,-1.525858,730.333333,728.0,83.946809,0.055991,-1.513659,3.833333,4.5,1.47196,-0.519038,-1.56071,23233.333333,15679.5,20657.262681,1.339601,0.337726,0


In [48]:
df_model['frauds'].value_counts().sort('frauds')

frauds,count
i32,u32
0,333185
1,9012
2,1713
3,234
4,43
5,4


In [49]:
df_model.shape

(344191, 43)

In [50]:
unique_use_chip = df.select('use_chip').unique().to_numpy().squeeze()

In [64]:
def list_structs_to_dict(list_of_structs):
    if list_of_structs is None: # Trata casos onde a lista inteira é null
        return None

    items = {}
    for item in list_of_structs:
      key = item['use_chip']
      value = round(item['proportion'], 2)
      items[key] = value
    return items

In [65]:
df_use_chip = df.group_by('date_window').agg(pl.col('use_chip').value_counts(normalize=True)).sort('date_window')
df_use_chip.head()

date_window,use_chip
datetime[μs],list[struct[2]]
2010-01-01 00:00:00,"[{""Swipe Transaction"",0.833333}, {""Online Transaction"",0.166667}]"
2010-01-01 00:15:00,"[{""Swipe Transaction"",1.0}]"
2010-01-01 00:30:00,"[{""Swipe Transaction"",0.8}, {""Online Transaction"",0.2}]"
2010-01-01 00:45:00,"[{""Swipe Transaction"",0.833333}, {""Online Transaction"",0.166667}]"
2010-01-01 01:00:00,"[{""Swipe Transaction"",0.5}, {""Online Transaction"",0.5}]"


In [67]:
 df_use_chip.with_columns(
  pl.col('use_chip').map_elements(
      list_structs_to_dict,
      return_dtype=pl.Object
).alias('use_chip'))

date_window,use_chip
datetime[μs],object
2010-01-01 00:00:00,"{'Swipe Transaction': 0.83, 'Online Transaction': 0.17}"
2010-01-01 00:15:00,{'Swipe Transaction': 1.0}
2010-01-01 00:30:00,"{'Swipe Transaction': 0.8, 'Online Transaction': 0.2}"
2010-01-01 00:45:00,"{'Swipe Transaction': 0.83, 'Online Transaction': 0.17}"
2010-01-01 01:00:00,"{'Swipe Transaction': 0.5, 'Online Transaction': 0.5}"
…,…
2019-10-31 22:45:00,"{'Online Transaction': 0.06, 'Chip Transaction': 0.88, 'Swipe Transaction': 0.06}"
2019-10-31 23:00:00,"{'Swipe Transaction': 0.33, 'Chip Transaction': 0.67}"
2019-10-31 23:15:00,"{'Online Transaction': 0.4, 'Chip Transaction': 0.6}"
2019-10-31 23:30:00,"{'Online Transaction': 0.14, 'Chip Transaction': 0.43, 'Swipe Transaction': 0.43}"


In [15]:
items = [{"Swipe Transaction",0.833333}, {"Online Transaction",0.166667}]

In [16]:
list_structs_to_dict(items)

{'Swipe Transaction': 0.833333, 'Online Transaction': 0.166667}