In [31]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import pickle
import os

In [3]:
# Relative path of the processed input CSV that is read into `df` below.
data_path: str = "data/processed/data_processed_v0.csv"

In [5]:
# Move the working directory one level up so the relative paths used in this
# notebook (data_path, artifacts/, data/processed/) resolve.
# Guarded on data_path: an unconditional chdir climbs one more level every time
# the cell is re-run, which silently breaks every relative path afterwards.
if not os.path.exists(data_path):
    os.chdir("../")

In [6]:
# Load the processed dataset from data_path into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [7]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,merchant,category,amt,gender,state,zip,city_pop,job,unix_time,is_fraud,...,day,hour,dayofweek,merchant_encoded,category_encoded,gender_encoded,job_encoded,state_encoded,age,distance_km
0,fraud_Kirlin and Sons,personal_care,2.86,M,SC,29209,333497,Mechanical engineer,1371816865,0,...,21,12,6,0.001959,130085,0,8062,0.005655,57,24.613746
1,fraud_Sporer-Keebler,personal_care,29.84,F,UT,84002,302,"Sales professional, IT",1371816873,0,...,21,12,6,0.002299,130085,1,8052,0.003972,35,104.834043
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,NY,11710,34496,"Librarian, public",1371816893,0,...,21,12,6,0.001214,122553,1,8773,0.006113,55,59.204796
3,fraud_Haley Group,misc_pos,60.05,M,FL,32780,54767,Set designer,1371816915,0,...,21,12,6,0.004274,114229,0,2208,0.005496,38,27.615117
4,fraud_Johnston-Casper,travel,3.19,M,MI,49632,1126,Furniture designer,1371816917,0,...,21,12,6,0.001665,57956,0,2934,0.004542,70,104.423175


In [8]:
# Target vector: the is_fraud label column.
y = df.loc[:, 'is_fraud']

In [9]:
# List the column labels of the loaded frame.
df.columns

Index(['merchant', 'category', 'amt', 'gender', 'state', 'zip', 'city_pop',
       'job', 'unix_time', 'is_fraud', 'year', 'month', 'day', 'hour',
       'dayofweek', 'merchant_encoded', 'category_encoded', 'gender_encoded',
       'job_encoded', 'state_encoded', 'age', 'distance_km'],
      dtype='object')

In [10]:
# Feature matrix: every column of df except the is_fraud label.
x = df.drop(labels=['is_fraud'], axis=1)

In [11]:
# SMOTE oversampler with a fixed seed so resampling is reproducible.
# sampling_strategy="auto" is the library default, spelled out for readers.
sm = SMOTE(sampling_strategy="auto", random_state=42)

In [12]:
# Shapes of the feature matrix and label vector before resampling.
(x.shape, y.shape)

((1852394, 21), (1852394,))

In [13]:
# Class balance of the label before resampling.
y.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [15]:
# Show the dtype of every feature column.
x.dtypes

merchant             object
category             object
amt                 float64
gender               object
state                object
zip                   int64
city_pop              int64
job                  object
unix_time             int64
year                  int64
month                 int64
day                   int64
hour                  int64
dayofweek             int64
merchant_encoded    float64
category_encoded      int64
gender_encoded        int64
job_encoded           int64
state_encoded       float64
age                   int64
distance_km         float64
dtype: object

In [21]:
# Build a {merchant: merchant_encoded} lookup from the distinct pairs in df.
merchant_map = (
    df.drop_duplicates(subset=['merchant', 'merchant_encoded'])
    .set_index('merchant')['merchant_encoded']
    .to_dict()
)

In [26]:
# Build a {category: category_encoded} lookup from the distinct pairs in df.
category_map = (
    df.drop_duplicates(subset=['category', 'category_encoded'])
    .set_index('category')['category_encoded']
    .to_dict()
)

In [28]:
# Build a {gender: gender_encoded} lookup from the distinct pairs in df.
gender_map = (
    df.drop_duplicates(subset=['gender', 'gender_encoded'])
    .set_index('gender')['gender_encoded']
    .to_dict()
)

In [29]:
# Build a {job: job_encoded} lookup from the distinct pairs in df.
job_map = (
    df.drop_duplicates(subset=['job', 'job_encoded'])
    .set_index('job')['job_encoded']
    .to_dict()
)

In [30]:
# Build a {state: state_encoded} lookup from the distinct pairs in df.
state_map = (
    df.drop_duplicates(subset=['state', 'state_encoded'])
    .set_index('state')['state_encoded']
    .to_dict()
)

In [32]:
# Persist the merchant -> merchant_encoded lookup to disk as a pickle.
merchant_map_path = "artifacts/maps/merchant_map.pkl"
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(merchant_map_path), exist_ok=True)
with open(merchant_map_path, 'wb') as file:
    pickle.dump(merchant_map, file)

In [33]:
# Persist the category -> category_encoded lookup to disk as a pickle.
category_map_path = "artifacts/maps/category_map.pkl"
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(category_map_path), exist_ok=True)
with open(category_map_path, 'wb') as file:
    pickle.dump(category_map, file)

In [34]:
# Persist the gender -> gender_encoded lookup to disk as a pickle.
gender_map_path = "artifacts/maps/gender_map.pkl"
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(gender_map_path), exist_ok=True)
with open(gender_map_path, 'wb') as file:
    pickle.dump(gender_map, file)

In [35]:
# Persist the job -> job_encoded lookup to disk as a pickle.
job_map_path = "artifacts/maps/job_map.pkl"
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(job_map_path), exist_ok=True)
with open(job_map_path, 'wb') as file:
    pickle.dump(job_map, file)

In [36]:
# Persist the state -> state_encoded lookup to disk as a pickle.
state_map_path = "artifacts/maps/state_map.pkl"
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(state_map_path), exist_ok=True)
with open(state_map_path, 'wb') as file:
    pickle.dump(state_map, file)

In [37]:
# List the feature columns currently in x.
x.columns

Index(['merchant', 'category', 'amt', 'gender', 'state', 'zip', 'city_pop',
       'job', 'unix_time', 'year', 'month', 'day', 'hour', 'dayofweek',
       'merchant_encoded', 'category_encoded', 'gender_encoded', 'job_encoded',
       'state_encoded', 'age', 'distance_km'],
      dtype='object')

In [38]:
# Drop the raw string columns; their *_encoded counterparts stay in x.
# errors="ignore" keeps this cell idempotent: x is rebound to its own result,
# so a plain drop raises KeyError when the cell is run a second time.
x = x.drop(
    columns=['merchant', 'category', 'gender', 'job', 'state'],
    errors="ignore",
)

In [39]:
# Oversample the minority class so both labels end up equally frequent.
# NOTE(review): this resamples the full dataset and no train/test split is
# visible in this notebook; if a split happens downstream on the saved file,
# synthetic rows will leak into evaluation -- confirm.
# NOTE(review): SMOTE interpolates between neighbours, so code-like columns
# (category_encoded, job_encoded, gender_encoded, zip) get interpolated values;
# SMOTENC may be a better fit -- confirm this is intended.
X_res, y_res = sm.fit_resample(X=x, y=y)

In [40]:
# Shapes of the feature matrix and label vector after resampling.
(X_res.shape, y_res.shape)

((3685486, 16), (3685486,))

In [41]:
# Class balance of the label after resampling.
y_res.value_counts()

is_fraud
0    1842743
1    1842743
Name: count, dtype: int64

In [44]:
# Last five rows of the original (pre-resampling) features.
x.tail(n=5)

Unnamed: 0,amt,zip,city_pop,unix_time,year,month,day,hour,dayofweek,merchant_encoded,category_encoded,gender_encoded,job_encoded,state_encoded,age,distance_km
1852389,15.56,84735,258,1371816728,2020,6,21,12,6,0.002935,134118,0,8041,0.003972,64,119.696415
1852390,51.7,21790,100,1371816739,2020,6,21,12,6,0.001592,130729,0,738,0.005436,46,75.202184
1852391,105.93,88325,899,1371816752,2020,6,21,12,6,0.001157,130729,0,12434,0.005165,58,98.987927
1852392,74.9,57756,1126,1371816816,2020,6,21,12,6,0.001491,130729,0,2916,0.005235,45,84.688356
1852393,4.3,59871,218,1371816817,2020,6,21,12,6,0.001946,130729,0,3676,0.004106,30,83.845902


In [45]:
# Last five rows of the resampled features.
X_res.tail(n=5)

Unnamed: 0,amt,zip,city_pop,unix_time,year,month,day,hour,dayofweek,merchant_encoded,category_encoded,gender_encoded,job_encoded,state_encoded,age,distance_km
3685481,9.110401,62096,532,1339353539,2019,6,10,17,0,0.005719,59398,0,5854,0.005764,57,60.796048
3685482,733.194909,24983,2443,1354748413,2019,12,5,22,3,0.011019,145444,0,10211,0.003997,84,98.335683
3685483,301.134297,16028,2489,1326243560,2019,1,11,0,4,0.009745,176191,0,2391,0.005033,78,98.619993
3685484,1129.66422,68723,566,1361659590,2020,2,23,22,6,0.014152,139322,0,4398,0.006275,50,74.144888
3685485,656.202826,58687,389,1363042826,2020,3,11,22,2,0.006265,135407,1,4039,0.005228,54,70.40025


In [46]:
# Start the output frame from an independent copy of the resampled features.
# NOTE(review): this rebinds `df`, so the frame loaded from data_path is no
# longer reachable; earlier cells that read df['merchant'] etc. will fail if
# re-run after this point. A distinct name would avoid that.
df = X_res.copy(deep=True)

In [47]:
# Attach the resampled labels as the is_fraud column (aligned on index).
df['is_fraud']= y_res

In [48]:
# Shape check: resampled features plus the label column.
df.shape

(3685486, 17)

In [49]:
# Write the resampled dataset to CSV without the row index.
df.to_csv(
    path_or_buf="data/processed/data_resampled_v1.csv",
    index=False,
)