In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_parquet('../Data/cleaned_training_data.parquet')
input_columns = [
    'log_cc_num', 'amt', 'gender', 'lat', 'long', 'age', 'log_unix_time',
    'merch_lat', 'merch_long', 'category', 'state']
output_columns = ['is_fraud']

In [3]:
from sklearn.model_selection import train_test_split
X_train,X_val,Y_train,Y_val = train_test_split(data[input_columns], data[output_columns],test_size=0.3,random_state=42)

In [4]:
X_train.to_parquet('../encodedData/X_train.parquet')
X_val.to_parquet('../encodedData/X_val.parquet')

Y_train.to_parquet('../encodedData/Y_train.parquet')
Y_val.to_parquet('../encodedData/Y_val.parquet')

In [5]:
from sklearn.preprocessing import OneHotEncoder
columns_to_be_encoded = [ 'category', 'state']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(X_train[columns_to_be_encoded])
encoder.get_feature_names_out()

array(['category_entertainment', 'category_food_dining',
       'category_gas_transport', 'category_grocery_net',
       'category_grocery_pos', 'category_health_fitness', 'category_home',
       'category_kids_pets', 'category_misc_net', 'category_misc_pos',
       'category_personal_care', 'category_shopping_net',
       'category_shopping_pos', 'category_travel', 'state_AK', 'state_AL',
       'state_AR', 'state_AZ', 'state_CA', 'state_CO', 'state_CT',
       'state_DC', 'state_FL', 'state_GA', 'state_HI', 'state_IA',
       'state_ID', 'state_IL', 'state_IN', 'state_KS', 'state_KY',
       'state_LA', 'state_MA', 'state_MD', 'state_ME', 'state_MI',
       'state_MN', 'state_MO', 'state_MS', 'state_MT', 'state_NC',
       'state_ND', 'state_NE', 'state_NH', 'state_NJ', 'state_NM',
       'state_NV', 'state_NY', 'state_OH', 'state_OK', 'state_OR',
       'state_PA', 'state_RI', 'state_SC', 'state_SD', 'state_TN',
       'state_TX', 'state_UT', 'state_VA', 'state_VT', 'state_WA',
    

In [6]:
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
encoded_df.to_parquet('../encodedData/encoaded_features_training_data.parquet')

In [7]:
numeric_cols = ['log_cc_num', 'amt', 'gender', 'lat', 'long', 'age', 'log_unix_time','merch_lat', 'merch_long']


In [8]:
print(encoded_df.shape)
print(X_train[numeric_cols].shape)
print(Y_train.shape)

(105000, 64)
(105000, 9)
(105000, 1)


In [9]:
X_train_numeric = pd.concat([X_train[numeric_cols],encoded_df] , axis=1)

# Then delete the old encoded_df to free up space
import gc
del encoded_df
gc.collect()

0

In [10]:
X_train_numeric.fillna(0).to_parquet('../encodedData/encoaded_training_data_full.parquet')

In [None]:
X_train_numeric.fillna(0)

Unnamed: 0,log_cc_num,amt,gender,lat,long,age,log_unix_time,merch_lat,merch_long,category_entertainment,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
27822,27.065155,9.92,0.0,29.7736,-95.4034,36.0,21.014786,30.630424,-96.368085,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85202,36.059439,326.23,0.0,29.8868,-97.6769,66.0,21.036936,30.493020,-98.418856,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42217,35.956998,16.75,0.0,47.3551,-96.7980,48.0,21.037012,46.747835,-96.868348,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119958,35.937125,28.28,0.0,41.3660,-98.0054,47.0,21.021172,40.498551,-98.316388,,...,,,,,,,,,,
37678,36.332483,48.33,0.0,45.6675,-93.2433,25.0,21.028415,44.794615,-93.086403,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104989,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104990,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104996,,,,,,,,,,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104998,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
import gc
del X_train_numeric
gc.collect()

0

In [13]:
columns_to_be_encoded = [ 'category', 'state']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(X_val[columns_to_be_encoded])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
encoded_df.to_parquet('../encodedData/encoaded_features_validation_data.parquet')
X_val_numeric = X_val[numeric_cols]
X_val_numeric
X_val_numeric = X_val_numeric.join(encoded_df)

# Then delete the old encoded_df to free up space
import gc
del encoded_df
gc.collect()

0

In [14]:
print(X_val_numeric.shape)
X_val_numeric.fillna(0).to_parquet('../encodedData/encoaded_validation_data_full.parquet')

(45000, 73)


In [15]:
import gc
del X_val_numeric
gc.collect()

0

In [16]:
pd.read_parquet('../encodedData/encoaded_features_training_data.parquet')

Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104995,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104997,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104998,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
Y_train

Unnamed: 0,is_fraud
27822,0
85202,1
42217,0
119958,0
37678,0
...,...
119879,0
103694,0
131932,0
146867,0
