In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [19]:
# Load the cleaned training table from parquet.
data = pd.read_parquet('../Data/cleaned_training_data.parquet')

# Feature columns used downstream. 'category' and 'state' are the two
# categorical fields that get one-hot encoded later; the rest are used as-is.
input_columns = [
    'log_cc_num',
    'amt',
    'gender',
    'lat',
    'long',
    'age',
    'log_unix_time',
    'merch_lat',
    'merch_long',
    'category',
    'state',
]

# Kept as a one-element list so that data[output_columns] is a
# single-column DataFrame rather than a Series.
output_columns = ['is_fraud']

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. The fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    data[input_columns],
    data[output_columns],
    test_size=0.3,
    random_state=42,
)

In [21]:
# Persist the four splits as produced by train_test_split (before encoding).
# Features are written first, then targets, matching the original order.
for _frame, _path in (
    (X_train, '../encodedData/X_train.parquet'),
    (X_val, '../encodedData/X_val.parquet'),
    (Y_train, '../encodedData/Y_train.parquet'),
    (Y_val, '../encodedData/Y_val.parquet'),
):
    _frame.to_parquet(_path)

In [22]:
from sklearn.preprocessing import OneHotEncoder

# Categorical features that are expanded into one-hot indicator columns.
columns_to_be_encoded = ['category', 'state']

# sparse_output=False returns a dense ndarray that can be wrapped in a
# DataFrame directly; handle_unknown='ignore' avoids raising on categories
# that were not present when the encoder was fitted.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the category vocabulary from the training split and encode it.
encoded_data = encoder.fit_transform(X_train[columns_to_be_encoded])

# Cell output: the generated one-hot column names.
encoder.get_feature_names_out()

array(['category_entertainment', 'category_food_dining',
       'category_gas_transport', 'category_grocery_net',
       'category_grocery_pos', 'category_health_fitness', 'category_home',
       'category_kids_pets', 'category_misc_net', 'category_misc_pos',
       'category_personal_care', 'category_shopping_net',
       'category_shopping_pos', 'category_travel', 'state_AK', 'state_AL',
       'state_AR', 'state_AZ', 'state_CA', 'state_CO', 'state_CT',
       'state_DC', 'state_FL', 'state_GA', 'state_HI', 'state_IA',
       'state_ID', 'state_IL', 'state_IN', 'state_KS', 'state_KY',
       'state_LA', 'state_MA', 'state_MD', 'state_ME', 'state_MI',
       'state_MN', 'state_MO', 'state_MS', 'state_MT', 'state_NC',
       'state_ND', 'state_NE', 'state_NH', 'state_NJ', 'state_NM',
       'state_NV', 'state_NY', 'state_OH', 'state_OK', 'state_OR',
       'state_PA', 'state_RI', 'state_SC', 'state_SD', 'state_TN',
       'state_TX', 'state_UT', 'state_VA', 'state_VT', 'state_WA',
       'state_WI', 'state_WV', 'state_WY'], dtype=object)

In [23]:
# Wrap the dense one-hot matrix in a DataFrame labelled with the encoder's
# feature names. No index is passed, so the rows are numbered 0..n-1 rather
# than carrying X_train's row labels.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
)

# Persist the one-hot features on their own.
encoded_df.to_parquet('../encodedData/encoaded_features_training_data.parquet')

In [24]:
# Feature columns that are passed through without one-hot encoding.
numeric_cols = [
    'log_cc_num',
    'amt',
    'gender',
    'lat',
    'long',
    'age',
    'log_unix_time',
    'merch_lat',
    'merch_long',
]


In [25]:
# Sanity check before combining: the one-hot block, the pass-through
# features and the target should all report the same number of rows.
print(
    encoded_df.shape,
    X_train[numeric_cols].shape,
    Y_train.shape,
    sep='\n',
)

(105000, 64)
(105000, 9)
(105000, 1)


In [26]:
# Renumber both frames 0..n-1 so the column-wise concat below pairs rows by
# position instead of by X_train's shuffled row labels.
encoded_df_reset = encoded_df.reset_index(drop=True)
numeric_df = X_train[numeric_cols].reset_index(drop=True)

# Pass-through features first, one-hot columns after them.
X_train_numeric = pd.concat(objs=[numeric_df, encoded_df_reset], axis='columns')

# Expect the same row count as the inputs and the sum of their column counts.
print(X_train_numeric.shape)

(105000, 73)


In [27]:
# Write the combined training matrix to parquet with missing values set to 0.
# fillna returns a copy, so X_train_numeric itself is left unchanged.
(
    X_train_numeric
    .fillna(0)
    .to_parquet('../encodedData/encoaded_training_data_full.parquet')
)

In [28]:
# Display the zero-filled training matrix (the result is shown, not stored).
X_train_numeric.fillna(value=0)

Unnamed: 0,log_cc_num,amt,gender,lat,long,age,log_unix_time,merch_lat,merch_long,category_entertainment,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,27.065155,9.92,0,29.7736,-95.4034,36,21.014786,30.630424,-96.368085,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36.059439,326.23,0,29.8868,-97.6769,66,21.036936,30.493020,-98.418856,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,35.956998,16.75,0,47.3551,-96.7980,48,21.037012,46.747835,-96.868348,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.937125,28.28,0,41.3660,-98.0054,47,21.021172,40.498551,-98.316388,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36.332483,48.33,0,45.6675,-93.2433,25,21.028415,44.794615,-93.086403,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104995,42.996395,141.88,1,48.8328,-108.3961,57,21.018389,47.901644,-108.438991,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104996,36.411519,140.48,0,32.2768,-95.3031,36,21.019877,33.010131,-94.686587,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104997,36.056399,90.86,0,34.9906,-81.8327,32,21.026569,34.599351,-82.098020,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104998,42.887785,65.92,1,20.0827,-155.4880,54,21.016394,20.624854,-155.239569,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Drop the name bound to the combined training matrix (it was written to
# parquet in an earlier cell) and request a garbage-collection pass.
# gc.collect() is the last expression, so its return value is the cell output.
import gc
del X_train_numeric
gc.collect()

0

In [30]:
# Encode the validation split with the encoder that was already fitted on
# X_train. Fitting a fresh encoder on X_val would derive the one-hot columns
# from whatever categories happen to appear in the validation rows, so the
# column set/order is not guaranteed to match the training matrix.
# handle_unknown='ignore' on the fitted encoder covers unseen categories.
encoded_data = encoder.transform(X_val[columns_to_be_encoded])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())
encoded_df.to_parquet('../encodedData/encoaded_features_validation_data.parquet')

# X_val still carries the shuffled row labels from train_test_split, while
# encoded_df is numbered 0..n-1. DataFrame.join aligns on index labels, so a
# plain join would attach one-hot rows to the wrong samples and leave NaNs for
# every label >= len(encoded_df). Combine by position instead, then restore
# X_val's labels so the result keeps the index callers saw before.
X_val_numeric = pd.concat(
    [X_val[numeric_cols].reset_index(drop=True), encoded_df],
    axis=1,
).set_axis(X_val.index, axis=0)

# Guard against any row/column mismatch introduced while combining.
assert X_val_numeric.shape == (
    len(X_val),
    len(numeric_cols) + encoded_df.shape[1],
), "validation matrix shape does not match its inputs"

# Then delete the old encoded_df to free up space
import gc
del encoded_df
gc.collect()

0

In [31]:
# Report the size of the combined validation matrix, then write it to parquet
# with missing values set to 0 (fillna returns a copy; the frame is unchanged).
print(X_val_numeric.shape)
(
    X_val_numeric
    .fillna(0)
    .to_parquet('../encodedData/encoaded_validation_data_full.parquet')
)

(45000, 73)


In [32]:
# Drop the name bound to the combined validation matrix (it was written to
# parquet in the previous cell) and request a garbage-collection pass.
# gc.collect() is the last expression, so its return value is the cell output.
import gc
del X_val_numeric
gc.collect()

0

In [33]:
# Read back the one-hot training features written earlier and display them.
pd.read_parquet(path='../encodedData/encoaded_features_training_data.parquet')

Unnamed: 0,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104995,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104997,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104998,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Display the training target. Its index still holds the shuffled row labels
# produced by train_test_split, whereas the combined training feature frame
# was renumbered 0..n-1 -- pair the two by position, not by label.
Y_train

Unnamed: 0,is_fraud
27822,0
85202,1
42217,0
119958,0
37678,0
...,...
119879,0
103694,0
131932,0
146867,0
