# Import libraries and datasets

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

from utils.analysis_functions import find_differences, get_columns_with_missing_values
from utils.data_transformation_functions import transform_transaction_datetime, transform_transactionAmt, replace_card, group_email_domains
from utils.data_encoding_functions import one_hot_encode_columns, frequency_encoding
from utils.data_inputation_functions import DataImputer

In [2]:
train_identity_df = pd.read_csv('../data/train_identity.csv')
train_transaction_df = pd.read_csv('../data/train_transaction.csv')
test_identity_df = pd.read_csv('../data/test_identity.csv')
test_transaction_df = pd.read_csv('../data/test_transaction.csv')

In [3]:
train_data_df = train_transaction_df.merge(train_identity_df, how='left', on='TransactionID')
test_data_df = test_transaction_df.merge(test_identity_df, how='left', on='TransactionID')

del train_identity_df, train_transaction_df, test_identity_df, test_transaction_df

# Handle differences in column names

In [4]:
test_data_df.columns = test_data_df.columns.str.replace('^id-', 'id_', regex=True)
find_differences(train_data_df, test_data_df)

Different values between the lists: {'isFraud'}


## Handle columns with many missing values

In [5]:
columns_with_missing_data = get_columns_with_missing_values(train_data_df)

  return reduction(axis=axis, out=out, **passkwargs)


+---------------+----------------------+
| Column        |   Missing Percentage |
|---------------+----------------------|
| id_24         |             0.991962 |
| id_25         |             0.99131  |
| id_07         |             0.991271 |
| id_08         |             0.991271 |
| id_21         |             0.991264 |
| id_26         |             0.991257 |
| id_27         |             0.991247 |
| id_23         |             0.991247 |
| id_22         |             0.991247 |
| dist2         |             0.936284 |
| D7            |             0.934099 |
| id_18         |             0.923607 |
| D13           |             0.895093 |
| D14           |             0.894695 |
| D12           |             0.89041  |
| id_03         |             0.887689 |
| id_04         |             0.887689 |
| D6            |             0.876068 |
| id_33         |             0.875895 |
| id_10         |             0.873123 |
| id_09         |             0.873123 |
| D9            

In [6]:
train_data_df = train_data_df.drop(columns_with_missing_data, axis=1)
test_data_df = test_data_df.drop(columns_with_missing_data, axis=1)

In [7]:
print("Train data shape:", train_data_df.shape)
print("Test data shape:", test_data_df.shape)

Train data shape: (590540, 202)
Test data shape: (506691, 201)


## Handle transactionDT columns

As we explored, we can create new features such as days of the week and hours of the day `TransactionDT` column. 

In [8]:
train_data_df = transform_transaction_datetime(train_data_df)
test_data_df = transform_transaction_datetime(test_data_df)

In [9]:
train_data_df

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V314,V315,V316,V317,V318,V319,V320,V321,Transaction_week_day,Transaction_hour
0,2987000,0,68.50,W,13926,,150.0,discover,142.0,credit,...,0.000000,0.000000,0.0,117.0,0.0,0.000000,0.000000,0.000000,5,0
1,2987001,0,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0
2,2987002,0,59.00,W,4663,490.0,150.0,visa,166.0,debit,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0
3,2987003,0,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,...,0.000000,0.000000,50.0,1404.0,790.0,0.000000,0.000000,0.000000,5,0
4,2987004,0,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,49.00,W,6550,,150.0,visa,226.0,debit,...,47.950001,47.950001,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23
590536,3577536,0,39.50,W,10444,225.0,150.0,mastercard,224.0,debit,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23
590537,3577537,0,30.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23
590538,3577538,0,117.00,W,7826,481.0,150.0,mastercard,224.0,debit,...,669.500000,317.500000,0.0,2234.0,0.0,0.000000,0.000000,0.000000,4,23


As we have seen in the exploration notebook, adding days of the week and hours of the day as features can help improve the model's performance.

## Handle transactionAmt columns

In [10]:
train_data_df = transform_transactionAmt(train_data_df)
test_data_df = transform_transactionAmt(test_data_df, isTest=True)

In [11]:
train_data_df

Unnamed: 0,TransactionID,isFraud,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V315,V316,V317,V318,V319,V320,V321,Transaction_week_day,Transaction_hour,LogTransactionAmt
0,2987000,0,W,13926,,150.0,discover,142.0,credit,315.0,...,0.000000,0.0,117.0,0.0,0.000000,0.000000,0.000000,5,0,4.241327
1,2987001,0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0,3.401197
2,2987002,0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0,4.094345
3,2987003,0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,0.000000,50.0,1404.0,790.0,0.000000,0.000000,0.000000,5,0,3.931826
4,2987004,0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,5,0,3.931826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,W,6550,,150.0,visa,226.0,debit,272.0,...,47.950001,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23,3.912023
590536,3577536,0,W,10444,225.0,150.0,mastercard,224.0,debit,204.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23,3.701302
590537,3577537,0,W,12037,595.0,150.0,mastercard,224.0,debit,231.0,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,4,23,3.464172
590538,3577538,0,W,7826,481.0,150.0,mastercard,224.0,debit,387.0,...,317.500000,0.0,2234.0,0.0,0.000000,0.000000,0.000000,4,23,4.770685


Employing the logaritmic transformation on the `TransactionAmt` column can help improve the model's performance, since in this way we can reduce the skewness of the data and approach a normal distribution.

## Handle ProductCD columns

In [12]:
train_data_df = one_hot_encode_columns(train_data_df, ['ProductCD'])
test_data_df = one_hot_encode_columns(test_data_df, ['ProductCD'])

In [13]:
train_data_df

Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card4,card5,card6,addr1,addr2,...,V319,V320,V321,Transaction_week_day,Transaction_hour,LogTransactionAmt,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W
0,2987000,0,13926,,150.0,discover,142.0,credit,315.0,87.0,...,0.000000,0.000000,0.000000,5,0,4.241327,0,0,0,1
1,2987001,0,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,...,0.000000,0.000000,0.000000,5,0,3.401197,0,0,0,1
2,2987002,0,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,...,0.000000,0.000000,0.000000,5,0,4.094345,0,0,0,1
3,2987003,0,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,...,0.000000,0.000000,0.000000,5,0,3.931826,0,0,0,1
4,2987004,0,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,...,0.000000,0.000000,0.000000,5,0,3.931826,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,6550,,150.0,visa,226.0,debit,272.0,87.0,...,0.000000,0.000000,0.000000,4,23,3.912023,0,0,0,1
590536,3577536,0,10444,225.0,150.0,mastercard,224.0,debit,204.0,87.0,...,0.000000,0.000000,0.000000,4,23,3.701302,0,0,0,1
590537,3577537,0,12037,595.0,150.0,mastercard,224.0,debit,231.0,87.0,...,0.000000,0.000000,0.000000,4,23,3.464172,0,0,0,1
590538,3577538,0,7826,481.0,150.0,mastercard,224.0,debit,387.0,87.0,...,0.000000,0.000000,0.000000,4,23,4.770685,0,0,0,1


In [14]:
train_data_df[['ProductCD_H', 'ProductCD_R', 'ProductCD_S', 'ProductCD_W']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   ProductCD_H  590538 non-null  int64
 1   ProductCD_R  590538 non-null  int64
 2   ProductCD_S  590538 non-null  int64
 3   ProductCD_W  590538 non-null  int64
dtypes: int64(4)
memory usage: 22.5 MB


## Handle card columns

In [15]:
card_columns = [col for col in train_data_df.columns if col.startswith('card')]

numerical_card_columns = [col for col in card_columns if pd.api.types.is_numeric_dtype(train_data_df[col])]
categorical_card_columns = [col for col in card_columns if not pd.api.types.is_numeric_dtype(train_data_df[col])]

In [16]:
data_imputer = DataImputer(strategy='mean')

train_data_df = data_imputer.impute_train_set_values(train_data_df, numerical_card_columns)
test_data_df = data_imputer.impute_test_set_values(test_data_df, numerical_card_columns)

Train missing values before imputation: 

card1       0
card2    8933
card3    1565
card5    4259
dtype: int64

Train missing values after imputation: 

card1    0
card2    0
card3    0
card5    0
dtype: int64

Test missing values before imputation: 

card1       0
card2    8654
card3    3002
card5    4547
dtype: int64

Test missing values after imputation: 

card1    0
card2    0
card3    0
card5    0
dtype: int64



In [17]:
data_imputer = DataImputer(strategy='most_frequent')

train_data_df = data_imputer.impute_train_set_values(train_data_df, categorical_card_columns)
test_data_df = data_imputer.impute_test_set_values(test_data_df, categorical_card_columns)

Train missing values before imputation: 

card4    1577
card6    1571
dtype: int64

Train missing values after imputation: 

card4    0
card6    0
dtype: int64

Test missing values before imputation: 

card4    3086
card6    3007
dtype: int64

Test missing values after imputation: 

card4    0
card6    0
dtype: int64



In [18]:
train_data_df['card6'] = train_data_df['card6'].apply(replace_card)
test_data_df['card6'] = test_data_df['card6'].apply(replace_card)

In [19]:
train_data_df = one_hot_encode_columns(train_data_df, ['card4', 'card6'])
test_data_df = one_hot_encode_columns(test_data_df, ['card4', 'card6'])

In [20]:
train_data_df

Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,P_emaildomain,C1,...,Transaction_hour,LogTransactionAmt,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit
0,2987000,0,13926.0,362.554968,150.0,142.0,315.0,87.0,,1.0,...,0,4.241327,0,0,0,1,1,0,0,0
1,2987001,0,2755.0,404.000000,150.0,102.0,325.0,87.0,gmail.com,1.0,...,0,3.401197,0,0,0,1,0,1,0,0
2,2987002,0,4663.0,490.000000,150.0,166.0,330.0,87.0,outlook.com,1.0,...,0,4.094345,0,0,0,1,0,0,1,1
3,2987003,0,18132.0,567.000000,150.0,117.0,476.0,87.0,yahoo.com,2.0,...,0,3.931826,0,0,0,1,0,1,0,1
4,2987004,0,4497.0,514.000000,150.0,102.0,420.0,87.0,gmail.com,1.0,...,0,3.931826,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,6550.0,362.554968,150.0,226.0,272.0,87.0,,2.0,...,23,3.912023,0,0,0,1,0,0,1,1
590536,3577536,0,10444.0,225.000000,150.0,224.0,204.0,87.0,gmail.com,1.0,...,23,3.701302,0,0,0,1,0,1,0,1
590537,3577537,0,12037.0,595.000000,150.0,224.0,231.0,87.0,gmail.com,1.0,...,23,3.464172,0,0,0,1,0,1,0,1
590538,3577538,0,7826.0,481.000000,150.0,224.0,387.0,87.0,aol.com,1.0,...,23,4.770685,0,0,0,1,0,1,0,1


In [21]:
train_data_df[['card1', 'card2', 'card3', 'card5', 'card4_discover',	'card4_mastercard', 'card4_visa', 'card6_debit']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   card1             590538 non-null  float64
 1   card2             590538 non-null  float64
 2   card3             590538 non-null  float64
 3   card5             590538 non-null  float64
 4   card4_discover    590538 non-null  int64  
 5   card4_mastercard  590538 non-null  int64  
 6   card4_visa        590538 non-null  int64  
 7   card6_debit       590538 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 40.5 MB


## Handle addr columns

In [22]:
data_imputer = DataImputer(strategy='mean')

train_data_df = data_imputer.impute_train_set_values(train_data_df, ['addr1', 'addr2'])
test_data_df = data_imputer.impute_test_set_values(test_data_df, ['addr1', 'addr2'])

Train missing values before imputation: 

addr1    65706
addr2    65706
dtype: int64

Train missing values after imputation: 

addr1    0
addr2    0
dtype: int64

Test missing values before imputation: 

addr1    65609
addr2    65609
dtype: int64

Test missing values after imputation: 

addr1    0
addr2    0
dtype: int64



In [23]:
train_data_df

Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,P_emaildomain,C1,...,Transaction_hour,LogTransactionAmt,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit
0,2987000,0,13926.0,362.554968,150.0,142.0,315.0,87.0,,1.0,...,0,4.241327,0,0,0,1,1,0,0,0
1,2987001,0,2755.0,404.000000,150.0,102.0,325.0,87.0,gmail.com,1.0,...,0,3.401197,0,0,0,1,0,1,0,0
2,2987002,0,4663.0,490.000000,150.0,166.0,330.0,87.0,outlook.com,1.0,...,0,4.094345,0,0,0,1,0,0,1,1
3,2987003,0,18132.0,567.000000,150.0,117.0,476.0,87.0,yahoo.com,2.0,...,0,3.931826,0,0,0,1,0,1,0,1
4,2987004,0,4497.0,514.000000,150.0,102.0,420.0,87.0,gmail.com,1.0,...,0,3.931826,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,6550.0,362.554968,150.0,226.0,272.0,87.0,,2.0,...,23,3.912023,0,0,0,1,0,0,1,1
590536,3577536,0,10444.0,225.000000,150.0,224.0,204.0,87.0,gmail.com,1.0,...,23,3.701302,0,0,0,1,0,1,0,1
590537,3577537,0,12037.0,595.000000,150.0,224.0,231.0,87.0,gmail.com,1.0,...,23,3.464172,0,0,0,1,0,1,0,1
590538,3577538,0,7826.0,481.000000,150.0,224.0,387.0,87.0,aol.com,1.0,...,23,4.770685,0,0,0,1,0,1,0,1


In [24]:
train_data_df[['addr1', 'addr2']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   addr1   590538 non-null  float64
 1   addr2   590538 non-null  float64
dtypes: float64(2)
memory usage: 13.5 MB


## Handle P_emaildomain columns

In [25]:
data_imputer = DataImputer(strategy='most_frequent')

train_data_df = data_imputer.impute_train_set_values(train_data_df, ['P_emaildomain'])
test_data_df = data_imputer.impute_test_set_values(test_data_df, ['P_emaildomain'])

Train missing values before imputation: 

P_emaildomain    94456
dtype: int64

Train missing values after imputation: 

P_emaildomain    0
dtype: int64

Test missing values before imputation: 

P_emaildomain    69192
dtype: int64

Test missing values after imputation: 

P_emaildomain    0
dtype: int64



In [26]:
train_data_df = group_email_domains(train_data_df, 'P_emaildomain')
test_data_df = group_email_domains(test_data_df, 'P_emaildomain')

In [27]:
train_data_df['P_emaildomain'].value_counts()

P_emaildomain
Google Mail       323307
Yahoo Mail        105303
Microsoft Mail     59477
anonymous.com      36998
aol.com            28289
Apple Mail          8225
comcast.net         7888
Others              7030
att.net             4033
sbcglobal.net       2970
verizon.net         2705
bellsouth.net       1909
cox.net             1393
optonline.net       1011
Name: count, dtype: int64

In [28]:
train_data_df = frequency_encoding(train_data_df, 'P_emaildomain')
test_data_df = frequency_encoding(test_data_df, 'P_emaildomain')

In [29]:
train_data_df

Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,C1,C2,...,LogTransactionAmt,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit,P_emaildomain_freq
0,2987000,0,13926.0,362.554968,150.0,142.0,315.0,87.0,1.0,1.0,...,4.241327,0,0,0,1,1,0,0,0,323307
1,2987001,0,2755.0,404.000000,150.0,102.0,325.0,87.0,1.0,1.0,...,3.401197,0,0,0,1,0,1,0,0,323307
2,2987002,0,4663.0,490.000000,150.0,166.0,330.0,87.0,1.0,1.0,...,4.094345,0,0,0,1,0,0,1,1,59477
3,2987003,0,18132.0,567.000000,150.0,117.0,476.0,87.0,2.0,5.0,...,3.931826,0,0,0,1,0,1,0,1,105303
4,2987004,0,4497.0,514.000000,150.0,102.0,420.0,87.0,1.0,1.0,...,3.931826,1,0,0,0,0,1,0,0,323307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,6550.0,362.554968,150.0,226.0,272.0,87.0,2.0,1.0,...,3.912023,0,0,0,1,0,0,1,1,323307
590536,3577536,0,10444.0,225.000000,150.0,224.0,204.0,87.0,1.0,1.0,...,3.701302,0,0,0,1,0,1,0,1,323307
590537,3577537,0,12037.0,595.000000,150.0,224.0,231.0,87.0,1.0,1.0,...,3.464172,0,0,0,1,0,1,0,1,323307
590538,3577538,0,7826.0,481.000000,150.0,224.0,387.0,87.0,1.0,1.0,...,4.770685,0,0,0,1,0,1,0,1,28289


In [30]:
train_data_df['P_emaildomain_freq'].info()

<class 'pandas.core.series.Series'>
Index: 590538 entries, 0 to 590539
Series name: P_emaildomain_freq
Non-Null Count   Dtype
--------------   -----
590538 non-null  int64
dtypes: int64(1)
memory usage: 9.0 MB


## Handle C1-C14 columns 

In [31]:
c_cols = ['C' + str(i) for i in range(1, 15)]

train_data_df[c_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Data columns (total 14 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   C1      590538 non-null  float64
 1   C2      590538 non-null  float64
 2   C3      590538 non-null  float64
 3   C4      590538 non-null  float64
 4   C5      590538 non-null  float64
 5   C6      590538 non-null  float64
 6   C7      590538 non-null  float64
 7   C8      590538 non-null  float64
 8   C9      590538 non-null  float64
 9   C10     590538 non-null  float64
 10  C11     590538 non-null  float64
 11  C12     590538 non-null  float64
 12  C13     590538 non-null  float64
 13  C14     590538 non-null  float64
dtypes: float64(14)
memory usage: 67.6 MB


## Handle D1-D15 columns

In [32]:
d_cols = ['D1', 'D4', 'D10', 'D15']

In [33]:
data_imputer = DataImputer(strategy='mean')

train_data_df = data_imputer.impute_train_set_values(train_data_df, d_cols)
test_data_df = data_imputer.impute_test_set_values(test_data_df, d_cols)

Train missing values before imputation: 

D1       1269
D4     168922
D10     76022
D15     89113
dtype: int64

Train missing values after imputation: 

D1     0
D4     0
D10    0
D15    0
dtype: int64

Test missing values before imputation: 

D1      6031
D4     76851
D10    12545
D15    12069
dtype: int64

Test missing values after imputation: 

D1     0
D4     0
D10    0
D15    0
dtype: int64



In [34]:
train_data_df[d_cols]

Unnamed: 0,D1,D4,D10,D15
0,14.0,140.003105,13.000000,0.000000
1,0.0,0.000000,0.000000,0.000000
2,0.0,0.000000,0.000000,315.000000
3,112.0,94.000000,84.000000,111.000000
4,0.0,140.003105,123.982619,163.745232
...,...,...,...,...
590535,29.0,140.003105,56.000000,56.000000
590536,0.0,0.000000,0.000000,0.000000
590537,0.0,0.000000,0.000000,0.000000
590538,22.0,22.000000,22.000000,22.000000


In [35]:
train_data_df[d_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   D1      590538 non-null  float64
 1   D4      590538 non-null  float64
 2   D10     590538 non-null  float64
 3   D15     590538 non-null  float64
dtypes: float64(4)
memory usage: 22.5 MB


## Handle M1-M9 columns

In [36]:
m_cols = ['M6']

In [37]:
data_imputer = DataImputer(strategy='most_frequent')

train_data_df = data_imputer.impute_train_set_values(train_data_df, m_cols)
test_data_df = data_imputer.impute_test_set_values(test_data_df, m_cols)

Train missing values before imputation: 

M6    169358
dtype: int64

Train missing values after imputation: 

M6    0
dtype: int64

Test missing values before imputation: 

M6    158939
dtype: int64

Test missing values after imputation: 

M6    0
dtype: int64



In [38]:
train_data_df = one_hot_encode_columns(train_data_df, ['M6'])
test_data_df = one_hot_encode_columns(test_data_df, ['M6'])

In [39]:
train_data_df

Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,C1,C2,...,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit,P_emaildomain_freq,M6_T
0,2987000,0,13926.0,362.554968,150.0,142.0,315.0,87.0,1.0,1.0,...,0,0,0,1,1,0,0,0,323307,1
1,2987001,0,2755.0,404.000000,150.0,102.0,325.0,87.0,1.0,1.0,...,0,0,0,1,0,1,0,0,323307,1
2,2987002,0,4663.0,490.000000,150.0,166.0,330.0,87.0,1.0,1.0,...,0,0,0,1,0,0,1,1,59477,0
3,2987003,0,18132.0,567.000000,150.0,117.0,476.0,87.0,2.0,5.0,...,0,0,0,1,0,1,0,1,105303,0
4,2987004,0,4497.0,514.000000,150.0,102.0,420.0,87.0,1.0,1.0,...,1,0,0,0,0,1,0,0,323307,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,6550.0,362.554968,150.0,226.0,272.0,87.0,2.0,1.0,...,0,0,0,1,0,0,1,1,323307,0
590536,3577536,0,10444.0,225.000000,150.0,224.0,204.0,87.0,1.0,1.0,...,0,0,0,1,0,1,0,1,323307,1
590537,3577537,0,12037.0,595.000000,150.0,224.0,231.0,87.0,1.0,1.0,...,0,0,0,1,0,1,0,1,323307,1
590538,3577538,0,7826.0,481.000000,150.0,224.0,387.0,87.0,1.0,1.0,...,0,0,0,1,0,1,0,1,28289,1


In [40]:
train_data_df['M6_T'].info()

<class 'pandas.core.series.Series'>
Index: 590538 entries, 0 to 590539
Series name: M6_T
Non-Null Count   Dtype
--------------   -----
590538 non-null  int64
dtypes: int64(1)
memory usage: 9.0 MB


## V1-V339 columns

In [41]:
v_cols = [
    'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
    'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 
    'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 
    'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 
    'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 
    'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 
    'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 
    'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 
    'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 
    'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 
    'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 
    'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 
    'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 
    'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 
    'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 
    'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 
    'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 
    'V320', 'V321'
]

In [42]:
data_imputer = DataImputer(strategy='mean')

train_data_df = data_imputer.impute_train_set_values(train_data_df, v_cols)
test_data_df = data_imputer.impute_test_set_values(test_data_df, v_cols)

Train missing values before imputation: 

V12     76073
V13     76073
V14     76073
V15     76073
V16     76073
        ...  
V317       12
V318       12
V319       12
V320       12
V321       12
Length: 169, dtype: int64

Train missing values after imputation: 

V12     0
V13     0
V14     0
V15     0
V16     0
       ..
V317    0
V318    0
V319    0
V320    0
V321    0
Length: 169, dtype: int64

Test missing values before imputation: 

V12     12589
V13     12589
V14     12589
V15     12589
V16     12589
        ...  
V317        3
V318        3
V319        3
V320        3
V321        3
Length: 169, dtype: int64

Test missing values after imputation: 

V12     0
V13     0
V14     0
V15     0
V16     0
       ..
V317    0
V318    0
V319    0
V320    0
V321    0
Length: 169, dtype: int64



In [43]:
train_data_df[v_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 590538 entries, 0 to 590539
Columns: 169 entries, V12 to V321
dtypes: float64(169)
memory usage: 765.9 MB


# Review final dataset

In [44]:
columns_with_missing_data_train = get_columns_with_missing_values(train_data_df)
columns_with_missing_data_test = get_columns_with_missing_values(test_data_df)

print("Columns with missing data in train dataset:", columns_with_missing_data_train)
print("Columns with missing data in test dataset:", columns_with_missing_data_test)

+----------+----------------------+
| Column   | Missing Percentage   |
|----------+----------------------|
+----------+----------------------+
+----------+----------------------+
| Column   | Missing Percentage   |
|----------+----------------------|
+----------+----------------------+
Columns with missing data in train dataset: []
Columns with missing data in test dataset: []


  return reduction(axis=axis, out=out, **passkwargs)


In [45]:
d_types_train = set(train_data_df.dtypes)
d_types_test = set(test_data_df.dtypes)

print("Data types in train dataset:", d_types_train)
print("Data types in test dataset:", d_types_test)

Data types in train dataset: {dtype('int32'), dtype('int64'), dtype('float64')}
Data types in test dataset: {dtype('int32'), dtype('int64'), dtype('float64')}


We will drop `TransactionID` from the training dataset since it is an identifier and does not provide any information to the model. However, we will keep it in the test dataset to create the submission file.

In [46]:
train_data_df = train_data_df.drop(['TransactionID'], axis=1)

In [47]:
find_differences(train_data_df, test_data_df)

Different values between the lists: {'isFraud', 'TransactionID'}


In [48]:
train_data_df

Unnamed: 0,isFraud,card1,card2,card3,card5,addr1,addr2,C1,C2,C3,...,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit,P_emaildomain_freq,M6_T
0,0,13926.0,362.554968,150.0,142.0,315.0,87.0,1.0,1.0,0.0,...,0,0,0,1,1,0,0,0,323307,1
1,0,2755.0,404.000000,150.0,102.0,325.0,87.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,0,323307,1
2,0,4663.0,490.000000,150.0,166.0,330.0,87.0,1.0,1.0,0.0,...,0,0,0,1,0,0,1,1,59477,0
3,0,18132.0,567.000000,150.0,117.0,476.0,87.0,2.0,5.0,0.0,...,0,0,0,1,0,1,0,1,105303,0
4,0,4497.0,514.000000,150.0,102.0,420.0,87.0,1.0,1.0,0.0,...,1,0,0,0,0,1,0,0,323307,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,6550.0,362.554968,150.0,226.0,272.0,87.0,2.0,1.0,0.0,...,0,0,0,1,0,0,1,1,323307,0
590536,0,10444.0,225.000000,150.0,224.0,204.0,87.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,1,323307,1
590537,0,12037.0,595.000000,150.0,224.0,231.0,87.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,1,323307,1
590538,0,7826.0,481.000000,150.0,224.0,387.0,87.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,1,28289,1


In [49]:
test_data_df

Unnamed: 0,TransactionID,card1,card2,card3,card5,addr1,addr2,C1,C2,C3,...,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_debit,P_emaildomain_freq,M6_T
0,3663549,10409.0,111.0,150.0,226.0,170.000000,87.00000,6.0,6.0,0.0,...,0,0,0,1,0,0,1,1,277137,0
1,3663550,4272.0,111.0,150.0,226.0,299.000000,87.00000,3.0,2.0,0.0,...,0,0,0,1,0,0,1,1,24048,0
2,3663551,4476.0,574.0,150.0,226.0,472.000000,87.00000,2.0,2.0,0.0,...,0,0,0,1,0,0,1,1,53510,0
3,3663552,10989.0,360.0,150.0,166.0,205.000000,87.00000,5.0,2.0,0.0,...,0,0,0,1,0,0,1,1,277137,1
4,3663553,18018.0,452.0,150.0,117.0,264.000000,87.00000,6.0,6.0,0.0,...,0,0,0,1,0,1,0,1,277137,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,4170235,13832.0,375.0,185.0,224.0,284.000000,60.00000,1.0,1.0,0.0,...,0,0,0,0,0,1,0,1,277137,0
506687,4170236,3154.0,408.0,185.0,224.0,290.734121,86.80063,1.0,3.0,0.0,...,0,0,0,0,0,1,0,1,53510,0
506688,4170237,16661.0,490.0,150.0,226.0,327.000000,87.00000,1.0,1.0,0.0,...,0,0,0,1,0,0,1,1,53510,0
506689,4170238,16621.0,516.0,150.0,224.0,177.000000,87.00000,1.0,1.0,0.0,...,0,0,0,1,0,1,0,1,53510,0


In [50]:
train_data_df.to_csv('../data/train_data_processed.csv', index=False)
test_data_df.to_csv('../data/test_data_processed.csv', index=False)