In [1]:
import pandas as pd
import os
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
idee = pd.read_csv('train_identity.csv')
transaction = pd.read_csv('train_transaction.csv')

In [4]:
merge = transaction.merge(idee, how='outer', on='TransactionID')

In [5]:
merge.dtypes.value_counts()

float64    399
object      31
int64        4
dtype: int64

In [6]:
objects = merge.select_dtypes('object')
objects.astype('category')
numbers = merge.select_dtypes(include=['float64', 'int64'])
print(objects.info())
print('###')
print(numbers.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Data columns (total 31 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ProductCD      590540 non-null  object
 1   card4          588963 non-null  object
 2   card6          588969 non-null  object
 3   P_emaildomain  496084 non-null  object
 4   R_emaildomain  137291 non-null  object
 5   M1             319440 non-null  object
 6   M2             319440 non-null  object
 7   M3             319440 non-null  object
 8   M4             309096 non-null  object
 9   M5             240058 non-null  object
 10  M6             421180 non-null  object
 11  M7             244275 non-null  object
 12  M8             244288 non-null  object
 13  M9             244288 non-null  object
 14  id_12          144233 non-null  object
 15  id_15          140985 non-null  object
 16  id_16          129340 non-null  object
 17  id_23          5169 non-null    object
 18  id_2

In [7]:
objects.fillna("Unknown", inplace=True)
objects.head()

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_12,id_15,id_16,id_23,id_27,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,W,discover,credit,Unknown,Unknown,T,T,T,M2,F,T,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,W,mastercard,credit,gmail.com,Unknown,Unknown,Unknown,Unknown,M0,T,T,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,W,visa,debit,outlook.com,Unknown,T,T,T,M0,F,F,F,F,F,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,W,mastercard,debit,yahoo.com,Unknown,Unknown,Unknown,Unknown,M0,T,F,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,H,mastercard,credit,gmail.com,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,NotFound,New,NotFound,Unknown,Unknown,New,NotFound,Android 7.0,samsung browser 6.2,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [8]:
objects = pd.get_dummies(objects)

In [9]:
objects['TransactionID'] = merge['TransactionID']
cols = objects.columns.to_list()
cols = cols[-1:] + cols[:-1]
objects = objects[cols]
objects.astype('category')
print(objects.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 2462 entries, TransactionID to DeviceInfo_xs-Z47b7VqTMxs
dtypes: int64(1), uint8(2461)
memory usage: 1.4 GB
None


In [10]:
numbers.fillna(numbers.mean(), inplace=True)

In [11]:
print(numbers.isna().sum())
print(objects.isna().sum())

TransactionID     0
isFraud           0
TransactionDT     0
TransactionAmt    0
card1             0
                 ..
id_22             0
id_24             0
id_25             0
id_26             0
id_32             0
Length: 403, dtype: int64
TransactionID                0
ProductCD_C                  0
ProductCD_H                  0
ProductCD_R                  0
ProductCD_S                  0
                            ..
DeviceInfo_verykools5004     0
DeviceInfo_verykools5034     0
DeviceInfo_verykools5035     0
DeviceInfo_vivo              0
DeviceInfo_xs-Z47b7VqTMxs    0
Length: 2462, dtype: int64


In [12]:
merge_filled = objects.merge(numbers, how='outer', on='TransactionID')
del objects
del numbers
merge_filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 2864 entries, TransactionID to id_32
dtypes: float64(399), int64(4), uint8(2461)
memory usage: 3.1 GB


In [13]:
X = merge_filled.copy()
y = X.isFraud
X.drop(columns='isFraud', inplace=True)
print(X.info())
print(len(y))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 2863 entries, TransactionID to id_32
dtypes: float64(399), int64(3), uint8(2461)
memory usage: 3.1 GB
None
590540


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37)

In [15]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(413378, 2863) (413378,) (177162, 2863) (177162,)


In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(LogisticRegression(penalty='l2', C=1e42, max_iter=150, verbose=1, solver='liblinear'))
pipe.fit(X_train, y_train)

[LibLinear]

Pipeline(steps=[('logisticregression',
                 LogisticRegression(C=1e+42, max_iter=150, solver='liblinear',
                                    verbose=1))])

In [17]:
pipe.score(X_test, y_test)

0.965133606529617

In [18]:
y_pred = pipe.predict(X_test)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    171002
           1       0.00      0.00      0.00      6160

    accuracy                           0.97    177162
   macro avg       0.48      0.50      0.49    177162
weighted avg       0.93      0.97      0.95    177162



##---------------------------------------------------------------------------------------------------------------------------------
## Undersampling

In [13]:
### undersampling

total = merge_filled.shape[0]

counts = merge_filled['isFraud'].value_counts()

counts

0    569877
1     20663
Name: isFraud, dtype: int64

In [14]:
no_fraud = counts[0]
fraud = counts[1]

print(f'No fraud: {(no_fraud/total)*100}% => {no_fraud}')
print(f'Fraud: {(fraud/total)*100}% => {fraud}')


No fraud: 96.50099908558268% => 569877
Fraud: 3.4990009144173126% => 20663


In [15]:
fraud_df = merge_filled[merge_filled['isFraud'] == 1]

no_fraud_df = merge_filled[merge_filled['isFraud'] == 0]

no_fraud_df = no_fraud_df.iloc[:fraud]

no_fraud_df.shape

(20663, 2864)

In [18]:
undersample = pd.concat([no_fraud_df, fraud_df])

total = undersample.shape[0]

counts = undersample['isFraud'].value_counts()

no_fraud = counts[0]
fraud = counts[1]

print(f'No fraud: {(no_fraud/total)*100}% => {no_fraud}')
print(f'Fraud: {(fraud/total)*100}% => {fraud}')

No fraud: 50.0% => 20663
Fraud: 50.0% => 20663


In [21]:
X = undersample.copy()
y = X.isFraud
X.drop(columns='isFraud', inplace=True)
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)
print(X.info())
print(len(y))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41326 entries, 0 to 41325
Columns: 2863 entries, TransactionID to id_32
dtypes: float64(399), int64(3), uint8(2461)
memory usage: 223.7 MB
None
41326


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(28928, 2863) (28928,) (12398, 2863) (12398,)


In [24]:
print(y_train.value_counts())
print(y_test.value_counts())

1    14492
0    14436
Name: isFraud, dtype: int64
0    6227
1    6171
Name: isFraud, dtype: int64


In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(LogisticRegression(penalty='l2', C=0.001, max_iter=150, verbose=1, solver='liblinear'))
pipe.fit(X_train, y_train)

[LibLinear]

Pipeline(steps=[('logisticregression',
                 LogisticRegression(C=0.001, max_iter=150, solver='liblinear',
                                    verbose=1))])

In [26]:
pipe.score(X_test, y_test)

0.9796741409904823

In [27]:
y_pred = pipe.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6227
           1       1.00      0.96      0.98      6171

    accuracy                           0.98     12398
   macro avg       0.98      0.98      0.98     12398
weighted avg       0.98      0.98      0.98     12398



In [34]:
print(y_test.value_counts())
print(pd.Series(y_pred).value_counts())

0    6227
1    6171
Name: isFraud, dtype: int64
0    6479
1    5919
dtype: int64


##----------------------------------------------------------------------------------------------------------------------------------------------
## Test dataset

In [31]:
# test set preprocessing

idee = pd.read_csv('test_identity.csv')
transaction = pd.read_csv('test_transaction.csv')
merge = transaction.merge(idee, how='outer', on='TransactionID')
print(merge.info())
print(merge.dtypes.value_counts())
objects = merge.select_dtypes('object')
numbers = merge.select_dtypes(include=['float64', 'int64'])
objects.fillna("Unknown", inplace=True)
objects = pd.get_dummies(objects)
objects['TransactionID'] = merge['TransactionID']
cols = objects.columns.to_list()
cols = cols[-1:] + cols[:-1]
objects = objects[cols]
objects.astype('category')
numbers.fillna(numbers.mean(), inplace=True)
merge_filled_test = objects.merge(numbers, how='outer', on='TransactionID')
del objects
del numbers
X_test = merge_filled_test.copy()
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506691 entries, 0 to 506690
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(3), object(31)
memory usage: 1.6+ GB
None
float64    399
object      31
int64        3
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506691 entries, 0 to 506690
Columns: 3447 entries, TransactionID to id-32
dtypes: float64(399), int64(3), uint8(3045)
memory usage: 3.0 GB
None


ValueError: X has 3447 features per sample; expecting 2863

In [None]:
sample_sub = pd.read_csv('sample_submission.csv')
sample_sub.head()