In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
# Raw export headers -> the UPPER_SNAKE_CASE column names used in the rest of the notebook.
column_names = {
    'CreditCardType': 'CARD_TYPE',
    'CreditCardFundingSourceName': 'CARD_NAME',
    'PaymentCreationType': 'CREATION_TYPE',
    'MerchantGrouping': 'MERCHANT_GROUP',
    'AcquirerProperty': 'ACQUIRER_PROPERTY',
    'MCC': 'MCC',
    'BIN#': 'BIN',
    'IssuerName': 'ISSUER',
    'Issuer Country': 'COUNTRY',
    'RequestedScaChallengeIndicator': 'SCA_INDICATOR',
    'ScaExemption': 'SCA_EXEMPTION',
    'ScaExemptionFlow': 'SCA_EXEMPTION_FLOW',
    'ScaPolicy': 'SCA_POLICY',
    'Partner': 'PARTNER',
    'Merchant': 'MERCHANT',
    'Shop': 'SHOP',
    'PaymentProvider': 'PAYMENT_PROVIDER',
    'Payment Currency Code': 'CURRENCY',
    'PaymentID': 'PAYMENT_ID',
    'Settled Pmt Amt': 'SETTLED_PAYMENT_AMOUNT',
    'Settled Base Amt': 'AMOUNT',
    'FN Qty': 'FRAUD',
}

# The file is semicolon-separated; read it first, then apply the header mapping.
path = '../data/data_2.csv'
df = pd.read_csv(path, sep=';')
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,Date,CARD_TYPE,CARD_NAME,CREATION_TYPE,MERCHANT_GROUP,ACQUIRER_PROPERTY,MCC,BIN,ISSUER,COUNTRY,...,SCA_POLICY,PARTNER,MERCHANT,SHOP,PAYMENT_PROVIDER,CURRENCY,PAYMENT_ID,SETTLED_PAYMENT_AMOUNT,AMOUNT,FRAUD
0,1/2/2025,Diners,Credit,MerchantInitiatedWithStoredAccount,Partner,DaoPay,5967,369574,DC JAPAN,Japan,...,,DaoPay_PXP_Direct,Technius Ltd,www.stripchat.com,CQRUK,USD,1860738850,2099,203371766,
1,1/2/2025,Diners,Credit,MerchantInitiatedWithStoredAccount,Partner,DaoPay,5967,369574,DC JAPAN,Japan,...,,DaoPay_PXP_Direct,Technius Ltd,www.stripchat.com,CQRUK,USD,1860845932,1999,193682783,
2,1/2/2025,Diners,Credit,UserInitiatedWithStoredAccount,Partner,DaoPay,5967,361011,DC HDFC BANK,India,...,Default Policy,DaoPay_PXP_Direct,Technius Ltd,www.stripchat.com,CQRUK,USD,1860542177,999,96792946,
3,1/2/2025,Diners,Credit,UserInitiatedWithStoredAccount,Partner,DaoPay,5967,361135,DC HDFC BANK,India,...,Default Policy,DaoPay_PXP_Direct,Technius Ltd,www.stripchat.com,CQRUK,USD,1860547609,499,48348028,
4,1/2/2025,Discover,Credit,MerchantInitiatedWithStoredAccount,Partner,DaoPay,5967,601100,DISCOVER ISSUER,United States,...,,DaoPay_PXP_Direct,Technius Ltd,www.stripchat.com,CQRUK,USD,1860509109,4999,484352291,


In [13]:
# Keep only the columns used for modelling. The explicit .copy() makes `df` an independent
# frame, so column assignments in later cells do not write into a selection of the loaded
# data (which pandas reports as SettingWithCopyWarning).
df = df[[
    'CARD_TYPE', 'CARD_NAME', 'CREATION_TYPE', 'MCC', 'COUNTRY',
    'SCA_EXEMPTION', 'SCA_EXEMPTION_FLOW', 'MERCHANT', 'SHOP',
    'AMOUNT', 'FRAUD',
]].copy()

**Dealing with NULL values.**

- Only 4 rows have a NULL currency; drop these rows
- The SCA columns contain many NULLs; fill them with UNKNOWN

In [14]:
# Build the cleaned columns with assign() instead of item assignment: `df` comes from a
# column selection, and writing into such a selection can raise SettingWithCopyWarning.
df = df.assign(
    # A missing fraud quantity is treated as zero.
    FRAUD=df['FRAUD'].fillna(0).astype(int),
    # Replace a decimal comma with a dot, then cast to float (vectorised; no per-row lambda).
    AMOUNT=df['AMOUNT'].astype(str).str.replace(',', '.', regex=False).astype(float),
)

# Every remaining NULL in any column is replaced by a string placeholder.
# NOTE(review): the placeholder is spelled 'Unkown' (sic). It is left unchanged here because
# other code may compare against this exact value - confirm before correcting the spelling.
df = df.fillna('Unkown')
df.isna().sum()

CARD_TYPE             0
CARD_NAME             0
CREATION_TYPE         0
MCC                   0
COUNTRY               0
SCA_EXEMPTION         0
SCA_EXEMPTION_FLOW    0
MERCHANT              0
SHOP                  0
AMOUNT                0
FRAUD                 0
dtype: int64

In [15]:
# Preview the first rows of the current `df`.
df.head()

Unnamed: 0,CARD_TYPE,CARD_NAME,CREATION_TYPE,MCC,COUNTRY,SCA_EXEMPTION,SCA_EXEMPTION_FLOW,MERCHANT,SHOP,AMOUNT,FRAUD
0,Diners,Credit,MerchantInitiatedWithStoredAccount,5967,Japan,Unkown,Unkown,Technius Ltd,www.stripchat.com,20.337177,0
1,Diners,Credit,MerchantInitiatedWithStoredAccount,5967,Japan,Unkown,Unkown,Technius Ltd,www.stripchat.com,19.368278,0
2,Diners,Credit,UserInitiatedWithStoredAccount,5967,India,Unkown,Unkown,Technius Ltd,www.stripchat.com,9.679295,0
3,Diners,Credit,UserInitiatedWithStoredAccount,5967,India,Unkown,Unkown,Technius Ltd,www.stripchat.com,4.834803,0
4,Discover,Credit,MerchantInitiatedWithStoredAccount,5967,United States,Unkown,Unkown,Technius Ltd,www.stripchat.com,48.435229,0


In [16]:
def encode_top_k(df, column_name, k):
    """One-hot encode ``column_name`` after collapsing it to its ``k`` most frequent values.

    The ``k`` most frequent values are relabelled ``CLASS_1`` .. ``CLASS_k`` in order of
    decreasing frequency; every other value (including missing values) becomes ``CLASS_0``.
    The relabelled column is then expanded with ``pd.get_dummies(..., drop_first=True)``,
    which drops the first label in sorted order (``CLASS_0`` whenever at least one row
    falls outside the top ``k``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    column_name : hashable
        Name of the column to encode.
    k : int
        Number of most frequent values that keep their own class.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``column_name`` replaced by ``<column_name>_CLASS_<i>``
        indicator columns appended after the remaining columns.
    """
    top_k = df[column_name].value_counts().nlargest(k).index
    class_of = {category: f'CLASS_{rank}' for rank, category in enumerate(top_k, start=1)}

    # The fallback label is supplied through dict.get() rather than stored under an 'Other'
    # key, so a genuine top-k category that happens to be called 'Other' keeps its own class.
    encoded = df[column_name].map(lambda value: class_of.get(value, 'CLASS_0'))

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    result[column_name] = encoded.astype('category')

    return pd.get_dummies(result, columns=[column_name], drop_first=True)

In [17]:
# Number of most frequent values that keep their own class, per categorical column.
# Columns are encoded in the order listed here.
top_k_per_column = {
    'CARD_TYPE': 4,
    'CARD_NAME': 4,
    'CREATION_TYPE': 4,
    'MCC': 9,
    'COUNTRY': 9,
    'SCA_EXEMPTION': 4,
    'SCA_EXEMPTION_FLOW': 3,
    'MERCHANT': 9,
    'SHOP': 9,
}

for categorical_column, top_k_count in top_k_per_column.items():
    df = encode_top_k(df, categorical_column, top_k_count)

df.head()

Unnamed: 0,AMOUNT,CARD_TYPE_CLASS_1,CARD_TYPE_CLASS_2,CARD_TYPE_CLASS_3,CARD_TYPE_CLASS_4,CARD_NAME_CLASS_1,CARD_NAME_CLASS_2,CARD_NAME_CLASS_3,CARD_NAME_CLASS_4,CREATION_TYPE_CLASS_1,...,SHOP_CLASS_1,SHOP_CLASS_2,SHOP_CLASS_3,SHOP_CLASS_4,SHOP_CLASS_5,SHOP_CLASS_6,SHOP_CLASS_7,SHOP_CLASS_8,SHOP_CLASS_9,FRAUD_CLASS_1
0,20.337177,False,False,False,True,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
1,19.368278,False,False,False,True,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
2,9.679295,False,False,False,True,False,True,False,False,True,...,False,False,True,False,False,False,False,False,False,True
3,4.834803,False,False,False,True,False,True,False,False,True,...,False,False,True,False,False,False,False,False,False,True
4,48.435229,False,False,True,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for `df`.
df.describe()

Unnamed: 0,AMOUNT,FRAUD
count,32685.0,32685.0
mean,29.013528,0.006517
std,145.294123,0.081221
min,0.01,0.0
25%,6.718173,0.0
50%,12.031088,0.0
75%,24.062177,0.0
max,12728.89146,2.0


In [None]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


X = df.drop(columns=['FRAUD'])
# FRAUD is a per-row quantity rather than a flag (its summary statistics show a maximum of 2).
# The binary-average metrics and the two-class plot below need a 0/1 target, so any positive
# quantity is treated as fraud.
y = (df['FRAUD'] > 0).astype(int)

# Split BEFORE fitting the scaler / PCA so that no statistics of the held-out rows leak into
# the preprocessing. The split is stratified because the positive class is very rare.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessing on the training rows only, then apply it unchanged to the test rows.
scaler = StandardScaler()
pca = PCA(n_components=10)
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))

# NOTE: if resampling (e.g. the imported SMOTETomek) is added, apply it to
# (X_train, y_train) only - never to the test rows.

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Projection of the whole dataset through the train-fitted scaler / PCA, for visualisation only.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Plain boolean arrays for indexing the NumPy projection.
is_class_0 = (y == 0).to_numpy()
is_class_1 = (y == 1).to_numpy()

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[is_class_0, 0], X_pca[is_class_0, 1], c='lightblue', alpha=0.4, label='Class 0')
plt.scatter(X_pca[is_class_1, 0], X_pca[is_class_1, 1], c='darkblue', alpha=0.4, label='Class 1')

plt.title('PCA of Fraud Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()
