In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Location of the raw export; the file is semicolon-delimited.
path = '../data/data.csv'
df = pd.read_csv(path, sep=';')

# Map the source headers onto the UPPER_SNAKE_CASE names used by the later cells.
df = df.rename(
    columns={
        'CreditCardType': 'CARD_TYPE',
        'CreditCardFundingSourceName': 'CARD_NAME',
        'PaymentCreationType': 'CREATION_TYPE',
        'MerchantGrouping': 'MERCHANT_GROUP',
        'AcquirerProperty': 'ACQUIRER_PROPERTY',
        'MCC': 'MCC',
        'BIN#': 'BIN',
        'IssuerName': 'ISSUER',
        'Issuer Country': 'COUNTRY',
        'RequestedScaChallengeIndicator': 'SCA_INDICATOR',
        'ScaExemption': 'SCA_EXEMPTION',
        'ScaExemptionFlow': 'SCA_EXEMPTION_FLOW',
        'ScaPolicy': 'SCA_POLICY',
        'Partner': 'PARTNER',
        'Merchant': 'MERCHANT',
        'Shop': 'SHOP',
        'PaymentProvider': 'PAYMENT_PROVIDER',
        'Payment Currency Code': 'CURRENCY',
        'PaymentID': 'PAYMENT_ID',
        'Settled Pmt Amt': 'SETTLED_PAYMENT_AMOUNT',
        'Settled Base Amt': 'AMOUNT',
        'FN Qty': 'FRAUD',
    }
)
df.head()

In [4]:
# Keep only the candidate features plus the FRAUD target; every other column is discarded.
df = df[[
    'CARD_TYPE', 'CARD_NAME', 'CREATION_TYPE',
    'MCC', 'COUNTRY',
    'SCA_EXEMPTION', 'SCA_EXEMPTION_FLOW',
    'MERCHANT', 'SHOP',
    'AMOUNT', 'FRAUD',
]]

**Dealing with NULL values.**

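- Only 4 rows have a NULL currency; such rows (not columns) could simply be dropped. Note that `CURRENCY` is not among the columns kept above, so the cell below does not drop anything.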
- The SCA columns (`SCA_EXEMPTION`, `SCA_EXEMPTION_FLOW`) contain many NULLs; fill them with an explicit "unknown" placeholder category.

In [None]:
# NULL fraud counts are treated as 0 and the column is stored as integers.
df['FRAUD'] = df['FRAUD'].fillna(0).astype(int)

# Swap decimal commas for dots, then cast to float. Vectorised string ops
# replace the per-row lambda; NULLs round-trip through the text 'nan' and
# come back as NaN, exactly as float(str(x)) did.
df['AMOUNT'] = (
    df['AMOUNT']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Give missing SCA values their own explicit category instead of leaving NaN.
# (The placeholder was previously misspelled as 'Unkown'.)
sca_columns = ['SCA_EXEMPTION', 'SCA_EXEMPTION_FLOW']
df[sca_columns] = df[sca_columns].fillna('Unknown')

# Remaining NULL count per column.
df.isna().sum()

In [None]:
# Preview the first rows of the frame after the NULL handling above.
df.head()

In [7]:
def encode_top_k(df, column_name, k):
    """One-hot encode ``column_name`` after bucketing it by frequency rank.

    The ``k`` most frequent values of the column are relabelled
    ``CLASS_1`` (most frequent) through ``CLASS_k``; every other value,
    including missing ones, is relabelled ``CLASS_0``. The relabelled column
    is then expanded with ``pd.get_dummies(..., drop_first=True)``, so the
    alphabetically first label that actually occurs (``CLASS_0`` whenever any
    value fell outside the top ``k``) gets no indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a new frame is returned.
    column_name : str
        Column to encode.
    k : int
        Number of most-frequent categories that keep their own class.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``df`` in their original order, followed by the
        ``{column_name}_CLASS_*`` indicator columns.
    """
    top_k = df[column_name].value_counts().nlargest(k).index
    top_k_mapping = {category: f'CLASS_{idx}' for idx, category in enumerate(top_k, start=1)}

    # Fall back to CLASS_0 through dict.get rather than a sentinel 'Other' key:
    # a sentinel key would overwrite a genuine top-k category named 'Other'.
    labels = df[column_name].apply(lambda value: top_k_mapping.get(value, 'CLASS_0'))

    # Build the indicators from the relabelled Series and attach them to the
    # remaining columns, leaving the caller's frame untouched.
    dummies = pd.get_dummies(labels.astype('category'), prefix=column_name, drop_first=True)
    return pd.concat([df.drop(columns=[column_name]), dummies], axis=1)

In [None]:
# (column, k) pairs: how many of the most frequent categories of each column
# keep their own class. Columns are encoded in the order listed.
for column_name, top_k in [
    ('CARD_TYPE', 4),
    ('CARD_NAME', 4),
    ('CREATION_TYPE', 4),
    ('MCC', 9),
    ('COUNTRY', 9),
    ('SCA_EXEMPTION', 4),
    ('SCA_EXEMPTION_FLOW', 3),
    ('MERCHANT', 9),
    ('SHOP', 9),
]:
    df = encode_top_k(df, column_name, top_k)

df.head()

In [None]:
# Summary statistics of the frame after encoding.
df.describe()

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Features and target.
X = df.drop(columns=['FRAUD'])
y = df['FRAUD']

# Split BEFORE fitting any preprocessing. Fitting the scaler and PCA on the
# full dataset would let test-set statistics leak into the training pipeline
# and make the reported metrics optimistic.
# NOTE(review): the split is not stratified; if FRAUD is heavily imbalanced,
# consider stratify=y so both splits contain fraud cases.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit scaling and PCA on the training rows only, then apply them to the test rows.
scaler = StandardScaler()
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))

# Optional resampling must only ever see the training split:
# smote_tomek = SMOTETomek(random_state=42)
# X_train, y_train = smote_tomek.fit_resample(X_train, y_train)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Hold-out metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Project the full dataset with the train-fitted transforms, for visualisation only.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)
y_values = y.to_numpy()

plt.figure(figsize=(8, 6))
scatter0 = plt.scatter(X_pca[y_values == 0, 0], X_pca[y_values == 0, 1], c='lightblue', alpha=0.4, label='Class 0')
scatter1 = plt.scatter(X_pca[y_values == 1, 0], X_pca[y_values == 1, 1], c='darkblue', alpha=0.4, label='Class 1')

plt.title('PCA of Fraud Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()
