In [1]:
# Let's start by loading the dataset to understand its structure and content
import pandas as pd

# Read the raw transactions CSV (the path is relative to the working directory)
file_path = 'credit_card_fraud.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick sanity check of columns and values
df.head(n=5)

Unnamed: 0,Transaction Date and Time,Transaction Amount,Cardholder Name,Card Number (Hashed or Encrypted),Merchant Name,Merchant Category Code (MCC),Transaction Location (City or ZIP Code),Transaction Currency,Card Type,Card Expiration Date,CVV Code (Hashed or Encrypted),Transaction Response Code,Transaction ID,Fraud Flag or Label,Previous Transactions,Transaction Source,IP Address,Device Information,User Account Information,Transaction Notes
0,2022-09-24 13:54:27,285.88,Shray Soman,daca51bffe0fc4eaaa7c430917f94b75fd893712492254...,"Rajagopalan, Ghose and Kant",3590,Khammam,INR,MasterCard,04/29,01299ac65733b5a3d774265fbfe8396b8611e5e3321855...,5,b7f69cbc-a03d-41f8-adca-75920b0242c3,1,,Online,18.106.240.6,Tablet,,Consequatur corporis minima ad vero deserunt n...
1,2020-07-24 11:20:13,1777.32,Lakshit Bakshi,d5366dd9be3a0266c0252baceaa6332210ed4dbd0ef252...,Sule PLC,7277,Vasai-Virar,EUR,American Express,03/26,ad8fa913d25b3970c6efa8ca504da8ba670ce2a9cf012d...,12,ad53cc8e-8412-422e-8cad-4176daac8387,1,,Online,212.48.185.128,Mobile,jloyal,Dolore repellendus odio deleniti. Eaque quibus...
2,2023-03-18 01:05:36,3939.01,Riya Bobal,4737a6384aa1c3c31f7768b86633d6a8401dd6ab4a8ff1...,Badal PLC,9297,Nangloi Jat,USD,Visa,11/29,eb3be230bbd2844b1f5d8f2e4fab9ffba8ab22cfeeb69c...,5,f09cecd6-0d74-4551-8089-a31d64e6b9c5,1,3 or more,In-Person,17.190.112.46,Mobile,,Unde beatae perspiciatis sapiente. Voluptates ...
3,2021-01-07 21:53:04,376.44,Mohanlal Balakrishnan,4721806eed8f2663bb597ff13e79a294de318358fbb54b...,Konda-Sodhi,5686,Ramagundam,USD,Visa,09/25,891d46993a36d78392247c642138cede01d9841daab1d9...,5,b208ae0b-4c4f-428f-b6b9-5360b288b947,0,2,In-Person,153.136.24.104,Tablet,,Cupiditate repellendus necessitatibus quo occa...
4,2021-12-16 06:22:24,1687.33,Mannat Rout,9deacc3a9efd6e382826d400620aca5f23ed94327578e5...,Dua Ltd,2940,Adoni,INR,MasterCard,03/31,48a1a756f2d83f1dc57bbf14052b70a6f40d0fceed6662...,5,6b4e4e43-5b73-4906-9973-299a1b2a5e71,1,2,Online,196.153.28.131,Desktop,,Molestias assumenda consectetur itaque veritat...


In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(8000, 20)

In [3]:
# Data cleaning, step 1: missing values.
# Every object-dtype (text) column gets the placeholder 'Unknown' wherever a
# value is missing; numeric columns are intentionally left untouched here.
string_columns = df.select_dtypes(include=['object']).columns
filled_text = df[string_columns].fillna(value='Unknown')
df[string_columns] = filled_text

In [4]:
# Data cleaning, step 2: discard rows that are exact duplicates across all
# columns, keeping the first occurrence of each.
df = df.drop_duplicates(subset=None, keep='first')


In [5]:
# Data cleaning, step 3: data types.
# Parse the transaction timestamp strings into pandas datetimes.
timestamp_col = 'Transaction Date and Time'
df[timestamp_col] = pd.to_datetime(df[timestamp_col])
# No other column needs an explicit numeric conversion, so nothing else is cast here.



In [6]:
# Data cleaning, step 4: flag 'Transaction Amount' outliers with the 1.5 * IQR rule.
amounts = df['Transaction Amount']
Q1 = amounts.quantile(0.25)
Q3 = amounts.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Outliers are only inspected, not removed: extreme amounts can be legitimate
# transactions, so this step just reports how many fall outside the fences.
outside_fences = amounts.lt(lower_bound) | amounts.gt(upper_bound)
outliers_info = df[outside_fences]

# Number of rows flagged as outliers
len(outliers_info)


0

In [7]:
# Preview the first five rows again to verify the cleaning steps took effect
df.head(n=5)

Unnamed: 0,Transaction Date and Time,Transaction Amount,Cardholder Name,Card Number (Hashed or Encrypted),Merchant Name,Merchant Category Code (MCC),Transaction Location (City or ZIP Code),Transaction Currency,Card Type,Card Expiration Date,CVV Code (Hashed or Encrypted),Transaction Response Code,Transaction ID,Fraud Flag or Label,Previous Transactions,Transaction Source,IP Address,Device Information,User Account Information,Transaction Notes
0,2022-09-24 13:54:27,285.88,Shray Soman,daca51bffe0fc4eaaa7c430917f94b75fd893712492254...,"Rajagopalan, Ghose and Kant",3590,Khammam,INR,MasterCard,04/29,01299ac65733b5a3d774265fbfe8396b8611e5e3321855...,5,b7f69cbc-a03d-41f8-adca-75920b0242c3,1,Unknown,Online,18.106.240.6,Tablet,Unknown,Consequatur corporis minima ad vero deserunt n...
1,2020-07-24 11:20:13,1777.32,Lakshit Bakshi,d5366dd9be3a0266c0252baceaa6332210ed4dbd0ef252...,Sule PLC,7277,Vasai-Virar,EUR,American Express,03/26,ad8fa913d25b3970c6efa8ca504da8ba670ce2a9cf012d...,12,ad53cc8e-8412-422e-8cad-4176daac8387,1,Unknown,Online,212.48.185.128,Mobile,jloyal,Dolore repellendus odio deleniti. Eaque quibus...
2,2023-03-18 01:05:36,3939.01,Riya Bobal,4737a6384aa1c3c31f7768b86633d6a8401dd6ab4a8ff1...,Badal PLC,9297,Nangloi Jat,USD,Visa,11/29,eb3be230bbd2844b1f5d8f2e4fab9ffba8ab22cfeeb69c...,5,f09cecd6-0d74-4551-8089-a31d64e6b9c5,1,3 or more,In-Person,17.190.112.46,Mobile,Unknown,Unde beatae perspiciatis sapiente. Voluptates ...
3,2021-01-07 21:53:04,376.44,Mohanlal Balakrishnan,4721806eed8f2663bb597ff13e79a294de318358fbb54b...,Konda-Sodhi,5686,Ramagundam,USD,Visa,09/25,891d46993a36d78392247c642138cede01d9841daab1d9...,5,b208ae0b-4c4f-428f-b6b9-5360b288b947,0,2,In-Person,153.136.24.104,Tablet,Unknown,Cupiditate repellendus necessitatibus quo occa...
4,2021-12-16 06:22:24,1687.33,Mannat Rout,9deacc3a9efd6e382826d400620aca5f23ed94327578e5...,Dua Ltd,2940,Adoni,INR,MasterCard,03/31,48a1a756f2d83f1dc57bbf14052b70a6f40d0fceed6662...,5,6b4e4e43-5b73-4906-9973-299a1b2a5e71,1,2,Online,196.153.28.131,Desktop,Unknown,Molestias assumenda consectetur itaque veritat...


In [8]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Feature engineering and train/test split.
# Reads the cleaned `df`, adds hour / day-of-week columns, integer-encodes the
# categorical columns of `df`, and produces X_train / X_test / y_train / y_test.

# Derive time-based features from the timestamp, then drop the raw column.
# The guard keeps this cell re-runnable: after the first run the timestamp
# column no longer exists, and an unconditional access would raise KeyError.
timestamp_col = 'Transaction Date and Time'
if timestamp_col in df.columns:
    timestamps = pd.to_datetime(df[timestamp_col])
    df['Transaction Hour'] = timestamps.dt.hour
    df['Transaction Day of Week'] = timestamps.dt.dayofweek
    df = df.drop(columns=timestamp_col)

# Define your feature columns and target variable
features = [
    "Transaction Amount", "Transaction Hour", "Transaction Day of Week",
    "Cardholder Name", "Merchant Category Code (MCC)",
    "Transaction Location (City or ZIP Code)", "Transaction Currency",
    "Card Type", "Previous Transactions", "Transaction Source",
    "IP Address", "Device Information"
]
target = "Fraud Flag or Label"

# Encode categorical features as integer codes (one independent encoder per
# column). Re-encoding already-encoded columns maps each code to itself, so
# this loop is safe to run more than once.
categorical_features = [
    "Cardholder Name", "Merchant Category Code (MCC)",
    "Transaction Location (City or ZIP Code)", "Transaction Currency",
    "Card Type", "Previous Transactions", "Transaction Source",
    "IP Address", "Device Information"
]
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

# 'Transaction Amount' is deliberately NOT standardised here: a scaler fit on
# the full frame would use statistics from rows that end up in the test set.
# Fit the scaler on X_train only, after the split.

# Split the dataset (80% train / 20% test, fixed seed for reproducibility)
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column: learn mean/variance from the training rows
# only, then apply that same transform to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
from sklearn.decomposition import PCA

# PCA configured to keep as many components as needed to explain 95% of the variance
pca = PCA(random_state=42, n_components=0.95)

# Learn the projection from the training rows, then project the test rows with it
X_train_pca = pca.fit_transform(X=X_train_scaled)
X_test_pca = pca.transform(X=X_test_scaled)

# Report the number of components that were retained
print(f"PCA selected {pca.n_components_} components.")


PCA selected 12 components.


In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# RBF-SVM with hyperparameter search.
# Expects `df` to already contain the engineered time features
# ('Transaction Hour', 'Transaction Day of Week').

# Define feature columns and target variable
features = [
    "Transaction Amount", "Transaction Hour", "Transaction Day of Week",
    "Cardholder Name", "Merchant Category Code (MCC)",
    "Transaction Location (City or ZIP Code)", "Transaction Currency",
    "Card Type", "Previous Transactions", "Transaction Source",
    "IP Address", "Device Information"
]
target = "Fraud Flag or Label"

# Integer-encode any feature column that is still object-typed
# (scaling is handled inside the pipeline below, not here)
for feature in features:
    if df[feature].dtype == 'object':  # Adjust condition based on your dataset
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])

# Feature matrix and label vector
X = df[features]
y = df[target]

# Split the dataset (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling, SMOTE and the classifier live in ONE imblearn pipeline so that,
# during cross-validation, the scaler and the oversampler are re-fit on each
# fold's training part only. Resampling the whole training set before the
# search would place synthetic points derived from validation rows into the
# training folds and inflate the CV scores. imblearn's Pipeline applies the
# sampler during fit only; predict() never resamples.
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC(random_state=42)),
])

# Hyperparameter grid; the `svc__` prefix routes each entry to the SVC step
param_grid = {
    'svc__C': [0.01, 0.1, 1, 10, 100],
    'svc__gamma': [0.001, 0.01, 0.1, 1, 'scale'],
    'svc__kernel': ['rbf']
}

# Stratified K-Folds keep the class distribution the same in every fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=cv, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit on the raw (encoded, unscaled, un-resampled) training rows
grid_search.fit(X_train, y_train)

# Best parameter combination
print("Best parameters:", grid_search.best_params_)

# Best estimator: the full pipeline refit on all training rows.
# NOTE: `best_model` now includes the fitted scaler, so it must be given
# unscaled feature rows (it scales internally).
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
predictions = best_model.predict(X_test)
print("Classification Report (Best SVM Model):")
print(classification_report(y_test, predictions))
print("Accuracy Score (Best SVM Model):", accuracy_score(y_test, predictions))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END ....................C=0.01, gamma=0.001, kernel=rbf; total time=  12.6s
[CV] END ....................C=0.01, gamma=0.001, kernel=rbf; total time=  13.8s
[CV] END ....................C=0.01, gamma=0.001, kernel=rbf; total time=  10.7s
[CV] END ....................C=0.01, gamma=0.001, kernel=rbf; total time=  11.4s
[CV] END ....................C=0.01, gamma=0.001, kernel=rbf; total time=   6.0s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   6.3s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   6.8s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   7.4s
[CV] END .....................C=0.01, gamma=0.01, kernel=rbf; total time=   6.9s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   6.3s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   5.4s
[CV] END .....................C=0.01, gamma=0.0

In [12]:
import joblib

# Persist the selected estimator to disk with joblib (file: best_model.sav).
# NOTE(review): only the object bound to `best_model` is written; anything fit
# outside it (e.g. the per-column label encoders) has to be saved separately
# if predictions are to be reproduced on new raw data.
joblib.dump(value=best_model, filename='best_model.sav')


['best_model.sav']

In [10]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

# Baseline XGBoost classifier with default hyperparameters.
# Expects `df` to already contain the engineered time features.

# Label column and model inputs
target = "Fraud Flag or Label"
features = [
    "Transaction Amount", "Transaction Hour", "Transaction Day of Week",
    "Cardholder Name", "Merchant Category Code (MCC)",
    "Transaction Location (City or ZIP Code)", "Transaction Currency",
    "Card Type", "Previous Transactions", "Transaction Source",
    "IP Address", "Device Information"
]

# Integer-encode whichever feature columns are still object-typed
text_features = [name for name in features if df[name].dtype == 'object']
for name in text_features:
    df[name] = LabelEncoder().fit_transform(df[name])

# Feature matrix / label vector
X, y = df[features], df[target]

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the training rows with SMOTE; the test rows are left as they are
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Build and train the classifier
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Score the held-out rows and report the metrics
predictions_xgb = xgb_model.predict(X_test_scaled)
print("Classification Report (XGBoost Model):")
print(classification_report(y_test, predictions_xgb))
print("Accuracy Score (XGBoost Model):", accuracy_score(y_test, predictions_xgb))


Classification Report (XGBoost Model):
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       793
           1       0.52      0.50      0.51       807

    accuracy                           0.51      1600
   macro avg       0.51      0.51      0.51      1600
weighted avg       0.51      0.51      0.51      1600

Accuracy Score (XGBoost Model): 0.514375


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END

In [15]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

# XGBoost with hyperparameter search.
# Expects `df` to already contain the engineered time features
# ('Transaction Hour', 'Transaction Day of Week').

# Define feature columns and target variable
features = [
    "Transaction Amount", "Transaction Hour", "Transaction Day of Week",
    "Cardholder Name", "Merchant Category Code (MCC)",
    "Transaction Location (City or ZIP Code)", "Transaction Currency",
    "Card Type", "Previous Transactions", "Transaction Source",
    "IP Address", "Device Information"
]
target = "Fraud Flag or Label"

# Integer-encode any feature column that is still object-typed
# (scaling is handled inside the pipeline below, not here)
for feature in features:
    if df[feature].dtype == 'object':  # Adjust condition based on your dataset
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])

# Feature matrix and label vector
X = df[features]
y = df[target]

# Split the dataset (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling, SMOTE and the classifier are chained in one imblearn pipeline so the
# scaler and the oversampler are re-fit on each CV fold's training part only.
# Oversampling the whole training set before the search would let synthetic
# points derived from validation rows into the training folds and inflate the
# CV scores. The sampler runs during fit only; predict() never resamples.
xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
])

# Hyperparameter grid; the `xgb__` prefix routes each entry to the XGBoost step
param_grid = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.01, 0.1, 0.3],
    'xgb__subsample': [0.5, 0.7, 1.0],
    'xgb__colsample_bytree': [0.5, 0.7, 1.0],
}

# cv is left at the GridSearchCV default (5 folds)
grid_search = GridSearchCV(xgb_pipeline, param_grid, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit on the raw (encoded, unscaled, un-resampled) training rows
grid_search.fit(X_train, y_train)

# Best parameter combination
print("Best parameters:", grid_search.best_params_)

# Best estimator: the full pipeline refit on all training rows.
# NOTE: it includes the fitted scaler, so it must be given unscaled feature rows.
best_xgb_model = grid_search.best_estimator_

# Make predictions on the held-out test set using the best pipeline
predictions_xgb_tuned = best_xgb_model.predict(X_test)

# Evaluate the best XGBoost model
print("Classification Report (Best XGBoost Model):")
print(classification_report(y_test, predictions_xgb_tuned))
print("Accuracy Score (Best XGBoost Model):", accuracy_score(y_test, predictions_xgb_tuned))


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END