In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import NearestNeighbors
import joblib  # Added joblib library for serialization

# Load the dataset (assuming it's in CSV format)
# Replace 'your_dataset.csv' with the actual file path or URL
data = pd.read_csv('PS_20174392719_1491204439457_log.csv')

# Assume the dataset has features (X) and labels (y)
feature_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
label_column = 'isFraud'

# Use a smaller subset of the data for faster prototyping
data_subset = data.sample(frac=0.05, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data_subset[feature_columns],
    data_subset[label_column],
    test_size=0.2,
    random_state=42
)

# Create a nearest neighbors estimator with n_jobs set
nearest_neighbors = NearestNeighbors(n_neighbors=5, n_jobs=-1)

# Handling imbalanced classes using SMOTE and RandomUnderSampler
smote = SMOTE(sampling_strategy=0.5, random_state=42, k_neighbors=nearest_neighbors)
under_sampler = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_resampled, y_resampled = under_sampler.fit_resample(X_resampled, y_resampled)

# Create an XGBoost classifier with simplified hyperparameters
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    n_estimators=100,  # Reduced number of estimators
    max_depth=5,       # Simplified max_depth
    subsample=1.0,     # Simplified subsample
    colsample_bytree=1.0  # Simplified colsample_bytree
)

# Train the model
xgb_model.fit(X_resampled, y_resampled)

# Serialize and save the trained model
joblib.dump(xgb_model, 'xgb_model.pkl')

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Accuracy: 0.9934
Confusion Matrix:
[[63134   413]
 [    5    75]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     63547
           1       0.15      0.94      0.26        80

    accuracy                           0.99     63627
   macro avg       0.58      0.97      0.63     63627
weighted avg       1.00      0.99      1.00     63627

