In [1]:
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Train and evaluate a decision-tree classifier for the 'isFraud' target:
# load -> encode 'type' -> split -> scale -> SMOTE-oversample the training
# set -> fit -> score on the untouched test set -> persist the artifacts.

# Load the data
data = pd.read_csv('dataset.csv')

# Preprocess the data
# Drop columns that are not used as features
data = data.drop(['nameOrig', 'nameDest'], axis=1)

# Encode the 'type' column as integer codes.
# NOTE(review): the encoder is fitted on the full dataset, before the
# train/test split below; it only learns the set of category labels.
label_encoder = LabelEncoder()
data['type'] = label_encoder.fit_transform(data['type'])

# Split the data into features and target
X = data.drop(['isFraud'], axis=1)
y = data['isFraud']

# Split the data into training and testing sets (80% / 20%).
# NOTE(review): the split is not stratified; with a rare positive class,
# consider `stratify=y` (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
# and then applied unchanged to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply SMOTE to the training set only, using all CPU cores for the
# neighbour search. SMOTE's own `n_jobs` argument was deprecated in
# imbalanced-learn 0.10 and removed in 0.12, so parallelism is requested
# through the nearest-neighbours estimator instead. `n_neighbors=6`
# reproduces SMOTE's default `k_neighbors=5`, because a user-supplied
# estimator also counts the query sample itself.
# NOTE(review): SMOTE interpolates every column, including the integer-coded
# 'type'; SMOTENC may be more appropriate - confirm.
smote = SMOTE(random_state=42, k_neighbors=NearestNeighbors(n_neighbors=6, n_jobs=-1))
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create the DecisionTreeClassifier model with the best parameters
dt_model = DecisionTreeClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=5, random_state=42)

# Fit the model to the resampled training data.
# DecisionTreeClassifier has no `n_jobs` option, so no parallelism is
# requested for this step.
print("Training the model...")
dt_model.fit(X_train_resampled, y_train_resampled)
print("Model training completed.")

# Make predictions on the scaled, non-resampled test set
print("Making predictions...")
y_pred = dt_model.predict(X_test)
print("Predictions completed.")

# Calculate metrics
print("Calculating metrics...")
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Metrics calculation completed.")

# Display metrics
print("Confusion Matrix:")
print(conf_matrix)
print("\nMetrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Save the model
joblib.dump(dt_model, 'dt_model_best.joblib')

# Save the scaler so the same transformation can be applied to new data
joblib.dump(scaler, 'scaler.joblib')

# Save the fitted label encoder as well: without it the 'type' -> integer
# mapping the model was trained on cannot be reproduced at inference time.
joblib.dump(label_encoder, 'label_encoder.joblib')

print("Model and scaler saved.")




Training the model...
Model training completed.
Making predictions...
Predictions completed.
Calculating metrics...
Metrics calculation completed.
Confusion Matrix:
[[1270233     671]
 [     50    1570]]

Metrics:
Accuracy: 0.9994334095074042
Precision: 0.7005800981704596
Recall: 0.9691358024691358
F1-score: 0.8132608132608132
Model and scaler saved.
