In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/content/card_transdata-1 (1).csv')

In [4]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


In [5]:
df.shape

(1000000, 8)

In [6]:
#分割x和y
X = df.drop('fraud', axis=1)
y = df['fraud']

#前后分割的train和test
#iloc+数字，y只有1列
X_train = X.iloc[:500000,:]
X_test  = X.iloc[500000:,:]
y_train = y.iloc[:500000]
y_test  = y.iloc[500000:]

In [7]:
# Identify the indices of the categorical features
categorical_features_indices = [X_train.columns.get_loc(col)
                                for col in ['repeat_retailer',"used_chip","used_pin_number","online_order"]]
categorical_features_indices

[3, 4, 5, 6]

In [None]:
# Identify the indices of the categorical features
categorical_features_indices = [X_train.columns.get_loc(col)
                                for col in ['repeat_retailer',"used_chip","used_pin_number","online_order"]]

# Initialize lists to store metrics
training_f1_scores = []
testing_f1_scores = []
gammas = np.arange(0.1, 1.1, 0.1)  # From 0.1 to 1 in increments of 0.1

for gamma in gammas:
    # 控制了要生成的少数类样本相对于多数类样本的数量比例
    #SMOTENC的做法
    smotenc = SMOTENC(categorical_features=categorical_features_indices,
                      sampling_strategy=gamma, random_state=42)
    X_res, y_res = smotenc.fit_resample(X_train, y_train)

    # Fit a decision tree classifier
    clf = DecisionTreeClassifier(criterion='entropy', random_state=67)
    clf.fit(X_res, y_res)

    # Calculate metrics for training set
    y_pred_train = clf.predict(X_res)
    precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_res, y_pred_train, average='binary')

    # Calculate metrics for testing set
    y_pred_test = clf.predict(X_test)
    precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

    # Store F1 scores
    training_f1_scores.append(f1_train)
    testing_f1_scores.append(f1_test)

# Plotting the F1 scores
plt.figure(figsize=(10, 6))
plt.plot(gammas, training_f1_scores, label='Training F1 Score')
plt.plot(gammas, testing_f1_scores, label='Testing F1 Score')
plt.xlabel('Gamma')
plt.ylabel('F1 Score')
plt.title('F1 Score vs. Gamma')
plt.legend()
plt.show()

From the graph, it appears that:

The training F1 score is consistently high across all values of γ, which is typical as the model is directly trained on this data.
The testing F1 score fluctuates as γ changes. The fact that the testing score varies suggests that the model's ability to generalize to unseen data is affected by the level of oversampling.
The performance of the decision tree on the testing set, as measured by the F1 score, doesn't show a clear improvement with increasing
γ. The F1 score decreases initially and then shows some volatility with various peaks and troughs. This could be indicative of overfitting at certain levels of oversampling, where the decision tree becomes too tailored to the oversampled training data and performs worse on the testing set.

In [None]:
X = df.drop('fraud', axis=1)
y = df['fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5, random_state=42)

clf = DecisionTreeClassifier(criterion='entropy',
                             random_state=67)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, y_pred_train, average='binary')

y_pred_test = clf.predict(X_test)
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

print("F1 score on training set without SMOTE:",f1_train)
print("F1 score on testing set without SMOTE:",f1_test)

F1 score on training set without SMOTE: 1.0
F1 score on testing set without SMOTE: 0.9998397252432742


Comparing to the original model, we find that SMOTE did not improve the performance of our model performance. When gamma goes up, SMOTE model works even worse than the original model in this example.