# SkopeRules

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from skrules import SkopeRules
from preprocessing import *
import matplotlib.pyplot as plt 
#import shap 
import numpy as np

In [20]:
le = LabelEncoder()

def clean_data(filename):

    print("Importing data...")
    data = pd.read_csv(datapath+filename, index_col=0)

    # Separate majority and minority classes
    df_majority = data[data['is_fraud'] == 0]
    df_minority = data[data['is_fraud'] == 1]

    # Downsample majority class to match the minority class size (or a bit more if you like)
    df_majority_downsampled = df_majority.sample(n=10000, random_state=42)

    # Combine minority class with downsampled majority
    df_balanced = pd.concat([df_minority, df_majority_downsampled])

    # Shuffle the dataset
    data = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print("splitting time columns...")
    data["trans_date_trans_time"] = pd.to_datetime(data["trans_date_trans_time"])

    #data['trans_minute'] = data['trans_date_trans_time'].dt.minute
    #data['trans_hour'] = data['trans_date_trans_time'].dt.hour
    #data['trans_day'] = data['trans_date_trans_time'].dt.day
    data['trans_month'] = data['trans_date_trans_time'].dt.month
    data['trans_year'] = data['trans_date_trans_time'].dt.year
    data['trans_dayofweek'] = data['trans_date_trans_time'].dt.dayofweek

    data["dob"] = pd.to_datetime(data["dob"])
    #data["dob_day"] = data["dob"].dt.day
    data["dob_month"] = data["dob"].dt.month
    data["dob_year"] = data["dob"].dt.year

    print("Dropping columns...")
    data = data.drop(["cc_num","long", "merch_long", "lat", 
                      "merch_lat", "unix_time", 
                      "trans_date_trans_time",
                      "first", "last", "dob","trans_num","zip"
                      ], axis=1)
    
    print("Rounding columns...")
    data[["trans_month", "trans_year", "trans_dayofweek", "dob_month", "dob_day", "city_pop"]] = data[["trans_month", "trans_year", "trans_dayofweek", "dob_year", "dob_month", "city_pop"]].round(decimals=0)

    data["amt"] = data["amt"].round(decimals=2)

    cat_data = data.select_dtypes(include=["object"])

    print(data.columns)

    print("Encoding categorical features...")
    label_encoders = {}
    inverse_mappings = {}

    for col in cat_data.columns:
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
        inverse_mappings[col] = dict(zip(le.transform(le.classes_), le.classes_))

    print("Rounding categorical columns...")
    data[["merchant", "category", "street", "city", "state", "job"]] = data[["merchant", "category", "street", "city", "state", "job"]].round(decimals=0)

    
    print(data.head())


    return data, inverse_mappings

In [21]:
# Load the dataset 
data, inverse_mappings = clean_data("fraud.csv")  

Importing data...
splitting time columns...
Dropping columns...
Rounding columns...
Index(['merchant', 'category', 'amt', 'gender', 'street', 'city', 'state',
       'city_pop', 'job', 'is_fraud', 'trans_month', 'trans_year',
       'trans_dayofweek', 'dob_month', 'dob_year', 'dob_day'],
      dtype='object')
Encoding categorical features...
Rounding categorical columns...
   merchant  category     amt  gender  street  city  state  city_pop  job  \
0       365         2   70.61       1     264   794     27     14075   56   
1       179        12   68.16       1     742   634     17      1565  124   
2        78         7  133.34       1     939   604      4    381459   43   
3       586         0  614.72       0     534    47     41        63  447   
4       304        11  997.39       1     454   194     43   1263321  460   

   is_fraud  trans_month  trans_year  trans_dayofweek  dob_month  dob_year  \
0         0            7        2019                3       1929      1929   
1    

In [22]:
# Splitting features and target variable
X = data.drop("is_fraud", axis=1)
y = data["is_fraud"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [23]:
# Step 3: Fit the SkopeRules model
skope = SkopeRules(
    feature_names=X_train.columns,
    precision_min=0.5,
    recall_min=0.01,
    n_estimators=30,
    random_state=42
)

print("fitting the model")
skope.fit(X_train, y_train)

fitting the model


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


In [24]:
#def predict_as_proba(X):
#    preds = skope.predict(X)  # skope.predict returns 0 or 1
#    return np.vstack([1 - preds, preds]).T  # Turn into probabilities

In [25]:
# Pick some explanation baseline 
#X_explain = X_train.sample(100, random_state=42)

# Use shap.Explainer (not KernelExplainer)
#explainer = shap.KernelExplainer(predict_as_proba, X_explain)

# Compute SHAP values
#X_test_sample = X_test.sample(100, random_state=42)
#shap_values = explainer.shap_values(X_test)

#shap.summary_plot(shap_values[1], X_test_sample, plot_type="violin", show=False)
#plt.savefig("shap_summary_violin2.png")
#plt.close()

In [26]:
print("finding rules for is_fraud")
rules = skope.rules_[:10]

print("printing rules")
for rule in rules:
    print(rule)
    print()
    print(20*'=')
    print()

finding rules for is_fraud
printing rules
('category <= 12.5 and amt <= 1435.4800415039062 and amt > 238.31500244140625', (0.976536312849162, 0.7680140597539543, 2))


('category <= 12.5 and amt <= 1435.4800415039062 and amt > 238.1300048828125', (0.9684329199549042, 0.7655971479500892, 2))


('category <= 12.5 and amt <= 1435.4800415039062 and amt > 235.1699981689453', (0.9735901157859534, 0.7611994904609551, 4))


('category <= 12.5 and amt <= 1435.4800415039062 and amt > 235.8499984741211', (0.9668141592920354, 0.7633187772925765, 2))


('category <= 12.5 and amt <= 1416.6099853515625 and amt > 235.8499984741211', (0.9666307857911733, 0.7597292724196277, 2))


('category <= 12.5 and amt <= 1435.4800415039062 and amt > 245.5800018310547', (0.970225730071034, 0.7567390367504212, 4))


('category <= 12.5 and amt <= 1435.4800415039062 and amt > 236.2949981689453', (0.9668540219288098, 0.7572575074516267, 8))


('category <= 12.5 and amt <= 1416.6099853515625 and amt > 235.1699981689453'

In [27]:
# Step 4: Predict and evaluate
y_pred_skope = skope.predict(X_test)

print("SkopeRules Classification Report:")
print(classification_report(y_test, y_pred_skope))

SkopeRules Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.79      0.86      3966
           1       0.81      0.95      0.88      3895

    accuracy                           0.87      7861
   macro avg       0.88      0.87      0.87      7861
weighted avg       0.88      0.87      0.87      7861



In [28]:
# Confusion matrix
cm = confusion_matrix(y_test, y_pred_skope)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Fraud", "Fraud"])
disp.plot(cmap="Oranges")

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x154523af0>