# Importing the necessary libraries

In [1]:
import pandas as pd
from datetime import date
from datetime import datetime
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

# About the data

This is a simulated credit card transaction dataset containing legitimate and fraud transactions from the duration 1st Jan 2019 - 31st Dec 2020. It covers credit cards of 1000 customers doing transactions with a pool of 800 merchants.


Source of Simulation
This was generated using Sparkov Data Generation | Github tool created by Brandon Harris. This simulation was run for the duration - 1 Jan 2019 to 31 Dec 2020. The files were combined and converted into a standard format.


Information about the Simulator

I do not own the simulator. I used the one used by Brandon Harris and just to understand how it works, I went through few portions of the code. This is what I understood from what I read:

The simulator has certain pre-defined list of merchants, customers and transaction categories. And then using a python library called "faker", and with the number of customers, merchants that you mention during simulation, an intermediate list is created.

After this, depending on the profile you choose for e.g. "adults 2550 female rural.json" (which means simulation properties of adult females in the age range of 25-50 who are from rural areas), the transactions are created. Say, for this profile, you could check "Sparkov | Github | adults_2550_female_rural.json", there are parameter value ranges defined in terms of min, max transactions per day, distribution of transactions across days of the week and normal distribution properties (mean, standard deviation) for amounts in various categories. Using these measures of distributions, the transactions are generated using faker.

What I did was generate transactions across all profiles and then merged them together to create a more realistic representation of simulated transactions.

Acknowledgements

Brandon Harris for his amazing work in creating this easy-to-use simulation tool for creating fraud transaction datasets.

### Link: https://www.kaggle.com/datasets/kartik2112/fraud-detection

### The columns:

1. `trans_date_trans_time`: The date and time of the transaction, represented as a string in the format "YYYY-MM-DD HH:MM:SS".

2. `cc_num`: The credit card number used for the transaction, represented as a string.

3. `merchant`: The name of the merchant where the transaction took place, represented as a string.

4. `category`: The category of the merchant (e.g., "grocery", "fuel", "dining"), represented as a string.

5. `amt`: The amount of the transaction, represented as a float.

6. `first`: The first name of the cardholder, represented as a string.

7. `last`: The last name of the cardholder, represented as a string.

8. `gender`: The gender of the cardholder, represented as a string ("M" for male, "F" for female).

9. `street`: The street address of the cardholder, represented as a string.

10. `city`: The city where the cardholder lives, represented as a string.

11. `state`: The state where the cardholder lives, represented as a string.

12. `zip`: The ZIP code where the cardholder lives, represented as a string.

13. `lat`: The latitude of the merchant where the transaction took place, represented as a float.

14. `long`: The longitude of the merchant where the transaction took place, represented as a float.

15. `city_pop`: The population of the city where the merchant is located, represented as an integer.

16. `job`: The occupation of the cardholder, represented as a string.

17. `dob`: The date of birth of the cardholder, represented as a string in the format "YYYY-MM-DD".

18. `trans_num`: The unique identifier for the transaction, represented as a string.

19. `unix_time`: The date and time of the transaction, represented as a Unix timestamp (i.e., the number of seconds since January 1, 1970).

20. `merch_lat`: The latitude of the merchant where the transaction took place, represented as a float.

21. `merch_long`: The longitude of the merchant where the transaction took place, represented as a float.

22. `is_fraud`: A binary variable indicating whether the transaction is fraudulent (1) or not (0), represented as an integer.

# Data Preprocessing & Feature Engineering

In [27]:
train_df = pd.read_csv("fraudTrain.csv")
train_df

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,37.7175,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,39.2667,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,32.9396,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,...,43.3526,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.788940,-103.241160,0


In [28]:
test_df = pd.read_csv("fraudTest.csv")
test_df

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.4360,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.495810,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.0170,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,555714,2020-12-31 23:59:07,30560609640617,fraud_Reilly and Sons,health_fitness,43.77,Michael,Olson,M,558 Michael Estates,...,40.4931,-91.8912,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
555715,555715,2020-12-31 23:59:09,3556613125071656,fraud_Hoppe-Parisian,kids_pets,111.84,Jose,Vasquez,M,572 Davis Mountains,...,29.0393,-95.4401,28739,Futures trader,1999-12-27,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
555716,555716,2020-12-31 23:59:15,6011724471098086,fraud_Rau-Robel,kids_pets,86.88,Ann,Lawson,F,144 Evans Islands Apt. 683,...,46.1966,-118.9017,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.658340,-119.715054,0
555717,555717,2020-12-31 23:59:24,4079773899158,fraud_Breitenberg LLC,travel,7.99,Eric,Preston,M,7020 Doyle Stream Apt. 951,...,44.6255,-116.4493,129,Cartographer,1965-12-15,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0


### Extracting the age of the card holders

In [29]:
def calculate_age(dob):
    
    dob = datetime.strptime(dob, '%Y-%m-%d').date()
    today = date.today()
    age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    return age


In [30]:
train_df['age'] = train_df['dob'].apply(calculate_age)

In [31]:
test_df['age'] = test_df['dob'].apply(calculate_age)

### Encoding categorical features

In [32]:
# For training set
le = LabelEncoder()
train_df['category'] = train_df['category'].astype(str)
le.fit(train_df['category'])
train_df['category'] = le.transform(train_df['category'])



# For testing set
le = LabelEncoder()
test_df['category'] = test_df['category'].astype(str)
le.fit(test_df['category'])
test_df['category'] = le.transform(test_df['category'])

In [33]:
# For training set
le = LabelEncoder()
train_df['merchant'] = train_df['merchant'].astype(str)
le.fit(train_df['merchant'])
train_df['merchant'] = le.transform(train_df['merchant'])



# For testing set
le = LabelEncoder()
test_df['merchant'] = test_df['merchant'].astype(str)
le.fit(test_df['merchant'])
test_df['merchant'] = le.transform(test_df['merchant'])

In [34]:
# For training set
le = LabelEncoder()
train_df['gender'] = train_df['gender'].astype(str)
le.fit(train_df['gender'])
train_df['gender'] = le.transform(train_df['gender'])



# For testing set
le = LabelEncoder()
test_df['gender'] = test_df['gender'].astype(str)
le.fit(test_df['gender'])
test_df['gender'] = le.transform(test_df['gender'])

In [35]:
# For training set
le = LabelEncoder()
train_df['city'] = train_df['city'].astype(str)
le.fit(train_df['city'])
train_df['city'] = le.transform(train_df['city'])



# For testing set
le = LabelEncoder()
test_df['city'] = test_df['city'].astype(str)
le.fit(test_df['city'])
test_df['city'] = le.transform(test_df['city'])

In [36]:
# For training set
le = LabelEncoder()
train_df['state'] = train_df['state'].astype(str)
le.fit(train_df['state'])
train_df['state'] = le.transform(train_df['state'])



# For testing set
le = LabelEncoder()
test_df['state'] = test_df['state'].astype(str)
le.fit(test_df['state'])
test_df['state'] = le.transform(test_df['state'])

In [37]:
# For training set
le = LabelEncoder()
train_df['zip'] = train_df['zip'].astype(str)
le.fit(train_df['zip'])
train_df['zip'] = le.transform(train_df['zip'])



# For testing set
le = LabelEncoder()
test_df['zip'] = test_df['zip'].astype(str)
le.fit(test_df['zip'])
test_df['zip'] = le.transform(test_df['zip'])

In [38]:
# For training set
le = LabelEncoder()
train_df['job'] = train_df['job'].astype(str)
le.fit(train_df['job'])
train_df['job'] = le.transform(train_df['job'])



# For testing set
le = LabelEncoder()
test_df['job'] = test_df['job'].astype(str)
le.fit(test_df['job'])
test_df['job'] = le.transform(test_df['job'])

### Standardizing Numerical features

In [39]:
# For training set
sc = StandardScaler()

train_df['amt'] = sc.fit_transform(train_df['amt'].values.reshape(-1, 1))
train_df['age'] = sc.fit_transform(train_df['age'].values.reshape(-1, 1))


# For testing set
sc = StandardScaler()

test_df['amt'] = sc.fit_transform(test_df['amt'].values.reshape(-1, 1))
test_df['age'] = sc.fit_transform(test_df['age'].values.reshape(-1, 1))

# Modeling

### split the training and testing datasets into features and labels

In [42]:
features = ['category', 'merchant', 'gender', 'city', 'state', 'zip', 'job', 'amt', 'age']
label = ['is_fraud']

train_df_sample = train_df.sample(n = 50000, random_state = 42)
test_df_sample = test_df.sample(n = 10000, random_state = 42)

train_features = train_df_sample[features]
train_labels = train_df_sample[label]
test_features = test_df_sample[features]
test_labels = test_df_sample[label]

### Setting up XGBoost Classifier

In [16]:
xgb_clf = xgb.XGBClassifier()

params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
    }

### Fitting through GridSearchCV for hyperparameter optimization

In [17]:
grid_search = GridSearchCV(xgb_clf, params, cv = 10, scoring = 'accuracy', n_jobs = -1)

grid_search.fit(train_features, train_labels)

### The best hyperparameters found

In [18]:
print('Best Hyperparameters:', grid_search.best_params_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


### Using the best hyperparameters to train the XGBoost classifier on the entire training set

In [43]:
xgb_clf = xgb.XGBClassifier(**grid_search.best_params_)
xgb_clf.fit(train_features, train_labels)

# Performance Metrics

### Predicting churn likelihood on the testing set (unseen data)

In [44]:
test_preds = xgb_clf.predict(test_features)

### The accuracy of the XGBoost classifier on the testing set

In [45]:
test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy = {test_accuracy:.2f}')

Test Accuracy = 1.00


### The recall of the XGBoost classifier on the testing set

In [46]:
test_recall = recall_score(test_labels, test_preds)
print(f'Test Recall = {test_recall:.2f}')

Test Recall = 0.75


### The precision of the XGBoost classifier on the testing set

In [47]:
test_precision = precision_score(test_labels, test_preds)
print(f'Test Precision = {test_precision:.2f}')

Test Precision = 0.83


### The f1-score of the XGBoost classifier on the testing set

In [48]:
test_f1 = f1_score(test_labels, test_preds)
print(f'Test F1-score = {test_f1:.2f}')

Test F1-score = 0.79


# Summarizing the results

In [50]:
results_df = pd.DataFrame(
    {
        'First Name': test_df_sample['first'],
        'Last Name': test_df_sample['last'],
        'Prediction': test_preds
        
    }
)

results_df

Unnamed: 0,First Name,Last Name,Prediction
119106,Kimberly,Myers,0
179292,Samuel,Sandoval,0
540729,Helen,Campbell,0
374360,Samuel,Sandoval,0
314574,Christie,Williamson,0
...,...,...,...
417956,Stacy,Villegas,0
185944,Mary,Schmidt,0
545868,Jason,Mcmahon,0
162569,Bill,Lane,0


Where, 0 = not default & 1 = default

In [51]:
results_df.to_csv("Results.csv", index = False)

### Saving the model

In [52]:
with open("Best_XGBoost_classifier.pkl", 'wb') as file:

    pickle.dump(grid_search, file)

# Thanks for reading