In [12]:
# First step: Data cleaning and processing

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.10-cp312-cp312-manylinux2014_x86_64.whl (97.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.1/97.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.10


In [2]:
import pandas as pd
import numpy as np

In [4]:
# Change filename accordingly
df = pd.read_csv("/content/fraudTest.csv")

print("Initial Shape:", df.shape)
df.head()

Initial Shape: (54449, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371817000.0,33.986391,-81.200714,0.0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371817000.0,39.450498,-109.960431,0.0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371817000.0,40.49581,-74.196111,0.0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371817000.0,28.812398,-80.883061,0.0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371817000.0,44.959148,-85.884734,0.0


In [5]:
# Check missing values
print("\nMissing Values:\n")
print(df.isnull().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

print("\nShape after removing duplicates:", df.shape)

# Check class imbalance
print("\nFraud Distribution:\n")
print(df['is_fraud'].value_counts(normalize=True))


Missing Values:

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                1
merch_lat                1
merch_long               1
is_fraud                 1
dtype: int64

Shape after removing duplicates: (54449, 23)

Fraud Distribution:

is_fraud
0.0    0.996051
1.0    0.003949
Name: proportion, dtype: float64


In [6]:
columns_to_drop = [
    'Unnamed: 0',
    'cc_num',
    'first',
    'last',
    'street',
    'trans_num'
]

df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print("Shape after dropping unnecessary columns:", df.shape)

Shape after dropping unnecessary columns: (54449, 17)


In [7]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

df['hour'] = df['trans_date_trans_time'].dt.hour
df['day'] = df['trans_date_trans_time'].dt.day
df['month'] = df['trans_date_trans_time'].dt.month
df['weekday'] = df['trans_date_trans_time'].dt.weekday
df['is_weekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

df.drop(columns=['trans_date_trans_time', 'unix_time'], inplace=True, errors='ignore')

In [8]:
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (pd.Timestamp.now() - df['dob']).dt.days // 365

df.drop(columns=['dob'], inplace=True)

In [9]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km

    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    return R * c

df['distance'] = haversine(
    df['lat'], df['long'],
    df['merch_lat'], df['merch_long']
)

In [10]:
print("\nFinal Shape:", df.shape)
print("\nFinal Missing Values:\n")
print(df.isnull().sum())

df.head()


Final Shape: (54449, 21)

Final Missing Values:

merchant      0
category      0
amt           0
gender        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
merch_lat     1
merch_long    1
is_fraud      1
hour          0
day           0
month         0
weekday       0
is_weekend    0
age           0
distance      1
dtype: int64


Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,...,merch_lat,merch_long,is_fraud,hour,day,month,weekday,is_weekend,age,distance
0,fraud_Kirlin and Sons,personal_care,2.86,M,Columbia,SC,29209,33.9659,-80.9355,333497,...,33.986391,-81.200714,0.0,12,21,6,6,1,57,24.561462
1,fraud_Sporer-Keebler,personal_care,29.84,F,Altonah,UT,84002,40.3207,-110.436,302,...,39.450498,-109.960431,0.0,12,21,6,6,1,36,104.925092
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,Bellmore,NY,11710,40.6729,-73.5365,34496,...,40.49581,-74.196111,0.0,12,21,6,6,1,55,59.080078
3,fraud_Haley Group,misc_pos,60.05,M,Titusville,FL,32780,28.5697,-80.8191,54767,...,28.812398,-80.883061,0.0,12,21,6,6,1,38,27.698567
4,fraud_Johnston-Casper,travel,3.19,M,Falmouth,MI,49632,44.2529,-85.017,1126,...,44.959148,-85.884734,0.0,12,21,6,6,1,70,104.335106


In [11]:
# Now, Modeling process

In [13]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [14]:
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

In [15]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("Categorical Features:")
print(categorical_features)

Categorical Features:
['merchant', 'category', 'gender', 'city', 'state', 'job']


In [18]:
y.dropna(inplace=True)

X = X.loc[y.index]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

Train Shape: (43558, 20)
Test Shape: (10890, 20)


In [19]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print("Scale Pos Weight:", scale_pos_weight)

Scale Pos Weight: 252.24418604651163


In [20]:
model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    verbose=100
)

In [21]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_test, y_test),
    verbose=100
)

0:	test: 0.9729944	best: 0.9729944 (0)	total: 263ms	remaining: 2m 11s
100:	test: 0.9977810	best: 0.9979268 (95)	total: 9.59s	remaining: 37.9s
200:	test: 0.9983813	best: 0.9983834 (197)	total: 17.8s	remaining: 26.4s
300:	test: 0.9987694	best: 0.9987779 (298)	total: 25.7s	remaining: 17s
400:	test: 0.9986965	best: 0.9988037 (302)	total: 34.2s	remaining: 8.44s
499:	test: 0.9986664	best: 0.9988037 (302)	total: 38.9s	remaining: 0us

bestTest = 0.9988036559
bestIteration = 302

Shrink model to first 303 iterations.


CatBoostClassifier(depth=6, eval_metric='AUC', iterations=500, learning_rate=0.05, loss_function='Logloss', random_seed=42, scale_pos_weight=np.float64(252.24418604651163), verbose=100)

In [22]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

In [23]:
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))

Confusion Matrix:

[[10832    15]
 [    2    41]]

Classification Report:

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     10847
         1.0       0.73      0.95      0.83        43

    accuracy                           1.00     10890
   macro avg       0.87      0.98      0.91     10890
weighted avg       1.00      1.00      1.00     10890


ROC-AUC Score: 0.9988036559245831


In [24]:
model.save_model("fraud_model.cbm")