In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# show all cols in df
pd.set_option('display.max_columns', None)

In [155]:
def process_fraud_data(df):
    # Ensure 'trans_date_trans_time' is in datetime format
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['unix_time'] = pd.to_datetime(df['unix_time'], unit='s')

    # Get month of year, day of week, and hour of day for transactions
    df['trans_day_of_week'] = df['trans_date_trans_time'].dt.day_name()
    df['trans_month'] = df['trans_date_trans_time'].dt.month_name()

    # Create 'time_of_day' column
    df['time_of_day'] = (df['trans_date_trans_time'].dt.hour * 60 + df['trans_date_trans_time'].dt.minute) / (24 * 60)

    # Calculate age in years
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year - (
        (df['trans_date_trans_time'].dt.month < df['dob'].dt.month) |
        ((df['trans_date_trans_time'].dt.month == df['dob'].dt.month) & (df['trans_date_trans_time'].dt.day < df['dob'].dt.day))
    )

    # Sort the DataFrame by 'cc_num' and 'trans_date_trans_time'
    df.sort_values(by=['cc_num', 'trans_date_trans_time'], inplace=True)

    # Group by 'cc_num' and calculate the time difference
    df['last_purchased'] = df.groupby('cc_num')['trans_date_trans_time'].diff()

    # For the first transaction for each 'cc_num', set 'last_purchased' to NaT
    df['last_purchased'] = df['last_purchased'].fillna(pd.NaT)

    # Convert 'last_purchased' to numerical format (total number of seconds)
    df['last_purchased_secs'] = df['last_purchased'].dt.total_seconds()

    # Define the Haversine distance function
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Radius of the Earth in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        return R * c

    # Calculate the distance to the last purchase
    df['last_merch_lat'] = df.groupby('cc_num')['merch_lat'].shift()
    df['last_merch_long'] = df.groupby('cc_num')['merch_long'].shift()

    df['distance_last_purchase'] = haversine(df['merch_lat'], df['merch_long'], df['last_merch_lat'], df['last_merch_long'])

    # For the first transaction for each 'cc_num', set 'distance_last_purchase' to NaN
    df['distance_last_purchase'] = df['distance_last_purchase'].fillna(np.nan)

    # Create 'same_merchant_with_last_purchase' column
    df['last_merchant'] = df.groupby('cc_num')['merchant'].shift()
    df['same_merchant_with_last_purchase'] = (df['merchant'] == df['last_merchant'])

    # Drop the temporary columns
    df.drop(columns=['last_merch_lat', 'last_merch_long', 'last_merchant'], inplace=True)

    # Drop the specified columns
    df.drop(columns=['last_purchased', 'city', 'street', 'last', 'first', 'job', 'cc_num', 'merchant', 'trans_date_trans_time', 'state', 'zip', 'lat', 'long', 'merch_lat', 'merch_long', 'trans_num', 'unix_time', 'dob'], inplace=True)

    # Sort the DataFrame by the original index to maintain the original order
    df.sort_index(inplace=True)

    return df

In [156]:
train = pd.read_csv('fraudTrain.csv', index_col=0)
test = pd.read_csv('fraudTest.csv', index_col=0)

In [157]:
train_preprocessed = process_fraud_data(train)
test_preprocessed = process_fraud_data(test)

In [158]:
train.head()

Unnamed: 0,category,amt,gender,city_pop,is_fraud,trans_day_of_week,trans_month,time_of_day,age,last_purchased_secs,distance_last_purchase,same_merchant_with_last_purchase
0,misc_net,4.97,F,3495,0,Tuesday,January,0.0,30,,,False
1,grocery_pos,107.23,F,149,0,Tuesday,January,0.0,40,,,False
2,entertainment,220.11,M,4154,0,Tuesday,January,0.0,56,,,False
3,gas_transport,45.0,M,1939,0,Tuesday,January,0.000694,51,,,False
4,misc_pos,41.96,M,99,0,Tuesday,January,0.002083,32,,,False


#### Extract only the first pruchases

In [159]:
# Keep rows that have NaN in last_purchased_secs
train_preprocessed = train_preprocessed[train_preprocessed['last_purchased_secs'].isna()]
test_preprocessed = test_preprocessed[test_preprocessed['last_purchased_secs'].isna()]

In [160]:
train_preprocessed

Unnamed: 0,category,amt,gender,city_pop,is_fraud,trans_day_of_week,trans_month,time_of_day,age,last_purchased_secs,distance_last_purchase,same_merchant_with_last_purchase
0,misc_net,4.97,F,3495,0,Tuesday,January,0.000000,30,,,False
1,grocery_pos,107.23,F,149,0,Tuesday,January,0.000000,40,,,False
2,entertainment,220.11,M,4154,0,Tuesday,January,0.000000,56,,,False
3,gas_transport,45.00,M,1939,0,Tuesday,January,0.000694,51,,,False
4,misc_pos,41.96,M,99,0,Tuesday,January,0.002083,32,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1186804,gas_transport,6.47,M,3733,1,Monday,May,0.061806,57,,,False
1190005,gas_transport,12.56,F,17081,1,Tuesday,May,0.031944,65,,,False
1230734,shopping_pos,1002.39,M,31702,1,Thursday,May,0.939583,86,,,False
1258483,shopping_pos,1324.80,F,777,1,Sunday,June,0.923611,23,,,False


In [161]:
test_preprocessed

Unnamed: 0,category,amt,gender,city_pop,is_fraud,trans_day_of_week,trans_month,time_of_day,age,last_purchased_secs,distance_last_purchase,same_merchant_with_last_purchase
0,personal_care,2.86,M,333497,0,Sunday,June,0.509722,52,,,False
1,personal_care,29.84,F,302,0,Sunday,June,0.509722,30,,,False
2,health_fitness,41.28,F,34496,0,Sunday,June,0.509722,49,,,False
3,misc_pos,60.05,M,54767,0,Sunday,June,0.510417,32,,,False
4,travel,3.19,M,1126,0,Sunday,June,0.510417,64,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...
327231,shopping_net,1041.54,F,837792,1,Saturday,October,0.856944,83,,,False
328127,gas_transport,12.40,F,1071,1,Sunday,October,0.289583,22,,,False
342954,travel,10.53,F,20226,1,Saturday,October,0.733333,50,,,False
422405,gas_transport,9.16,M,1920,1,Wednesday,December,0.004861,23,,,False


In [162]:
# unique values in each col
for col in train_preprocessed.columns:
    print(col, train_preprocessed[col].nunique())

category 14
amt 952
gender 2
city_pop 879
is_fraud 2
trans_day_of_week 7
trans_month 12
time_of_day 683
age 81
last_purchased_secs 0
distance_last_purchase 0
same_merchant_with_last_purchase 1


In [163]:
# Drop last_purchased_secs, distance_last_purchase, same_merchant_with_last_purchase
train_preprocessed = train_preprocessed.drop(columns=['last_purchased_secs', 'distance_last_purchase', 'same_merchant_with_last_purchase'])
test_preprocessed = test_preprocessed.drop(columns=['last_purchased_secs', 'distance_last_purchase', 'same_merchant_with_last_purchase'])
train_preprocessed.head()

Unnamed: 0,category,amt,gender,city_pop,is_fraud,trans_day_of_week,trans_month,time_of_day,age
0,misc_net,4.97,F,3495,0,Tuesday,January,0.0,30
1,grocery_pos,107.23,F,149,0,Tuesday,January,0.0,40
2,entertainment,220.11,M,4154,0,Tuesday,January,0.0,56
3,gas_transport,45.0,M,1939,0,Tuesday,January,0.000694,51
4,misc_pos,41.96,M,99,0,Tuesday,January,0.002083,32


In [164]:
train_preprocessed.dtypes

category              object
amt                  float64
gender                object
city_pop               int64
is_fraud               int64
trans_day_of_week     object
trans_month           object
time_of_day          float64
age                    int32
dtype: object

In [165]:
categorical_columns = train_preprocessed.select_dtypes(include=['object', 'category']).columns

# One-hot encode the categorical columns
train_preprocessed = pd.get_dummies(train_preprocessed, columns=categorical_columns)
test_preprocessed = pd.get_dummies(test_preprocessed, columns=categorical_columns)

# Ensure both train and test sets have the same columns after encoding
train_preprocessed, test_preprocessed = train_preprocessed.align(test_preprocessed, join='outer', axis=1, fill_value=0)

# Convert any boolean columns to integers (0 and 1)
bool_columns_train = train_preprocessed.select_dtypes(include=['bool']).columns
bool_columns_test = test_preprocessed.select_dtypes(include=['bool']).columns

train_preprocessed[bool_columns_train] = train_preprocessed[bool_columns_train].astype(int)
test_preprocessed[bool_columns_test] = test_preprocessed[bool_columns_test].astype(int)

train_preprocessed.head()

Unnamed: 0,age,amt,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,city_pop,gender_F,gender_M,is_fraud,time_of_day,trans_day_of_week_Friday,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_April,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September
0,30,4.97,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3495,1,0,0,0.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,40,107.23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,149,1,0,0,0.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
2,56,220.11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4154,0,1,0,0.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,51,45.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1939,0,1,0,0.000694,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
4,32,41.96,0,0,0,0,0,0,0,0,0,1,0,0,0,0,99,0,1,0,0.002083,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [166]:
# Count how many fraud and non-fraud transactions are in the train set and the test set
train_preprocessed['is_fraud'].value_counts(), test_preprocessed['is_fraud'].value_counts()

(is_fraud
 0    908
 1     75
 Name: count, dtype: int64,
 is_fraud
 0    904
 1     20
 Name: count, dtype: int64)

In [167]:
# Combine both train and test and split them again
combined = pd.concat([train_preprocessed, test_preprocessed], axis=0)
target = 'is_fraud'

X_combined = combined.drop(columns=[target])
y_combined = combined[target]

X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=41)

In [168]:
y_train.value_counts(), y_test.value_counts()

(is_fraud
 0    1451
 1      74
 Name: count, dtype: int64,
 is_fraud
 0    361
 1     21
 Name: count, dtype: int64)

### Random Forest

In [169]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Count the number of fraudulent transactions identified by the model
num_fraud_identified = sum(y_pred == 1)

# Count the number of actual fraudulent transactions in the test set
num_actual_fraud = sum(y_test == 1)

print(f'Number of fraudulent transactions identified: {num_fraud_identified}')
print(f'Number of actual fraudulent transactions: {num_actual_fraud}')

Accuracy: 0.9973821989528796
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       361
           1       1.00      0.95      0.98        21

    accuracy                           1.00       382
   macro avg       1.00      0.98      0.99       382
weighted avg       1.00      1.00      1.00       382

Number of fraudulent transactions identified: 20
Number of actual fraudulent transactions: 21


In [170]:
# Concatenate X_test and y_test
test_df = X_test.copy()
test_df['Actual'] = y_test.values
test_df['Predicted'] = y_pred

# Filter the DataFrame to show only the rows where the model predicted fraud
fraudulent_transactions = test_df[test_df['Predicted'] == 1]

# Display the fraudulent transactions
fraudulent_transactions

Unnamed: 0,age,amt,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,city_pop,gender_F,gender_M,time_of_day,trans_day_of_week_Friday,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_April,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September,Actual,Predicted
1041055,57,303.31,0,0,0,0,1,0,0,0,0,0,0,0,0,0,14267,1,0,0.163889,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
71934,23,284.58,0,0,0,0,1,0,0,0,0,0,0,0,0,0,12626,1,0,0.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
812497,20,918.58,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1888,1,0,0.575694,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
177985,17,9.47,0,0,1,0,0,0,0,0,0,0,0,0,0,0,13021,0,1,0.088194,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1
872268,21,970.6,0,0,0,0,0,0,0,0,0,0,0,0,1,0,12478,1,0,0.856944,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
940752,85,928.47,0,0,0,0,0,0,0,0,0,0,0,1,0,0,68211,0,1,0.935417,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1
683679,23,711.63,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2906700,0,1,0.143056,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
93519,93,287.45,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4431,1,0,0.019444,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
132187,78,272.15,0,0,0,0,1,0,0,0,0,0,0,0,0,0,478404,0,1,0.038194,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
1051726,51,269.15,0,0,0,0,1,0,0,0,0,0,0,0,0,0,64438,1,0,0.303472,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1


### XGBoost

In [171]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Initialize the XGBoost model
xgb_model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Count the number of fraudulent transactions identified by the model
num_fraud_identified = sum(y_pred == 1)

# Count the number of actual fraudulent transactions in the test set
num_actual_fraud = sum(y_test == 1)

print(f'Number of fraudulent transactions identified: {num_fraud_identified}')
print(f'Number of actual fraudulent transactions: {num_actual_fraud}')

Accuracy: 0.9973821989528796
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       361
           1       0.95      1.00      0.98        21

    accuracy                           1.00       382
   macro avg       0.98      1.00      0.99       382
weighted avg       1.00      1.00      1.00       382

Number of fraudulent transactions identified: 22
Number of actual fraudulent transactions: 21


In [172]:
# Concatenate X_test and y_test
test_df = X_test.copy()
test_df['Actual'] = y_test.values
test_df['Predicted'] = y_pred

# Filter the DataFrame to show only the rows where the model predicted fraud
fraudulent_transactions = test_df[test_df['Predicted'] == 1]

# Display the fraudulent transactions
fraudulent_transactions

Unnamed: 0,age,amt,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,city_pop,gender_F,gender_M,time_of_day,trans_day_of_week_Friday,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_April,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September,Actual,Predicted
1041055,57,303.31,0,0,0,0,1,0,0,0,0,0,0,0,0,0,14267,1,0,0.163889,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
71934,23,284.58,0,0,0,0,1,0,0,0,0,0,0,0,0,0,12626,1,0,0.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
812497,20,918.58,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1888,1,0,0.575694,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
177985,17,9.47,0,0,1,0,0,0,0,0,0,0,0,0,0,0,13021,0,1,0.088194,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1
872268,21,970.6,0,0,0,0,0,0,0,0,0,0,0,0,1,0,12478,1,0,0.856944,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
940752,85,928.47,0,0,0,0,0,0,0,0,0,0,0,1,0,0,68211,0,1,0.935417,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1
7580,34,1067.73,0,0,0,0,0,0,0,0,0,0,0,1,0,0,299480,1,0,0.95625,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
683679,23,711.63,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2906700,0,1,0.143056,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
93519,93,287.45,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4431,1,0,0.019444,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
132187,78,272.15,0,0,0,0,1,0,0,0,0,0,0,0,0,0,478404,0,1,0.038194,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1


### Logistic Regression

In [175]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
logreg_model.fit(X_train, y_train)

# Make predictions
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Count the number of fraudulent transactions identified by the model
num_fraud_identified = sum(y_pred == 1)

# Count the number of actual fraudulent transactions in the test set
num_actual_fraud = sum(y_test == 1)

print(f'Number of fraudulent transactions identified: {num_fraud_identified}')
print(f'Number of actual fraudulent transactions: {num_actual_fraud}')

Accuracy: 0.9450261780104712
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       361
           1       0.50      0.19      0.28        21

    accuracy                           0.95       382
   macro avg       0.73      0.59      0.62       382
weighted avg       0.93      0.95      0.93       382

Number of fraudulent transactions identified: 8
Number of actual fraudulent transactions: 21


In [176]:
# Concatenate X_test and y_test
test_df = X_test.copy()
test_df['Actual'] = y_test.values
test_df['Predicted'] = y_pred

# Filter the DataFrame to show only the rows where the model predicted fraud
fraudulent_transactions = test_df[test_df['Predicted'] == 1]

# Display the fraudulent transactions
fraudulent_transactions

Unnamed: 0,age,amt,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,city_pop,gender_F,gender_M,time_of_day,trans_day_of_week_Friday,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_April,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September,Actual,Predicted
812497,20,918.58,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1888,1,0,0.575694,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
872268,21,970.6,0,0,0,0,0,0,0,0,0,0,0,0,1,0,12478,1,0,0.856944,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1
7580,34,1067.73,0,0,0,0,0,0,0,0,0,0,0,1,0,0,299480,1,0,0.95625,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
683679,23,711.63,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2906700,0,1,0.143056,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
232,49,1055.47,0,0,0,0,0,0,0,0,0,0,0,1,0,0,648,0,1,0.120833,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
428,31,1881.53,0,0,0,0,0,0,0,0,0,0,0,0,1,0,530,1,0,0.609722,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
211823,20,739.85,0,0,0,0,0,0,0,0,1,0,0,0,0,0,493806,1,0,0.031944,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
767,25,420.98,1,0,0,0,0,0,0,0,0,0,0,0,0,0,22191,1,0,0.690972,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


### SVM

In [177]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Initialize the SVM model
svm_model = SVC(random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Count the number of fraudulent transactions identified by the model
num_fraud_identified = sum(y_pred == 1)

# Count the number of actual fraudulent transactions in the test set
num_actual_fraud = sum(y_test == 1)

print(f'Number of fraudulent transactions identified: {num_fraud_identified}')
print(f'Number of actual fraudulent transactions: {num_actual_fraud}')

Accuracy: 0.9450261780104712
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       361
           1       1.00      0.00      0.00        21

    accuracy                           0.95       382
   macro avg       0.97      0.50      0.49       382
weighted avg       0.95      0.95      0.92       382

Number of fraudulent transactions identified: 0
Number of actual fraudulent transactions: 21


In [178]:
# Concatenate X_test and y_test
test_df = X_test.copy()
test_df['Actual'] = y_test.values
test_df['Predicted'] = y_pred

# Filter the DataFrame to show only the rows where the model predicted fraud
fraudulent_transactions = test_df[test_df['Predicted'] == 1]

# Display the fraudulent transactions
fraudulent_transactions

Unnamed: 0,age,amt,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,city_pop,gender_F,gender_M,time_of_day,trans_day_of_week_Friday,trans_day_of_week_Monday,trans_day_of_week_Saturday,trans_day_of_week_Sunday,trans_day_of_week_Thursday,trans_day_of_week_Tuesday,trans_day_of_week_Wednesday,trans_month_April,trans_month_August,trans_month_December,trans_month_February,trans_month_January,trans_month_July,trans_month_June,trans_month_March,trans_month_May,trans_month_November,trans_month_October,trans_month_September,Actual,Predicted


### XGBoost is the Best Model