In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.preprocessing import OneHotEncoder

# ---------------------------------------------------------------------------
# Load the training data and engineer the model features.
#
# Names defined here that later cells read: `df`, `cc_risk_rate`,
# `name_risk_rate`, `city_risk_rate`, `state_risk_rate`, `merchant_risk_rate`,
# `encoder`, `features_to_encode`.
# ---------------------------------------------------------------------------
train_data = pd.read_csv('./cs-506-extra-credit/train.csv')

# Per-column count of missing values (shown before any column is removed).
print(train_data.isnull().sum())

# Row identifiers are dropped before feature engineering.
train_data = train_data.drop(columns=['id', 'trans_num'])

# `df` starts out as an alias of `train_data`: the column assignments below are
# therefore also visible on `train_data` until `df` is rebound by the first
# `df = df.drop(...)` further down.
df = train_data

duplicate_rows = train_data.duplicated().sum()

# --- Date / time features ----------------------------------------------------
# NOTE(review): no explicit `format=` is passed, so pandas has to infer it;
# pass the real format once it has been confirmed against the CSV.
df['trans_date'] = pd.to_datetime(df['trans_date'])
df['trans_time'] = pd.to_datetime(df['trans_time'])
df['dob'] = pd.to_datetime(df['dob'])

# 1 when the transaction date falls inside 2024-01-02 .. 2024-01-25 (inclusive).
df['have_risk_day'] = ((df['trans_date'] >= pd.to_datetime('2024-01-02')) &
                       (df['trans_date'] <= pd.to_datetime('2024-01-25'))).astype(int)

# 1 when the transaction hour is before 04:00 or after 21:59.
# The hour is kept in a local series instead of a temporary column.
trans_hour = df['trans_time'].dt.hour
df['hight_risk_hour'] = ((trans_hour < 4) | (trans_hour > 21)).astype(int)

# Age in whole years, approximated as elapsed days // 365.
df['age'] = (df['trans_date'] - df['dob']).dt.days // 365
df = df.drop(columns=['trans_date', 'trans_time'])

# --- Target-rate ("risk rate") features --------------------------------------
# Each feature is the mean of `is_fraud` within a group, mapped back onto rows.
# NOTE(review): the rates are computed over every row of `df`, and the
# train/validation split happens in a later cell, so each row's own label (and
# the validation labels) feed these features. Validation scores are likely
# optimistic; consider computing the rates on the training fold only.
cc_risk_rate = df.groupby('cc_num')['is_fraud'].mean()
df['cc_risk_rate'] = df['cc_num'].map(cc_risk_rate)

df['name'] = df['first'] + ' ' + df['last']
name_risk_rate = df.groupby('name')['is_fraud'].mean()
df['name_risk_rate'] = df['name'].map(name_risk_rate)

city_risk_rate = df.groupby('city')['is_fraud'].mean()
state_risk_rate = df.groupby('state')['is_fraud'].mean()
df['city_risk_rate'] = df['city'].map(city_risk_rate)
df['state_risk_rate'] = df['state'].map(state_risk_rate)

merchant_risk_rate = df.groupby('merchant')['is_fraud'].mean()
df['merchant_risk_rate'] = df['merchant'].map(merchant_risk_rate)

# --- Amount bucket -----------------------------------------------------------
# 0.5 for 0 < amt <= 300, 1.0 for 300 < amt <= 1400, 0.0 otherwise.
# The column is created as float up front: creating it as int and then writing
# 0.5 into it is a deprecated incompatible-dtype assignment in pandas.
df['amt_risk_rate'] = 0.0
df.loc[(df['amt'] > 0) & (df['amt'] <= 300), 'amt_risk_rate'] = 0.5
df.loc[(df['amt'] > 300) & (df['amt'] <= 1400), 'amt_risk_rate'] = 1.0

# --- Cardholder-to-merchant distance (km) ------------------------------------
df['distance'] = df.apply(
    lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).km,
    axis=1
)

# --- One-hot encoding --------------------------------------------------------
# `handle_unknown='ignore'` does not change the output of fit_transform here;
# it only lets this fitted encoder be reused on data containing categories that
# were not present in the training file.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
features_to_encode = ['gender', 'category', 'merchant']

encoded = encoder.fit_transform(df[features_to_encode])
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(features_to_encode),
    # Reuse df's index so the axis=1 concat below pairs rows by label even if
    # df's index is not a plain 0..n-1 range.
    index=df.index
)

df = pd.concat([df.drop(columns=features_to_encode), encoded_df], axis=1)

print(df.head())


id            0
trans_num     0
trans_date    0
trans_time    0
unix_time     0
category      0
amt           0
cc_num        0
first         0
last          0
gender        0
street        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
dob           0
merchant      0
merch_lat     0
merch_long    0
is_fraud      0
dtype: int64


  df['trans_time'] = pd.to_datetime(df['trans_time'])
  df.loc[(df['amt'] > 0) & (df['amt'] <= 300), 'amt_risk_rate'] = 0.5


    unix_time     amt            cc_num    first     last  \
0  1704887379  188.38      676355457570   Andrea  Johnson   
1  1704526670  102.63   377178373574671   Rhonda   Chavez   
2  1705632021    1.62  3599292013370451  Stephen     Khan   
3  1705872015    5.64  3594292572430345   Justin   Reilly   
4  1705883786   97.09  4867547663675548    Alice   Duarte   

                 street        city state    zip      lat  ...  \
0    036 Mercer Orchard  Belleville    IL  62220  38.5127  ...   
1     1918 William Isle  Sykesville    MD  21784  39.4567  ...   
2   850 Mccarthy Rapids     Gaylord    MI  49735  45.0125  ...   
3  7493 Jennifer Greens      Medina    OH  44256  41.1404  ...   
4      37855 Faith Fork     Burbank    CA  91501  34.1862  ...   

   merchant_fraud_Yost, Schamberger and Windler  merchant_fraud_Yost-Rogahn  \
0                                           0.0                         0.0   
1                                           0.0                         0.0   

In [2]:
from sklearn.model_selection import train_test_split

# Split the engineered columns by dtype. Only the numeric ones are fed to the
# model; the object-typed ones are listed purely for inspection.
object_features = list(df.select_dtypes(include=['object']).columns)

# Numeric candidates, minus the target ('is_fraud') and 'dob'.
numeric_features = [
    column
    for column in df.select_dtypes(include=['float64', 'int64', 'int']).columns
    if column not in ('is_fraud', 'dob')
]

print("Object type features:", object_features, sep="\n")
print("\nNumeric type features (float and int):", numeric_features, sep="\n")

# Feature matrix / target vector.
y = df['is_fraud']
X = df[numeric_features]

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

Object type features:
['first', 'last', 'street', 'city', 'state', 'job', 'name']

Numeric type features (float and int):
['unix_time', 'amt', 'cc_num', 'zip', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'have_risk_day', 'hight_risk_hour', 'age', 'cc_risk_rate', 'name_risk_rate', 'city_risk_rate', 'state_risk_rate', 'merchant_risk_rate', 'amt_risk_rate', 'distance', 'gender_F', 'gender_M', 'category_entertainment', 'category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 'merchant_fraud_Abbott-Rogahn', 'merchant_fraud_Abbott-Steuber', 'merchant_fraud_Abernathy and Sons', 'merchant_fraud_Abshire PLC', 'merchant_fraud_Adams, Kovacek and Kuhlman', 'merchant_fraud_Adams-Barrows', 'merchant_fraud_Altenwerth, Cartwright and Koss', 'merchant_f

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from geopy.distance import geodesic
import warnings
import re
warnings.filterwarnings('ignore')

def clean_feature_names(features):
    """Return model-safe versions of the given feature names.

    Each name is converted with ``str()``, every character outside
    ``[A-Za-z0-9_]`` is replaced by an underscore, and names that start with a
    digit are prefixed with ``'f_'``.

    Args:
        features: Iterable of feature names (any objects; ``str()`` is applied).

    Returns:
        list[str]: Cleaned names, in the same order as the input. An empty name
        stays empty. Distinct inputs can map to the same cleaned name; no
        de-duplication is performed.
    """
    cleaned_features = []
    for feature in features:
        cleaned = re.sub(r'[^A-Za-z0-9_]', '_', str(feature))
        # Slice rather than index: ``cleaned[0]`` raises IndexError when the
        # name is an empty string, ``cleaned[:1]`` is simply ''.
        if cleaned[:1].isdigit():
            cleaned = 'f_' + cleaned
        cleaned_features.append(cleaned)
    return cleaned_features

def evaluate_model(y_true, y_pred, y_pred_proba=None):
    """Compute classification metrics for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_pred_proba: Optional scores for the positive class. When given,
            an 'AUC-ROC' entry is added to the result.

    Returns:
        dict: 'Accuracy', 'Precision', 'Recall' and 'F1' (in that order),
        followed by 'AUC-ROC' when ``y_pred_proba`` is not None.
    """
    label_metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    scores = {label: metric(y_true, y_pred) for label, metric in label_metrics}

    # Without probability scores there is nothing more to add.
    if y_pred_proba is None:
        return scores

    scores['AUC-ROC'] = roc_auc_score(y_true, y_pred_proba)
    return scores

# Sanitise the column names of both splits before they reach CatBoost.
for _split_frame in (X_train, X_val):
    _split_frame.columns = clean_feature_names(_split_frame.columns)

print("Training CatBoost...")

# Hyper-parameters gathered in one place; the seed keeps runs repeatable.
catboost_params = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 9,
    'verbose': 100,
    'random_seed': 123,
}
cb_model = CatBoostClassifier(**catboost_params)

# The validation split doubles as the early-stopping eval set.
cb_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    verbose=100,
)

# Hard labels plus positive-class scores (column 1 of predict_proba).
y_pred_cb = cb_model.predict(X_val)
y_pred_proba_cb = cb_model.predict_proba(X_val)[:, 1]

print("\nCatBoost Results:")
print(evaluate_model(y_val, y_pred_cb, y_pred_proba_cb))

def process_test_data(df, encoder=None):
    """Apply the training feature engineering to a frame of unlabeled rows.

    The input frame is not modified; all work happens on a copy.

    Relies on these module-level lookup Series (group key -> mean ``is_fraud``)
    having been defined before the call: ``train_cc_risk_rate``,
    ``train_name_risk_rate``, ``city_risk_rate``, ``state_risk_rate`` and
    ``merchant_risk_rate``. Keys that are absent from a lookup produce NaN in
    the corresponding ``*_risk_rate`` column.

    Args:
        df: Frame with at least the columns ``trans_date``, ``trans_time``,
            ``dob``, ``cc_num``, ``first``, ``last``, ``street``, ``city``,
            ``state``, ``merchant``, ``gender``, ``category``, ``amt``,
            ``lat``, ``long``, ``merch_lat`` and ``merch_long``.
        encoder: Optional already-fitted ``OneHotEncoder`` (dense output) for
            ``['gender', 'category', 'merchant']``. When given it is applied
            with ``transform`` so the one-hot columns follow the categories it
            was fitted on; it must tolerate unseen categories (e.g.
            ``handle_unknown='ignore'``) or ``transform`` will raise on them.
            When ``None`` (default) a new encoder is fitted on ``df`` itself,
            so the one-hot columns reflect only the categories present in
            ``df``.

    Returns:
        pandas.DataFrame: A new frame with the engineered columns added, the
        date/time/dob columns removed, the encoded source columns replaced by
        their one-hot columns, and ``distance`` (km) appended last.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Convert once up front so every comparison below sees datetimes, even if
    # the caller passed raw strings.
    trans_date = pd.to_datetime(df['trans_date'])
    trans_time = pd.to_datetime(df['trans_time'])

    # 1 when the transaction date falls inside 2024-01-02 .. 2024-01-25.
    df['have_risk_day'] = ((trans_date >= pd.to_datetime('2024-01-02')) &
                           (trans_date <= pd.to_datetime('2024-01-25'))).astype(int)

    # 1 when the transaction hour is before 04:00 or after 21:59.
    hour = trans_time.dt.hour
    df['hight_risk_hour'] = ((hour < 4) | (hour > 21)).astype(int)

    # Age in whole years, approximated as elapsed days // 365.
    df['age'] = (trans_date - pd.to_datetime(df['dob'])).dt.days // 365

    df = df.drop(['trans_date', 'trans_time', 'dob'], axis=1)

    # Risk rates are looked up from the module-level Series, never recomputed
    # here (the input has no labels).
    df['cc_risk_rate'] = df['cc_num'].map(train_cc_risk_rate)
    df['name'] = df['first'] + ' ' + df['last']
    df['name_risk_rate'] = df['name'].map(train_name_risk_rate)
    df['city_risk_rate'] = df['city'].map(city_risk_rate)
    df['state_risk_rate'] = df['state'].map(state_risk_rate)
    df['merchant_risk_rate'] = df['merchant'].map(merchant_risk_rate)

    # String helper columns.
    # NOTE(review): city and state are joined with an empty string in
    # 'location' -- possibly a missing space; left unchanged.
    df['lat_long'] = df['lat'].astype(str) + ' ,' + df['long'].astype(str)
    df['location'] = df['street'] + ' ' + df['city'] + '' + df['state']

    # Amount bucket: 0.5 for 0 < amt <= 300, 1.0 for 300 < amt <= 1400, else 0.
    # Created as float so writing 0.5 is not an incompatible-dtype assignment.
    df['amt_risk_rate'] = 0.0
    df.loc[(df['amt'] > 0) & (df['amt'] <= 300), 'amt_risk_rate'] = 0.5
    df.loc[(df['amt'] > 300) & (df['amt'] <= 1400), 'amt_risk_rate'] = 1.0

    features_to_encode = ['gender', 'category', 'merchant']
    if encoder is None:
        # Legacy behaviour: fit a fresh encoder on the rows being processed.
        encoder = OneHotEncoder(sparse_output=False)
        encoded = encoder.fit_transform(df[features_to_encode])
    else:
        encoded = encoder.transform(df[features_to_encode])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(features_to_encode),
        # Share df's index so the axis=1 concat pairs rows by label even when
        # df's index is not a plain 0..n-1 range.
        index=df.index
    )
    df = pd.concat([df.drop(columns=features_to_encode), encoded_df], axis=1)

    # Geodesic distance in km between cardholder and merchant coordinates.
    df['distance'] = df.apply(
        lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).km,
        axis=1
    )

    return df

print("\nProcessing test data...")
test_df = pd.read_csv('./cs-506-extra-credit/test.csv')
# Unparseable dates/times become NaT instead of raising.
test_df['trans_date'] = pd.to_datetime(test_df['trans_date'], errors='coerce')
test_df['trans_time'] = pd.to_datetime(test_df['trans_time'], errors='coerce')

# Lookup tables read by process_test_data; `df` here is the engineered
# training frame, so the rates come from training labels only.
train_cc_risk_rate = df.groupby('cc_num')['is_fraud'].mean()
train_name_risk_rate = df.groupby('name')['is_fraud'].mean()

processed_test_df = process_test_data(test_df)

num_features_test = processed_test_df.select_dtypes(include=['float64', 'int64', 'int']).columns.tolist()
num_features_test.remove('id')
X_test = processed_test_df[num_features_test]

X_test.columns = clean_feature_names(X_test.columns)

# Align the test matrix with the columns the model was trained on.
# process_test_data one-hot encodes the test rows on their own and appends
# `distance` last, so the test columns can differ from X_train in both
# membership and order. Training columns missing from the test set are
# created as all-zero (the correct value for an absent one-hot category);
# columns the model never saw are dropped.
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
unseen_in_train = [col for col in X_test.columns if col not in X_train.columns]
print(f"Aligning test features: {len(missing_in_test)} added as zeros, "
      f"{len(unseen_in_train)} dropped")
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print("\nPredicting test data...")
test_predictions = cb_model.predict(X_test)

submission = pd.DataFrame({'id': processed_test_df['id'], 'is_fraud': test_predictions})
submission.to_csv('submission.csv', index=False)
print("\nPredictions saved to submission.csv")


Training CatBoost...
0:	learn: 0.6218801	test: 0.6220469	best: 0.6220469 (0)	total: 36.5ms	remaining: 1m 13s
100:	learn: 0.0419813	test: 0.0418387	best: 0.0418387 (100)	total: 3.53s	remaining: 1m 6s
200:	learn: 0.0272644	test: 0.0273338	best: 0.0273338 (200)	total: 7.09s	remaining: 1m 3s
300:	learn: 0.0229778	test: 0.0233415	best: 0.0233415 (300)	total: 10.8s	remaining: 1m 1s
400:	learn: 0.0210465	test: 0.0218212	best: 0.0218212 (400)	total: 14.3s	remaining: 57s
500:	learn: 0.0198925	test: 0.0210900	best: 0.0210900 (500)	total: 17.7s	remaining: 52.9s
600:	learn: 0.0192118	test: 0.0207213	best: 0.0207213 (600)	total: 21s	remaining: 48.9s
700:	learn: 0.0185974	test: 0.0203813	best: 0.0203811 (698)	total: 24.5s	remaining: 45.5s
800:	learn: 0.0181312	test: 0.0201726	best: 0.0201719 (797)	total: 28s	remaining: 41.9s
900:	learn: 0.0176111	test: 0.0199250	best: 0.0199231 (891)	total: 31.3s	remaining: 38.2s
1000:	learn: 0.0170419	test: 0.0196132	best: 0.0196132 (1000)	total: 34.7s	remaining: 3