In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Plot styling: seaborn 'whitegrid' axes style with the 'pastel' colour palette.
sns.set_style('whitegrid')
sns.set_palette('pastel')

# Install a blanket "ignore" filter, silencing every warning category for the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings raised by
# later cells — consider a narrower filter while developing.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the fraud test/train CSVs from ../data; the 'Unnamed: 0' column of each
# file is used as the DataFrame index instead of being kept as a data column.
test_df = pd.read_csv('../data/fraudTest.csv', index_col='Unnamed: 0')
train_df = pd.read_csv('../data/fraudTrain.csv', index_col='Unnamed: 0')

In [3]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop               

In [6]:
import pandas as pd

# Look up a card's history and append one incoming JSON record to it
def process_json_and_search(json_data, train_df):
    """Combine a single JSON record with the stored rows for the same card.

    Parameters
    ----------
    json_data : dict
        One record; must contain a 'cc_num' key.
    train_df : pandas.DataFrame
        Historical records; must contain a 'cc_num' column.

    Returns
    -------
    pandas.DataFrame or str
        When `train_df` holds at least one row with the same 'cc_num': those
        rows followed by the new record, re-indexed from 0. Otherwise a
        "No records found ..." message string.
    """
    # One-row frame built from the incoming record
    incoming = pd.DataFrame([json_data])
    card = incoming.loc[0, 'cc_num']

    # Guard clause: nothing on file for this card number
    if card not in train_df['cc_num'].values:
        return f"No records found for cc_num: {card}"

    # Historical rows first, the new record last
    history = train_df[train_df['cc_num'] == card]
    return pd.concat([history, incoming], ignore_index=True)

# Toy history frame used only to demonstrate process_json_and_search.
# It is bound to its own name (not `train_df`) so that the dataset loaded from
# fraudTrain.csv is not overwritten: later cells pass `train_df` to the
# preprocessing pipeline and select columns (e.g. 'is_fraud') that this toy
# frame does not contain.
example_train_df = pd.DataFrame({
    "trans_date_trans_time": ["2025-04-16 12:34:56", "2025-04-16 13:22:11"],
    "cc_num": [1234567890123456, 2345678901234567],
    "merchant": ["Amazon", "Walmart"],
    "category": ["electronics", "grocery_pos"],
    "amt": [250.75, 45.00],
    "lat": [40.7128, 34.0522],
    "long": [-74.0060, -118.2437],
    "merch_lat": [40.7306, 34.0525],
    "merch_long": [-73.9352, -118.2430],
    "unix_time": [1650102896, 1650105731]
})

# Example JSON data: one new record for the first card in the toy frame
json_data = {
    "trans_date_trans_time": "2025-04-16 12:35:56",
    "cc_num": 1234567890123456,
    "merchant": "Amazon",
    "category": "electronics",
    "amt": 251.75,
    "lat": 40.7128,
    "long": -74.0060,
    "merch_lat": 40.7306,
    "merch_long": -73.9352,
    "unix_time": 1650102896
}

# Apply the function to the toy data
result = process_json_and_search(json_data, example_train_df)

# Display the result
result

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,lat,long,merch_lat,merch_long,unix_time
0,2025-04-16 12:34:56,1234567890123456,Amazon,electronics,250.75,40.7128,-74.006,40.7306,-73.9352,1650102896
1,2025-04-16 12:35:56,1234567890123456,Amazon,electronics,251.75,40.7128,-74.006,40.7306,-73.9352,1650102896


In [3]:
#Create a transformer class for preprocessing
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """Select columns, parse the transaction timestamp and order the rows.

    Parameters
    ----------
    cols_to_keep : list of str
        Columns to retain. Must include 'cc_num' and 'trans_date_trans_time',
        which are used for sorting.
    """

    def __init__(self, cols_to_keep):
        self.cols_to_keep = cols_to_keep

    def fit(self, X, y=None):
        """No-op; the transformer is stateless. Returns self."""
        return self

    def transform(self, X):
        """Return a new frame restricted to `cols_to_keep`, with
        'trans_date_trans_time' as datetime64, sorted by
        ('cc_num', 'trans_date_trans_time') and re-indexed from 0.

        Raises KeyError if a requested column is missing from X.
        """
        # Explicit copy so the column assignment below never writes into
        # (or warns about) the caller's frame.
        X = X[self.cols_to_keep].copy()
        # Parse timestamps BEFORE sorting: sorting the raw strings is only
        # chronological for zero-padded ISO text, whereas sorting datetimes
        # is chronological for any parseable input format.
        X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
        # Sort by card, then by time within each card
        X = X.sort_values(by=['cc_num', 'trans_date_trans_time']).reset_index(drop=True)
        return X

# Define the pipeline steps
# Columns retained by DataPreprocessor; every name listed here must exist in
# the frame passed to the pipeline below.
cols_to_keep = ['cc_num', 'trans_date_trans_time', 'category', 'amt', 'lat', 'long', 'merch_lat', 'merch_long', 'unix_time', 'is_fraud']

# Single-step pipeline wrapping the preprocessing transformer
pipeline = Pipeline([
    ('preprocessor', DataPreprocessor(cols_to_keep=cols_to_keep))
])

# Fit and transform the data using the pipeline
# `transformed_df` is the starting point that the following cells extend.
transformed_df = pipeline.fit_transform(train_df)

# Inspect the transformed data
transformed_df.head()

Unnamed: 0,cc_num,trans_date_trans_time,category,amt,lat,long,merch_lat,merch_long,unix_time,is_fraud
0,60416207185,2019-01-01 12:47:15,misc_net,7.27,43.0048,-108.8964,43.974711,-109.741904,1325422035,0
1,60416207185,2019-01-02 08:44:57,gas_transport,52.94,43.0048,-108.8964,42.018766,-109.044172,1325493897,0
2,60416207185,2019-01-02 08:47:36,gas_transport,82.08,43.0048,-108.8964,42.961335,-109.157564,1325494056,0
3,60416207185,2019-01-02 12:38:14,kids_pets,34.79,43.0048,-108.8964,42.228227,-108.747683,1325507894,0
4,60416207185,2019-01-02 13:10:46,home,27.18,43.0048,-108.8964,43.321745,-108.091143,1325509846,0


In [4]:
# Step 1: Transformer that appends running hourly / daily amount statistics
class AddAverageColumns(BaseEstimator, TransformerMixin):
    """Add cumulative per-card amount statistics at hour and day granularity.

    Reads 'cc_num', 'amt' and a datetime 'trans_date_trans_time' column and
    appends, in this order: 'hour', 'transaction_count', 'avg_amt',
    'Total_amt_d', 'transaction_count_d', 'avg_amt_d'.

    All statistics are running values (cumsum / cumcount) computed in the
    frame's current row order within each group.
    NOTE(review): presumably rows are already sorted by time within each
    card — confirm against the upstream step.
    """

    def fit(self, X, y=None):
        """Stateless; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X with the running-average columns appended."""
        frame = X.copy()

        # Hour of day and calendar date of every transaction
        stamps = frame['trans_date_trans_time']
        frame['hour'] = stamps.dt.hour
        frame['date'] = stamps.dt.date

        # Running total / count / mean within each (card, date, hour) bucket
        per_hour = frame.groupby(['cc_num', 'date', 'hour'])
        hourly_total = per_hour['amt'].cumsum()
        hourly_count = per_hour.cumcount() + 1

        # Running total / count / mean within each (card, date) bucket
        per_day = frame.groupby(['cc_num', 'date'])
        daily_total = per_day['amt'].cumsum()
        daily_count = per_day.cumcount() + 1

        frame['cumulative_sum_hour'] = hourly_total
        frame['transaction_count'] = hourly_count
        frame['avg_amt'] = (frame['cumulative_sum_hour'] / frame['transaction_count']).round(2)

        frame['Total_amt_d'] = daily_total
        frame['transaction_count_d'] = daily_count
        frame['avg_amt_d'] = (frame['Total_amt_d'] / frame['transaction_count_d']).round(2)

        # The hourly running total and the date key are intermediates only
        return frame.drop(columns=['cumulative_sum_hour', 'date'])

# Step 2: Define the pipeline
# Single-step pipeline; note that this rebinds the notebook-level `pipeline` name.
pipeline = Pipeline([
    ('add_avg_columns', AddAverageColumns())
])

# Step 3: Use the pipeline on the dataset
# Reads the current `transformed_df` and rebinds the same name to the result.
transformed_df = pipeline.fit_transform(transformed_df)

# Inspect the transformed data
transformed_df.head()

Unnamed: 0,cc_num,trans_date_trans_time,category,amt,lat,long,merch_lat,merch_long,unix_time,is_fraud,hour,transaction_count,avg_amt,Total_amt_d,transaction_count_d,avg_amt_d
0,60416207185,2019-01-01 12:47:15,misc_net,7.27,43.0048,-108.8964,43.974711,-109.741904,1325422035,0,12,1,7.27,7.27,1,7.27
1,60416207185,2019-01-02 08:44:57,gas_transport,52.94,43.0048,-108.8964,42.018766,-109.044172,1325493897,0,8,1,52.94,52.94,1,52.94
2,60416207185,2019-01-02 08:47:36,gas_transport,82.08,43.0048,-108.8964,42.961335,-109.157564,1325494056,0,8,2,67.51,135.02,2,67.51
3,60416207185,2019-01-02 12:38:14,kids_pets,34.79,43.0048,-108.8964,42.228227,-108.747683,1325507894,0,12,1,34.79,169.81,3,56.6
4,60416207185,2019-01-02 13:10:46,home,27.18,43.0048,-108.8964,43.321745,-108.091143,1325509846,0,13,1,27.18,196.99,4,49.25


In [5]:
# Haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees or in radians).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like
        Coordinates of the two points. Scalars and arrays may be mixed;
        they are combined with NumPy broadcasting.
    to_radians : bool, default True
        When True the inputs are taken to be decimal degrees and converted
        to radians first; when False they are used as given.
    earth_radius : float, default 6371
        Sphere radius; the result is expressed in the same unit
        (6371 is the mean Earth radius in kilometres).

    Returns
    -------
    Distance(s) along the sphere: a NumPy scalar for scalar inputs, an array
    for array inputs.
    """
    if to_radians:
        # Convert every argument on its own. Stacking the four arguments into
        # a single array first would require them all to share one shape and
        # fail when a scalar is mixed with an array.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(value)) for value in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

# Custom transformer for haversine and processing
class HaversineAndProcessDataTransformer(BaseEstimator, TransformerMixin):
    """Replace two coordinate pairs with distance-based features.

    Adds:
      * 'dist'      - haversine distance between (lat_col, long_col) and
                      (merch_lat_col, merch_long_col) for each row.
      * 'dist_diff' - absolute difference between a row's 'dist' and the
                      'dist' of the preceding row in the same `group_col`
                      group (0 for the first row of each group).

    The four coordinate columns are dropped from the output.

    Parameters
    ----------
    lat_col, long_col : str
        Column names of the first coordinate pair.
    merch_lat_col, merch_long_col : str
        Column names of the second coordinate pair.
    group_col : str
        Column whose values define the groups used for the previous-row lookup.
    """

    def __init__(self, lat_col, long_col, merch_lat_col, merch_long_col, group_col):
        self.lat_col = lat_col
        self.long_col = long_col
        self.merch_lat_col = merch_lat_col
        self.merch_long_col = merch_long_col
        self.group_col = group_col

    def fit(self, X, y=None):
        """Stateless; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X with 'dist' and 'dist_diff' added and the
        coordinate columns removed."""
        X = X.copy()
        # Distance between the two coordinate pairs of each row
        X['dist'] = haversine(
            X[self.lat_col],
            X[self.long_col],
            X[self.merch_lat_col],
            X[self.merch_long_col]
        )
        # 'dist' of the previous row within the same group (row order as given).
        # The first row of every group has no predecessor, so it falls back to
        # its own distance, which makes its dist_diff 0.
        # Assigning the fillna result (rather than calling it in place on
        # X['prev_dist']) is required: the chained in-place form operates on a
        # temporary under pandas Copy-on-Write and leaves the NaNs in the frame.
        X['prev_dist'] = X.groupby(self.group_col)['dist'].shift(1).fillna(X['dist'])
        # Absolute change in distance relative to the previous row
        X['dist_diff'] = (X['dist'] - X['prev_dist']).abs()
        # Drop the raw coordinates and the helper column
        X = X.drop(columns=[self.lat_col, self.long_col, self.merch_lat_col, self.merch_long_col, 'prev_dist'])
        return X

# Define the pipeline
# Single-step pipeline: the transformer is told which columns hold the two
# coordinate pairs and which column groups rows for the previous-row comparison.
pipeline = Pipeline([
    ('haversine_and_processing', HaversineAndProcessDataTransformer(
        lat_col='lat', long_col='long', 
        merch_lat_col='merch_lat', merch_long_col='merch_long', 
        group_col='cc_num'
    ))
])

# Apply the pipeline to your dataset
# Reads the current `transformed_df` and rebinds the same name to the result.
# The transformer drops the four coordinate columns, so running this cell a
# second time on its own output would no longer find them.
transformed_df = pipeline.fit_transform(transformed_df)

# Inspect the transformed data
transformed_df.head()

Unnamed: 0,cc_num,trans_date_trans_time,category,amt,unix_time,is_fraud,hour,transaction_count,avg_amt,Total_amt_d,transaction_count_d,avg_amt_d,dist,dist_diff
0,60416207185,2019-01-01 12:47:15,misc_net,7.27,1325422035,0,12,1,7.27,7.27,1,7.27,127.606239,0.0
1,60416207185,2019-01-02 08:44:57,gas_transport,52.94,1325493897,0,8,1,52.94,52.94,1,52.94,110.308921,17.297318
2,60416207185,2019-01-02 08:47:36,gas_transport,82.08,1325494056,0,8,2,67.51,135.02,2,67.51,21.787261,88.52166
3,60416207185,2019-01-02 12:38:14,kids_pets,34.79,1325507894,0,12,1,34.79,169.81,3,56.6,87.204215,65.416954
4,60416207185,2019-01-02 13:10:46,home,27.18,1325509846,0,13,1,27.18,196.99,4,49.25,74.212965,12.99125


In [None]:
# Step 1: Map an hour of the day to a coarse part-of-day label
def categorize_part_of_day(hour):
    """Return the part-of-day label for `hour`.

    Buckets (lower bound inclusive, upper bound exclusive):
      5-12  -> 'Morning'
      12-17 -> 'Afternoon'
      17-21 -> 'Evening'
      21 and above, or below 5 -> 'Night'

    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    # Night wraps around midnight, so test both ends first
    if hour < 5 or hour >= 21:
        return 'Night'
    if hour < 12:
        return 'Morning'
    if hour < 17:
        return 'Afternoon'
    if hour < 21:
        return 'Evening'
    # Only reachable when every comparison above is False (e.g. NaN)
    return None

# Step 2: Create a transformer for part of the day logic and one-hot encoding
class PartOfDayTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the part of the day derived from the 'hour' column.

    Adds the integer (0/1) columns 'D_Evening', 'D_Morning' and 'D_Night';
    'Afternoon' is the dropped reference level, so an afternoon row has all
    three set to 0. The same three columns are produced for every input,
    including inputs in which some parts of the day do not occur.
    """

    # Fixed label set, in alphabetical order so the column order and the
    # dropped first level match what get_dummies derives from data that
    # contains all four labels.
    PARTS_OF_DAY = ('Afternoon', 'Evening', 'Morning', 'Night')

    def fit(self, X, y=None):
        """Stateless; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X with the part-of-day dummy columns appended.

        Raises KeyError if X has no 'hour' column.
        """
        X = X.copy()
        # Categorize part of the day based on the hour column
        labels = X['hour'].apply(categorize_part_of_day)
        # Encode against the fixed label set rather than the labels that happen
        # to be present: otherwise the output columns vary with the data and a
        # single-row input would lose every dummy column to drop_first.
        X['part_of_day'] = pd.Categorical(labels, categories=list(self.PARTS_OF_DAY))

        # One-hot encode the 'part_of_day' column
        one_hot_encoded = pd.get_dummies(X['part_of_day'], prefix='D', drop_first=True)
        X = pd.concat([X, one_hot_encoded.astype(int)], axis=1)

        # Drop the helper 'part_of_day' column
        X = X.drop(columns=['part_of_day'])
        return X

# Step 3: Define the pipeline
pipeline = Pipeline([
    ('part_of_day_categorization', PartOfDayTransformer())
])

# Step 4: Apply the pipeline to the feature frame built so far.
# The transformer reads the 'hour' column, which is created during feature
# engineering rather than loaded with the raw data, so the input must be
# `transformed_df`; passing the raw frame would also discard every feature
# derived in the previous steps.
transformed_df = pipeline.fit_transform(transformed_df)

# Inspect the transformed data
transformed_df.head()

Unnamed: 0,cc_num,trans_date_trans_time,amt,unix_time,is_fraud,hour,transaction_count,avg_amt,Total_amt_d,transaction_count_d,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,60416207185,2019-01-01 12:47:15,7.27,1325422035,0,12,1,7.27,7.27,1,...,0,0,0,0,1,0,0,0,0,0
1,60416207185,2019-01-02 08:44:57,52.94,1325493897,0,8,1,52.94,52.94,1,...,0,0,0,0,0,0,0,0,0,0
2,60416207185,2019-01-02 08:47:36,82.08,1325494056,0,8,2,67.51,135.02,2,...,0,0,0,0,0,0,0,0,0,0
3,60416207185,2019-01-02 12:38:14,34.79,1325507894,0,12,1,34.79,169.81,3,...,0,0,0,1,0,0,0,0,0,0
4,60416207185,2019-01-02 13:10:46,27.18,1325509846,0,13,1,27.18,196.99,4,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# Step 1: Transformer for category one-hot encoding
class CategoryOneHotTransformer(BaseEstimator, TransformerMixin):
    """Replace the 'category' column with integer (0/1) dummy columns.

    Columns are named 'category_<level>'; the first level (in pandas'
    category order) is dropped as the reference level.

    Attributes
    ----------
    categories_ : pandas.Index
        Category levels observed by `fit`. `transform` encodes against these
        so the output columns are the same for any later input.
    """

    def fit(self, X, y=None):
        """Record the category levels present in X['category']; returns self."""
        # pd.Categorical derives the levels the same way get_dummies does for
        # a plain column, so the learned column order is unchanged.
        self.categories_ = pd.Categorical(X['category']).categories
        return self

    def transform(self, X):
        """Return a copy of X with 'category' replaced by its dummy columns.

        Raises KeyError if X has no 'category' column.
        """
        X = X.copy()
        # Use the fitted levels when available; categories=None makes pandas
        # infer them from X, which keeps transform-without-fit working.
        levels = getattr(self, 'categories_', None)
        category_values = pd.Series(
            pd.Categorical(X['category'], categories=levels),
            index=X.index,
        )
        category_onehot = pd.get_dummies(category_values, prefix='category', drop_first=True)
        X = X.drop(columns=['category'])
        # Append the dummies exactly once, as integers (appending both the
        # boolean and the integer version would duplicate every column name).
        X = pd.concat([X, category_onehot.astype(int)], axis=1)
        return X

# Step 2: Define the final pipeline
# Single-step pipeline; note that this rebinds the notebook-level `pipeline` name.
pipeline = Pipeline([
    ('category_onehot_encoding', CategoryOneHotTransformer())
])

# Step 3: Apply the pipeline to your dataset
# Reads the current `transformed_df` and rebinds the same name to the result.
# The transformer drops the 'category' column, so running this cell a second
# time on its own output would no longer find it.
transformed_df = pipeline.fit_transform(transformed_df)

# Inspect the transformed data
transformed_df.head()

Unnamed: 0,cc_num,trans_date_trans_time,amt,unix_time,is_fraud,hour,transaction_count,avg_amt,Total_amt_d,transaction_count_d,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,60416207185,2019-01-01 12:47:15,7.27,1325422035,0,12,1,7.27,7.27,1,...,0,0,0,0,1,0,0,0,0,0
1,60416207185,2019-01-02 08:44:57,52.94,1325493897,0,8,1,52.94,52.94,1,...,0,0,0,0,0,0,0,0,0,0
2,60416207185,2019-01-02 08:47:36,82.08,1325494056,0,8,2,67.51,135.02,2,...,0,0,0,0,0,0,0,0,0,0
3,60416207185,2019-01-02 12:38:14,34.79,1325507894,0,12,1,34.79,169.81,3,...,0,0,0,1,0,0,0,0,0,0
4,60416207185,2019-01-02 13:10:46,27.18,1325509846,0,13,1,27.18,196.99,4,...,0,0,1,0,0,0,0,0,0,0
