In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
from scipy import stats
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import LabelEncoder
import pytz
import math
import datetime

In [21]:
def is_us_tran(row):
    """
    Using rough estimates for continental America to 
    detemine if the merchant longitutde and lattitude are outside of the US
    Feature potential relevance - Foriegn vendors may indicate fraud
    Params : 
    Row : Current row (sample) of the dataframe
    
    Returns :
    1 (int) : integer boolean representation for a transaction inside the continental US
    0 (int) : integer boolean representation for a transaction outside the continental US
    """

    # Retrieve lattitude and longitude of merchant 
    merch_long = row['merch_long']
    merch_lat = row['merch_lat']

    # Rough estimate for ranges of US continental lat and long
    lat_min, lat_max = 24.0, 50.0
    long_min, long_max = -125.0, -67.0
    if lat_min <= merch_lat <= lat_max and long_min <= merch_long <= long_max:
        return 1
    return 0  

In [22]:
def suspicious_cat(row):
    """
    Binary feature if the category of transaction is suspicious (shopping or misc)
    Params : 
    Row : Current row (sample) of the dataframe
    
    Returns :
    1 (int) : integer boolean representation for containing this type of transction 
    0 (int) : integer boolean representation for not containing this type of transction 
    """
    # Init suspicious category
    suspicious = ['misc', 'shopping']
    # Test if matching
    category = row['category']
    if any(substring in category.lower() for substring in suspicious):
        return 1
    return 0

In [23]:
def extract_time_feats(credit_data):
    """
    Extracts features based on the time of the transaction
    Params : 
    credit_data (df) : Dataframe of transactions labeled with fraud 
    
    Returns :
    credit_data (df) : Dataframe of transactions labeled with fraud with new features
    """
    # Time features
    # Turn into datetime object
    credit_data['datetime'] = pd.to_datetime(credit_data['unix_time'], unit='s')
    # Feature potential relevance - Certain hours (late at night or early in morning) may indicate fraud
    credit_data['hour'] = credit_data['datetime'].dt.hour 
    # Feature potential relevance - Could represent payday cycles or holidays
    credit_data['day_of_month'] = credit_data['datetime'].dt.day
    # Feature potential relevance - Certain Months (during holiday shopping season) may have more fraud
    credit_data['month'] = credit_data['datetime'].dt.month
    # Feature potential relevance - Older years may indicate fraud ( years of economic turbulence)
    credit_data['year'] = credit_data['datetime'].dt.year
    # Feature potential relevance - Certain Quarters (during holiday shopping season) may have more fraud
    credit_data['quarter'] = credit_data['datetime'].dt.quarter  # 1, 2, 3, or 4
    
    # Feature potential relevance - Older people may have an increased chance of being targeted by a fraud attack
    credit_data['dob'] = pd.to_datetime(credit_data['dob'])
    current_date = datetime.datetime.now()
    credit_data['age'] = credit_data['dob'].apply(lambda x: (current_date - x).days // 365)
    
    return credit_data



In [24]:

def feats(credit_data):
    """
    Extracts created features
    Params : 
    credit_data (df) : Dataframe of transactions labeled with fraud 
    
    Returns :
    credit_data (df) : Dataframe of transactions labeled with fraud with new features
    """
    # Encoder for categories and profiles
    le = LabelEncoder()
    # Feature potential relevance - Transaction categories like shopping or misc may be linked to credit fraud
    credit_data['category_cat'] = le.fit_transform(credit_data['category'])
    # Feature potential relevance - Profiles with certain characteristics may have a higher occurence of fraud
    credit_data['profile_cat'] = le.fit_transform(credit_data['category'])
    
    # Feature potential relevance - Transactions outside of the US may indicate fraud
    credit_data['is_in_us'] = credit_data.apply(is_us_tran, axis=1)
    
    # Feature potential relevance - Transactions of shopping and miscellanous 
    credit_data['suspicious_cat'] = credit_data.apply(suspicious_cat, axis=1)   
    # Frequency counts of profiles and merchants
    # Feature potential relevance - Common merchants lead to common transactions 
    credit_data['merchant_freq_encoded'] = credit_data['merchant'].map(credit_data['merchant'].value_counts(normalize=True))
    
    return credit_data



In [25]:
def remove_irr(credit_data):
    """
    Removes features deeemed irrelevant 
    Params : 
    credit_data (df) : Dataframe of transactions labeled with fraud 
    
    Returns :
    credit_data (df) : Dataframe of transactions labeled with fraud with removed features
    """
    # List of columns that are less relevant for fraud detection
    irr_col= [
        'ssn', 'cc_num', 'first', 'last', 'street', 'state',
        'zip', 'dob', 'acct_num', 'profile', 'trans_num', 'merch_lat', 'merch_long', "city", 'city_pop',
        "job", "merchant", "trans_date", "trans_time", 'gender', 'lat', 'long', 'category', 'unix_time', 'datetime'
    ]
    
    # Drop these columns from the DataFrame
    credit_data = credit_data.drop(columns=irr_col)
    print("Remaining columns:", credit_data.columns)
    return credit_data



In [26]:
# Data split
# Preproccess the data for training
credit_data= pd.read_csv('cct_train.csv')
credit_data = extract_time_feats(credit_data)
credit_data = feats(credit_data)
credit_data = remove_irr(credit_data)

# Split x and y
X = credit_data.drop(columns = ['is_fraud'])
y = credit_data['is_fraud']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Remaining columns: Index(['amt', 'is_fraud', 'hour', 'day_of_month', 'month', 'year', 'quarter',
       'age', 'category_cat', 'profile_cat', 'is_in_us', 'suspicious_cat',
       'merchant_freq_encoded'],
      dtype='object')


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [28]:
# Random
fraud_model = RandomForestClassifier(bootstrap=False, criterion= 'gini',
                                     max_depth=20, max_features='sqrt', n_estimators=100,
                                     min_samples_leaf=1, min_samples_split=2,
                                     random_state=17, n_jobs=-1)

In [29]:
fraud_model.fit(X_train, y_train)
train_accuracy = accuracy_score(y_train, fraud_model.predict(X_train))
print(train_accuracy)

0.9999286536043307


In [30]:

pred = fraud_model.predict(X_test)

In [31]:
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    139446
           1       1.00      0.89      0.94       716

    accuracy                           1.00    140162
   macro avg       1.00      0.94      0.97    140162
weighted avg       1.00      1.00      1.00    140162



In [32]:
def feat_importances(fraud_model):
    feature_importances = fraud_model.feature_importances_
    feat_imp = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print(feat_imp)


In [33]:
scores = cross_validate(fraud_model, X_scaled, y, cv=5, scoring='f1')
print(scores)

{'fit_time': array([2.593539  , 2.50084949, 2.49715877, 2.65340996, 2.57307696]), 'score_time': array([0.247715  , 0.15176415, 0.1965878 , 0.20690536, 0.24695158]), 'test_score': array([0.93422025, 0.93156733, 0.9379616 , 0.92798812, 0.91398654])}


In [34]:
import joblib
fraud_model.fit(X_scaled, y)
# Assuming your model is called `model`
joblib.dump(fraud_model, 'task_1_model.pkl')

['task_1_model.pkl']