In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Function to calculate the Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Difference in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    # Haversine formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    # Distance in kilometers
    distance = R * c
    return distance


def process(df):
    # Add new features
    date_time = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    birth_date = pd.to_datetime(df['dob'], format='%d/%m/%Y')

    df['trans_hour'] = date_time.dt.hour
    df['trans_day_of_week'] = date_time.dt.dayofweek
    
    df['age'] = (date_time - birth_date).dt.days // 365
    df['trans_dist'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

    user_avg_amt = df.groupby('cc_num')['amt'].mean().reset_index(name='Avg_Amt')
    df = df.merge(user_avg_amt, on='cc_num')
    df['Relative_Amt'] = abs(df['amt'] - df['Avg_Amt']) / df['Avg_Amt']

    df.drop(columns=['trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long', 'Avg_Amt'], inplace=True)

    # Identifying categorical columns
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']

    mappings = {}

    label_encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])
        mappings[col] = {label: index for index, label in enumerate(label_encoder.classes_)}

    return df, mappings

trainingSet = pd.read_csv("./data/train.csv")
submissionSet = pd.read_csv("./data/test.csv")
train_processed, cat_map = process(trainingSet)
train_processed.drop(columns=['cc_num', 'first', 'last', 'street', 'dob', 'zip', 'trans_num', 'unix_time'], inplace=True)

# Merge on Id so that the test set can have feature columns as well
test_df= pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
test_df = test_df.drop(columns=['is_fraud_x'])
test_df = test_df.rename(columns={'is_fraud_y': 'is_fraud'})

# The training set is where the score is not null
train_df = train_processed[train_processed['is_fraud'].notnull()]

# Save the datasets with the new features for easy access later
# test_df.to_csv("./data/test_processed.csv", index=False)
# train_df.to_csv("./data/train_processed.csv", index=False)


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

scaled_cols = ['amt', 'city_pop', 'trans_dist']
# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id'], axis=1)  # Dropping the target to isolate features
y = train_df['is_fraud']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Parameters for GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# GridSearchCV
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Validation
best_dt = grid_search.best_estimator_

with open('dt_model3.obj', 'wb') as f:
        pickle.dump(best_dt, f)


y_pred = best_dt.predict(X_val)

val_f1 = f1_score(y_val, y_pred)

(best_params, best_score, val_f1)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.997 total time=   0.7s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.997 total time=   0.7s
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.997 total time=   0.7s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.997 total time=   0.7s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.997 total time=   0.7s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.997 total time=   0.7s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.997 total time=   0.8s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.997 total time=   0.7s
[CV 4/5] END crite



[CV 1/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=2;, score=0.998 total time=   1.1s
[CV 2/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=2;, score=0.998 total time=   1.2s
[CV 3/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=2;, score=0.998 total time=   1.2s
[CV 4/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=2;, score=0.998 total time=   1.2s
[CV 5/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=2;, score=0.998 total time=   1.2s
[CV 1/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=5;, score=0.998 total time=   1.1s
[CV 2/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=5;, score=0.998 total time=   1.1s
[CV 3/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=5;, score=0.998 total time=   1.2s
[CV 1/5] END criterion=gini, max_depth=5, min_samples_leaf=4, min_samples_split=

({'criterion': 'entropy',
  'max_depth': 10,
  'min_samples_leaf': 4,
  'min_samples_split': 2},
 0.9986632493836278,
 0.7957957957957958)

In [4]:
pred = test_df.drop(['is_fraud', 'Id'], axis=1)
pred2 = test_df.drop(['is_fraud'], axis=1)

pred2['is_fraud'] = best_dt.predict(pred)
pred2.is_fraud = pred2.is_fraud.astype(int)
submission = pred2[['Id', 'is_fraud']]
submission.to_csv("./data/submission2.csv", index=False)