In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import datetime

# Loading the Datasets 

In [2]:
train_data = pd.read_csv('fraudTrain.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
test_data = pd.read_csv('fraudTest.csv')
test_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

### There are no null values therefore 

In [6]:
train_data.duplicated().sum()

0

In [7]:
test_data.duplicated().sum()

0

## Combining the datasets using concatenate 

In [8]:
df = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [9]:
print("\nOriginal shapes:")
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Combined data shape: {df.shape}")


Original shapes:
Train data shape: (1296675, 23)
Test data shape: (555719, 23)
Combined data shape: (1852394, 23)


In [10]:
df.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 23 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   trans_date_trans_time  object 
 2   cc_num                 int64  
 3   merchant               object 
 4   category               object 
 5   amt                    float64
 6   first                  object 
 7   last                   object 
 8   gender                 object 
 9   street                 object 
 10  city                   object 
 11  state                  object 
 12  zip                    int64  
 13  lat                    float64
 14  long                   float64
 15  city_pop               int64  
 16  job                    object 
 17  dob                    object 
 18  trans_num              object 
 19  unix_time              int64  
 20  merch_lat              float64
 21  merch_long             float64
 22  is_fraud          

In [12]:
# Getting class distribution in combined dataset
print("\nFraud distribution in combined dataset:")
print(df['is_fraud'].value_counts(normalize=True) * 100)


Fraud distribution in combined dataset:
is_fraud
0    99.478999
1     0.521001
Name: proportion, dtype: float64


# Preprocessing the data 

In [13]:
def preprocess_data(df):
    # Convert date-time
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    
    # Extract time-based features
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    
    # Calculate distance between merchant and customer
    df['distance'] = np.sqrt(
        (df['lat'] - df['merch_lat'])**2 + 
        (df['long'] - df['merch_long'])**2
    )
    
    # Select features for modeling
    features = [
        'amt', 'hour', 'day_of_week', 'distance',
        'city_pop', 'merch_lat', 'merch_long'
    ]
    
    return df[features], df['is_fraud']

In [14]:
X, y = preprocess_data(df)

## Splitting the data 

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scaling features 

In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Using undersampling for balancing the data 

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# Apply undersampling to balance the classes
print("Applying undersampling for class balance...")
undersampler = RandomUnderSampler(random_state=42)
X_train_balanced, y_train_balanced = undersampler.fit_resample(X_train_scaled, y_train)

Applying undersampling for class balance...


# Training Models 

In [18]:
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', max_depth=10),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100)
}

In [19]:
results = {}
print("\nTraining and evaluating models ")
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train_balanced, y_train_balanced)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    results[name] = {
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred)
    }
    
    # Print results
    print(f"\n{name} Results:")
    print("\nClassification Report:")
    print(results[name]['classification_report'])
    print("\nConfusion Matrix:")
    print(results[name]['confusion_matrix'])
    print(f"\nROC AUC Score: {results[name]['roc_auc']:.4f}")



Training and evaluating models 

Training Logistic Regression...

Logistic Regression Results:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    368549
           1       0.08      0.77      0.14      1930

    accuracy                           0.95    370479
   macro avg       0.54      0.86      0.56    370479
weighted avg       0.99      0.95      0.97    370479


Confusion Matrix:
[[350877  17672]
 [   447   1483]]

ROC AUC Score: 0.8602

Training Decision Tree...

Decision Tree Results:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96    368549
           1       0.06      0.94      0.12      1930

    accuracy                           0.93    370479
   macro avg       0.53      0.93      0.54    370479
weighted avg       0.99      0.93      0.96    370479


Confusion Matrix:
[[340893  27656]
 [   122   1808]]

ROC AUC Score: 0.93

## Hyperparameter Tuning 

In [21]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the model
rf = RandomForestClassifier(class_weight='balanced')

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters found by GridSearchCV
print(f"Best parameters for Random Forest: {grid_search.best_params_}")

# Evaluate with best model
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test_scaled)
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_rf)}")


Best parameters for Random Forest: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 100}
ROC AUC: 0.9362837917925355


In [22]:
# Best parameters for Random Forest
best_params_rf = {
    'n_estimators': 100, 
    'max_depth': 30, 
    'min_samples_split': 2
}

# Initializing a new Random Forest model with best parameters
rf_model = RandomForestClassifier(**best_params_rf, class_weight='balanced', random_state=42)

# Training the model with the balanced data
rf_model.fit(X_train_balanced, y_train_balanced)

# Make predictions
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Print results for Random Forest
print("Random Forest (Best Params) Results:")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print(f"\nROC AUC Score: {roc_auc_score(y_test, y_pred_rf):.4f}")


Random Forest (Best Params) Results:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.97    368549
           1       0.07      0.94      0.13      1930

    accuracy                           0.93    370479
   macro avg       0.53      0.94      0.55    370479
weighted avg       0.99      0.93      0.96    370479


Confusion Matrix:
[[344002  24547]
 [   119   1811]]

ROC AUC Score: 0.9359
