In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [82]:
import pickle

import warnings 
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)

In [83]:
import os

# Folder that holds the raw CSV files. The default keeps the location that was
# previously hard-coded here; set the FRAUD_RAW_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
RAW_DATA_DIR = os.environ.get(
    'FRAUD_RAW_DATA_DIR',
    r'D:/Data Science/Big Data Technology/Project/Streaming-Fraud-Detection/Streaming-Fraud-Detection/data/raw',
).rstrip('/\\')

# Both files live in the same folder, so the directory is spelled out once.
train_path = f'{RAW_DATA_DIR}/fraudTrain.csv'
test_path = f'{RAW_DATA_DIR}/fraudTest.csv'

In [84]:
# Read both raw files with identical options (the first CSV column becomes the
# index), then stack them into one frame with a fresh 0..n-1 index.
df_train, df_test = (
    pd.read_csv(csv_path, low_memory=False, index_col=0)
    for csv_path in (train_path, test_path)
)

df = pd.concat([df_train, df_test], ignore_index=True)

In [85]:
# Preview the first rows of the combined train + test frame.
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [86]:
# (rows, columns) of the combined frame.
df.shape

(1852394, 22)

In [87]:
# Count how many rows carry each is_fraud value.
df['is_fraud'].value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [88]:
# Remove the name, address and transaction-id fields before feature
# engineering.
df = df.drop(
    columns=[
        'first',
        'last',
        'street',
        'city',
        'state',
        'zip',
        'trans_num',
    ]
)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0


In [89]:
def _add_time_features(frame):
    """Parse the two date columns and derive age / hour / day / month.

    ``trans_date_trans_time`` and ``dob`` are converted to datetimes in place.
    ``age`` is the difference between the transaction year and the birth year
    (calendar years only, not exact birthdays). ``hour``, ``day`` and ``month``
    come from the transaction timestamp. The frame is modified in place and
    returned.
    """
    trans_time = pd.to_datetime(frame['trans_date_trans_time'])
    birth_date = pd.to_datetime(frame['dob'])

    frame['trans_date_trans_time'] = trans_time
    frame['dob'] = birth_date

    frame['age'] = trans_time.dt.year - birth_date.dt.year

    frame['hour'] = trans_time.dt.hour
    frame['day'] = trans_time.dt.day
    frame['month'] = trans_time.dt.month
    return frame


df = _add_time_features(df)

### Encoding object columns

In [90]:
import os

# Categorical text columns that get label-encoded.
cols = ['merchant', 'category', 'gender', 'job']

# Directory for the pickled encoders. The default keeps the location that was
# previously hard-coded here; set the FRAUD_ENCODER_DIR environment variable
# to write somewhere else without editing the notebook.
encoder_dir = os.environ.get(
    'FRAUD_ENCODER_DIR',
    r"D:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\Encoder",
)
encoder_path = os.path.join(encoder_dir, "LE_model_v1.pkl")

# Create the directory up front so that writing the pickle later cannot fail
# on a missing folder.
os.makedirs(encoder_dir, exist_ok=True)

def encode(df, columns=None, save_path=None):
    """Label-encode categorical columns and pickle the fitted encoders.

    A fresh ``LabelEncoder`` is fitted for every column and its integer codes
    are stored in a new ``<column>_indexer`` column. The source columns are
    left in place. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str, optional
        Columns to encode. Defaults to the module-level ``cols`` list.
    save_path : str, optional
        File the ``{column: encoder}`` dict is pickled to. Defaults to the
        module-level ``encoder_path``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``*_indexer`` columns added.
    """
    # Fall back to the notebook-level configuration so that existing
    # ``encode(df)`` calls behave exactly as before.
    if columns is None:
        columns = cols
    if save_path is None:
        save_path = encoder_path

    encoders = {}
    for col in columns:
        encoder = LabelEncoder()
        df[col + '_indexer'] = encoder.fit_transform(df[col])
        encoders[col] = encoder

    # Persist all encoders together so the same mapping can be reloaded later.
    with open(save_path, 'wb') as f:
        pickle.dump(encoders, f)

    return df

# Add the *_indexer columns (this also writes the encoder pickle), then discard
# the text columns they were derived from.
# NOTE(review): the encoders are fitted on the full frame, before the
# train/test split further down — confirm that is intended.
df = encode(df).drop(columns=cols)

print(f"Encoders saved at: {encoder_path}")

Encoders saved at: D:\Data Science\Big Data Technology\Project\Streaming-Fraud-Detection\Streaming-Fraud-Detection\Encoder\LE_model_v1.pkl


In [91]:
# Order rows by transaction time, then drop the two datetime columns now that
# age / hour / day / month have been derived from them.
df = (
    df.sort_values('trans_date_trans_time')
      .drop(columns=['trans_date_trans_time', 'dob'])
)
df.head()

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,hour,day,month,merchant_indexer,category_indexer,gender_indexer,job_indexer
0,2703186189652095,4.97,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,31,0,1,1,514,8,0,372
1,630423337322,107.23,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,41,0,1,1,241,4,0,431
2,38859492057661,220.11,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,57,0,1,1,390,0,1,308
3,3534093764340240,45.0,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,52,0,1,1,360,2,1,330
4,375534208663984,41.96,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,33,0,1,1,297,9,1,116


In [94]:
# Summarise the columns, their dtypes and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 1852393
Data columns (total 17 columns):
 #   Column            Dtype  
---  ------            -----  
 0   cc_num            int64  
 1   amt               float64
 2   lat               float64
 3   long              float64
 4   city_pop          int64  
 5   unix_time         int64  
 6   merch_lat         float64
 7   merch_long        float64
 8   is_fraud          int64  
 9   age               int32  
 10  hour              int32  
 11  day               int32  
 12  month             int32  
 13  merchant_indexer  int64  
 14  category_indexer  int64  
 15  gender_indexer    int64  
 16  job_indexer       int64  
dtypes: float64(5), int32(4), int64(8)
memory usage: 226.1 MB


### Splitting dataset into train and test

In [92]:
# Positional 80/20 split: the first 80% of rows form the training set and the
# remainder the test set, so the result depends on the current row order of df.
split_ratio = 0.8
split_index = int(len(df) * split_ratio)

train_df, test_df = df.iloc[:split_index], df.iloc[split_index:]

# Separate the is_fraud target from the feature columns in each part.
X_train, y_train = train_df.drop(columns='is_fraud'), train_df['is_fraud']
X_test, y_test = test_df.drop(columns='is_fraud'), test_df['is_fraud']


In [None]:
# Optional export of the two splits to CSV; left commented out.
#train_df.to_csv('clean_train.csv', index=False)
#test_df.to_csv('clean_test.csv', index=False)

In [97]:
# Feature-matrix shapes on the first line, target shapes on the second.
print(f"{X_train.shape} {X_test.shape}\n{y_train.shape} {y_test.shape}")

(1481915, 16) (370479, 16)
(1481915,) (370479,)
