**Import all necessary libraries**

In [30]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
# Load the transactions pickled for 2018-07-25.
# NOTE: unpickling can execute arbitrary code, so only read files from a trusted source.
df1 = pd.read_pickle("/content/drive/MyDrive/2018-07-25.pkl")

# Quick sanity check: first five rows, (rows, columns), then the column index.
for df1_summary in (df1.head(), df1.shape, df1.columns):
    print(df1_summary)

         TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  \
1102483         1102483 2018-07-25 00:00:29        1111        2328   
1102484         1102484 2018-07-25 00:01:08         676        6846   
1102485         1102485 2018-07-25 00:01:35         402        4771   
1102486         1102486 2018-07-25 00:01:43        4218         863   
1102487         1102487 2018-07-25 00:02:26        3711        3599   

         TX_AMOUNT TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
1102483      40.77         9936029          115         0                  0  
1102484       9.62         9936068          115         0                  0  
1102485      81.55         9936095          115         0                  0  
1102486      23.10         9936103          115         0                  0  
1102487      59.25         9936146          115         0                  0  
(9541, 9)
Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT',

In [None]:
# Load the transactions pickled for 2018-07-26.
# NOTE: unpickling can execute arbitrary code, so only read files from a trusted source.
df2 = pd.read_pickle("/content/drive/MyDrive/2018-07-26.pkl")

# Quick sanity check: first five rows, (rows, columns), then the column index.
for df2_summary in (df2.head(), df2.shape, df2.columns):
    print(df2_summary)

         TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  \
1112024         1112024 2018-07-26 00:01:41        3619        6969   
1112025         1112025 2018-07-26 00:02:58        1037        7562   
1112026         1112026 2018-07-26 00:03:03        4657        6540   
1112027         1112027 2018-07-26 00:04:01        3209        8836   
1112028         1112028 2018-07-26 00:04:57        3281        3396   

         TX_AMOUNT TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
1112024      42.97        10022501          116         0                  0  
1112025      60.87        10022578          116         0                  0  
1112026      18.60        10022583          116         0                  0  
1112027      90.94        10022641          116         0                  0  
1112028      11.88        10022697          116         0                  0  
(9787, 9)
Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT',

**Merging both datasets**

In [None]:
# Stack the two daily frames row-wise; ignore_index=True discards the original
# row labels and renumbers the combined frame from 0.
daily_frames = [df1, df2]
final_df = pd.concat(daily_frames, ignore_index=True)

# Show the combined size, then both ends of the frame to confirm that rows
# from each day are present.
for combined_view in (final_df.shape, final_df.head(), final_df.tail()):
    print(combined_view)


(19328, 9)
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0         1102483 2018-07-25 00:00:29        1111        2328      40.77   
1         1102484 2018-07-25 00:01:08         676        6846       9.62   
2         1102485 2018-07-25 00:01:35         402        4771      81.55   
3         1102486 2018-07-25 00:01:43        4218         863      23.10   
4         1102487 2018-07-25 00:02:26        3711        3599      59.25   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0         9936029          115         0                  0  
1         9936068          115         0                  0  
2         9936095          115         0                  0  
3         9936103          115         0                  0  
4         9936146          115         0                  0  
       TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
19323         1121806 2018-07-26 23:55:08         161        9854     121.09   


In [None]:
# Report the size and the column names of the merged frame.
overview = (
    ("data shape:", final_df.shape),
    ("data columns:", final_df.columns),
)
for label, value in overview:
    print(label, value)

data shape: (19328, 9)
data columns: Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD',
       'TX_FRAUD_SCENARIO'],
      dtype='object')


**feature and target split**

In [None]:
X = final_df.drop(["TX_FRAUD", "TX_DATETIME", "TRANSACTION_ID", "CUSTOMER_ID", "TERMINAL_ID"],axis=1)
y = final_df["TX_FRAUD"]


**train test split**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


**Model training**

In [25]:
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3830
           1       1.00      1.00      1.00        36

    accuracy                           1.00      3866
   macro avg       1.00      1.00      1.00      3866
weighted avg       1.00      1.00      1.00      3866



**Model Evaluation**

In [20]:
# Accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy on training data:', training_data_accuracy)


Accuracy on training data: 1.0


In [21]:
# accuract on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy on testing data:', testing_data_accuracy)


Accuracy on testing data: 1.0


In [27]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='f1')
print("Cross-validated F1-scores:", scores)
print("Mean F1-score:", scores.mean())


Cross-validated F1-scores: [1. 1. 1. 1. 1.]
Mean F1-score: 1.0


In [29]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))


[[3830    0]
 [   0   36]]


In [22]:
print(final_df['TX_FRAUD'].value_counts())


TX_FRAUD
0    19147
1      181
Name: count, dtype: int64


**Builidng a predictive system**

In [33]:
# Input data for one transaction
input_data = ( (40.77, 9936029, 115, 0))

# Convert to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape for single prediction
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict
prediction = model.predict(input_data_reshaped)

if prediction[0] == 0:
    print("Transaction is NORMAL")
else:
    print("Transaction is FRAUD")


Transaction is NORMAL




In [None]:
# Persist the fitted classifier with joblib so it can be reloaded later via
# joblib.load; the path is relative to the current working directory.
model_path = "fraud_detection_model.pkl"
joblib.dump(model, model_path)

# Confirmation message (the leading newline separates it from earlier output).
print("\n Model saved as fraud_detection_model.pkl")



 Model saved as fraud_detection_model.pkl
