**RULE 003**

In [59]:
import pandas as pd

# Load the dataset
df = pd.read_csv("dataset_1.csv")
print("Number of entries:", df.shape[0])
# Drop 'latitude' and 'longitude' features
df = df.drop(columns=["Unnamed: 0",'latitude', 'longitude'])

# Check for NaN values in the DataFrame
null_columns = df.columns[df.isnull().any()]
print("Columns with NaN values:")
print(null_columns)


Number of entries: 1296675
Columns with NaN values:
Index([], dtype='object')


In [131]:
from sklearn.utils import resample

# Assuming you have a DataFrame 'data' with columns 'merchant_category_code', 'transaction_amount', and 'is_fraud'

# Count the occurrences of each label
label_counts = df['is_fraud'].value_counts()

# Find the minimum count among all labels
min_count = label_counts.min()

# Separate data into fraud and non-fraud instances
fraud_data = df[df['is_fraud'] == 1]
non_fraud_data = df[df['is_fraud'] == 0]

# Undersample the non-fraud instances to match the number of fraud instances
undersampled_non_fraud_data = resample(non_fraud_data, replace=False, n_samples=min_count * 3, random_state=42)

# Combine the undersampled non-fraud instances with the fraud instances
undersampled_data = pd.concat([fraud_data, undersampled_non_fraud_data])

# Now undersampled_data contains the balanced dataset


i-forest

In [132]:
undersampled_data

Unnamed: 0,encryptedHexCardNo,merchantCategoryCode,transactionAmount,dateTimeTransaction,is_fraud
2449,4613314721966,4,281.06,1325466397,1
2472,340187018810220,2,11.52,1325468849,1
2523,340187018810220,4,276.31,1325473523,1
2546,4613314721966,2,7.03,1325475483,1
2553,340187018810220,4,275.73,1325476547,1
...,...,...,...,...,...
468746,4481131401752,0,25.78,1343249747,0
1101209,675990301623,4,105.49,1364964195,0
385377,60422928733,10,2.01,1340642180,0
556874,4904681492230012,7,77.84,1345917308,0


In [139]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import recall_score

# Load the dataset
df = pd.read_csv("dataset_1.csv")

# Drop 'latitude' and 'longitude' features
df = df.drop(columns=["Unnamed: 0",'latitude', 'longitude'])
print(df.head())

# Separate features and target variable
X = undersampled_data.drop(columns=['is_fraud'])  # Features
y = undersampled_data['is_fraud']  # Target variable

# Split the dataset into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, train_size=0.8, random_state=42)

# Train the Isolation Forest model
iforest = IsolationForest(n_estimators=100)
iforest.fit(X_train)

# Evaluate the model considering 'IsFraud' during testing
y_pred_test = iforest.predict(X_test)
y_pred_test[y_pred_test == 1] = 0  # Convert inliers (non-fraud) to 0
y_pred_test[y_pred_test == -1] = 1  # Convert outliers (fraud) to 1

# Calculate recall for fraud detection
recall = recall_score(y_test, y_pred_test)
print("Recall for fraud detection:", recall)


   encryptedHexCardNo  merchantCategoryCode  transactionAmount  \
0    2703186189652095                     8               4.97   
1        630423337322                     4             107.23   
2      38859492057661                     0             220.11   
3    3534093764340240                     2              45.00   
4     375534208663984                     9              41.96   

   dateTimeTransaction  is_fraud  
0           1325376018         0  
1           1325376044         0  
2           1325376051         0  
3           1325376076         0  
4           1325376186         0  
Recall for fraud detection: 0.6625


In [141]:
data = {
    "encryptedHexCardNo": 2703186189652095,
    "merchantCategoryCode": 8,
    "transactionAmount" : 4.97,
    "dateTimeTransaction": 1325376018,

}

df_test_1 = pd.DataFrame(data, index=[0])


result for query

In [142]:
y_pred = iforest.predict(df_test_1)
# Convert predicted labels to fraud (1) or non-fraud (0)
if y_pred !=1:
    prediction = "Fraud"
else:
    prediction = "Non-fraud"

print("Prediction:", prediction)

Prediction: Fraud


RULE004

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import recall_score , confusion_matrix

# Load the dataset
df = pd.read_csv("dataset_1.csv")

# Drop 'latitude' and 'longitude' features
df = df.drop(columns=['Unnamed: 0','latitude', 'longitude', 'encryptedHexCardNo'])
print(df.head())

# Separate features and target variable
X = df.drop(columns=['is_fraud'])  # Features
y = df['is_fraud']  # Target variable

# Split the dataset into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, train_size=0.8, random_state=42)


# Train the Isolation Forest model
iforest_merchan = IsolationForest(n_estimators=100)
iforest_merchan.fit(X_train)

# Evaluate the model considering 'IsFraud' during testing
y_pred_test = iforest_merchan.predict(X_test)
y_pred_test[y_pred_test == 1] = 0  # Convert inliers (non-fraud) to 0
y_pred_test[y_pred_test == -1] = 1  # Convert outliers (fraud) to 1

# Calculate recall for fraud detection
recall = recall_score(y_test, y_pred_test)
print("Recall for fraud detection:", recall)
# Compute confusion matrix
conf_mat = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_mat)

   merchantCategoryCode  transactionAmount  dateTimeTransaction  is_fraud
0                     8               4.97           1325376018         0
1                     4             107.23           1325376044         0
2                     0             220.11           1325376051         0
3                     2              45.00           1325376076         0
4                     9              41.96           1325376186         0
Recall for fraud detection: 0.8223684210526315
Confusion Matrix:
 [[18622  7160]
 [   27   125]]


In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

# Load the dataset
df = pd.read_csv("dataset_1.csv")

# Drop unnecessary features
df = df.drop(columns=['Unnamed: 0', 'latitude', 'longitude'])

# Extract features from 'encryptedHexCardNo' column
# For example, you can extract the length of the hexadecimal string
df['encryptedHexCardNo_length'] = df['encryptedHexCardNo'].apply(lambda x: len(str(x)))

# Separate features and target variable
X = df.drop(columns=['is_fraud'])  # Features
y = df['is_fraud']  # Target variable

# Split the dataset into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, train_size=0.8, random_state=42)

# Perform dimensionality reduction using PCA
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train the Local Outlier Factor (LOF) model
lof = LocalOutlierFactor(n_neighbors=10, novelty=True)
lof.fit(X_train_pca)

# Evaluate the LOF model
y_pred_test = lof.predict(X_test_pca)

# Print the results or perform further analysis


In [120]:
data = {
    "merchantCategoryCode": 8,
    "transactionAmount" : 1000000000,
    "dateTimeTransaction": 1325376018,

}

df_test_1 = pd.DataFrame(data, index=[0])


In [121]:
df_test_1

Unnamed: 0,merchantCategoryCode,transactionAmount,dateTimeTransaction
0,8,1000000000,1325376018


In [122]:
y_pred = iforest_merchan.predict(df_test_1)
# Convert predicted labels to fraud (1) or non-fraud (0)
if y_pred == 1:
    prediction = "Fraud"
else:
    prediction = "Non-fraud"

print("Prediction:", prediction)

Prediction: Non-fraud
