In [None]:
# Optional one-time setup: uncomment to install the third-party packages imported below.
# NOTE(review): pandas is also imported by this notebook but is not listed here.
#!pip install scipy
#!pip install scikit-learn
#!pip install imbalanced-learn

In [None]:
import scipy
import sklearn
import imblearn
import warnings
# Suppress all warnings for the rest of the session. The second filter is already
# covered by the blanket 'ignore' above it and is kept only to preserve the original setup.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Fixed: a stray trailing '=' on this line was a SyntaxError that stopped the whole
# cell from running, leaving every sklearn name used later undefined.
from sklearn.linear_model import LogisticRegression

# **Data Understanding and Preprocessing**

## **Data exploration**

In [None]:
import pandas as pd

# Read the raw transactions file into memory; every later cell works from `dataset`.
DATA_PATH = 'fraud_detection_data.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Class balance of the target: number of rows per 'Is Fraudulent' value.
target_labels = dataset['Is Fraudulent']
fraud_distribution = target_labels.value_counts()
print(fraud_distribution)

Is Fraudulent
Yes    183
No     180
Name: count, dtype: int64


## **Handle any missing or inconsistent data**

In [None]:
import warnings
# Count null cells per column. This only detects missing values; it does not
# check for inconsistent or malformed entries.
missing_data = dataset.isna().sum()
print(missing_data)

Transaction ID        0
Customer ID           0
Transaction Date      0
Transaction Amount    0
Merchant              0
Location              0
Transaction Type      0
Card Type             0
Is Fraudulent         0
dtype: int64


In [None]:
# Parse the date column into pandas datetimes so the `.dt` accessors used below work.
dataset['Transaction Date'] = dataset['Transaction Date'].pipe(pd.to_datetime)

## **Feature Engineering**

### **First Feature: Time of Day**

In [None]:
# Hour-of-day component of each transaction timestamp; binned into day parts below.
dataset['Transaction Hour'] = dataset['Transaction Date'].dt.hour
def time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Half-open bands: [6, 12) -> 'Morning', [12, 16) -> 'Afternoon',
    [16, 20) -> 'Evening', [20, 24) -> 'Night'. Any value that falls in none of
    these bands (hours 0-5, and also anything out of range) -> 'Mid Night'.
    """
    day_parts = (
        (6, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 20, 'Evening'),
        (20, 24, 'Night'),
    )
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    # Fallback bucket: covers the early hours before 6.
    return 'Mid Night'

# Label every transaction with its day part, derived element-wise from the hour.
dataset['Transaction Time'] = dataset['Transaction Hour'].map(time_of_day)


In [None]:
# Display the number of rows in each 'Transaction Time' bucket.
dataset['Transaction Time'].value_counts()

Unnamed: 0_level_0,count
Transaction Time,Unnamed: 1_level_1
Mid Night,96
Morning,79
Evening,66
Afternoon,62
Night,60


### **Second Feature: Day of the Week**

In [None]:
def categorize_period(day_of_week):
    """Translate a day-of-week number (0 = Monday ... 6 = Sunday) to a short name.

    Returns one of 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', or None when
    the value is not one of 0-6.
    """
    weekday_labels = {
        0: 'Mon',
        1: 'Tues',
        2: 'Wed',
        3: 'Thu',
        4: 'Fri',
        5: 'Sat',
        6: 'Sun',
    }
    return weekday_labels.get(day_of_week)

# Name the weekday of each transaction (pandas numbers Monday as 0).
weekday_numbers = dataset['Transaction Date'].dt.dayofweek
dataset['Week Day'] = weekday_numbers.map(categorize_period)

In [None]:
# Display the number of rows for each weekday label.
dataset['Week Day'].value_counts()

Unnamed: 0_level_0,count
Week Day,Unnamed: 1_level_1
Tues,56
Sat,55
Thu,54
Sun,53
Fri,52
Mon,50
Wed,43


### **Third Feature: Transaction Amount Bin**

In [None]:
# Summary statistics of the raw amounts; the min/max here inform the bin edges chosen below.
dataset['Transaction Amount'].describe()

Unnamed: 0,Transaction Amount
count,363.0
mean,977.349972
std,581.386431
min,14.29
25%,476.7
50%,992.1
75%,1469.63
max,1994.68


In [None]:
# Ordinal amount bands: Low <= 500 < Medium <= 1000 < High <= 1500 < Very High.
# pd.cut yields NaN for any value outside the outermost edges, and a NaN here would
# make the later integer conversion of this column fail. The top band is therefore
# open-ended and the lowest edge (0) is included, so an amount of 0 or one above 2000
# still lands in a band. Binning of amounts in (0, 2000] is unchanged.
amount_bin_edges = [0, 500, 1000, 1500, float('inf')]
amount_bin_labels = ['Low', 'Medium', 'High', 'Very High']
dataset['Transaction_Amount_Bin'] = pd.cut(
    dataset['Transaction Amount'],
    bins=amount_bin_edges,
    labels=amount_bin_labels,
    include_lowest=True,
)

In [None]:
# Display the number of rows in each amount band.
dataset['Transaction_Amount_Bin'].value_counts()

Unnamed: 0_level_0,count
Transaction_Amount_Bin,Unnamed: 1_level_1
Low,99
High,97
Very High,84
Medium,83


### **Fourth Feature: Amount-to-Mean Ratio**

In [None]:
# Each amount expressed as a multiple of the mean amount over all rows in `dataset`.
overall_amount_mean = dataset['Transaction Amount'].mean()
dataset['Amount_to_Mean_Ratio'] = dataset['Transaction Amount'].div(overall_amount_mean)

### **Fifth Feature: Amount-to-Standard-Deviation Ratio**

In [None]:
# Each amount expressed as a multiple of the standard deviation over all rows in `dataset`.
# This is the raw amount divided by a constant, just like Amount_to_Mean_Ratio, so the
# two columns differ only by a constant factor.
overall_amount_std = dataset['Transaction Amount'].std()
dataset['Amount_to_Global_StdDev_Ratio'] = dataset['Transaction Amount'].div(overall_amount_std)

### **Sixth Feature: Fraud-Rate Risk Score**

In [None]:
# Fraud rate per transaction type: (# fraudulent rows of that type) / (# rows of that type).
# The rates are measured on every row of `dataset`, i.e. before any train/test split.
fraud_dataset = dataset[dataset['Is Fraudulent'] == 'Yes']
type_totals = dataset['Transaction Type'].value_counts()
type_frauds = fraud_dataset['Transaction Type'].value_counts()
transaction_type_risk = {
    kind: int(type_frauds.get(kind, 0)) / int(type_totals[kind])
    for kind in dataset['Transaction Type'].unique()
}
transaction_type_risk

{'Online Purchase': 0.504,
 'In-Store Purchase': 0.4260869565217391,
 'ATM Withdrawal': 0.5772357723577236}

In [None]:
# Fraud rate per card type: (# fraudulent rows with that card) / (# rows with that card).
# The rates are measured on every row of `dataset`, i.e. before any train/test split.
fraud_dataset = dataset[dataset['Is Fraudulent'] == 'Yes']
card_totals = dataset['Card Type'].value_counts()
card_frauds = fraud_dataset['Card Type'].value_counts()
Card_type_risk = {
    card: int(card_frauds.get(card, 0)) / int(card_totals[card])
    for card in dataset['Card Type'].unique()
}
Card_type_risk

{'MasterCard': 0.5052631578947369,
 'Discover': 0.5869565217391305,
 'Amex': 0.43529411764705883,
 'Visa': 0.4835164835164835}

In [None]:
def calculate_risk(row):
    """Return the combined risk score for one transaction row.

    The score is the card-type fraud rate plus the transaction-type fraud rate,
    looked up in the module-level `Card_type_risk` and `transaction_type_risk`
    tables. A category missing from a table contributes 0.
    """
    lookups = (
        (Card_type_risk, 'Card Type'),
        (transaction_type_risk, 'Transaction Type'),
    )
    card_part, type_part = (table.get(row[column], 0) for table, column in lookups)
    return card_part + type_part


# Score every transaction row by row.
dataset['Risk'] = dataset.apply(calculate_risk, axis=1)

## **Categorical Encoding**

In [None]:
# Remove the ID columns and the raw date/amount/hour columns that the engineered
# features above were derived from. Re-running this cell without reloading the data
# raises KeyError, because the columns are already gone.
columns_to_drop = [
    'Transaction ID',
    'Customer ID',
    'Transaction Date',
    'Transaction Amount',
    'Transaction Hour',
]
dataset = dataset.drop(columns=columns_to_drop)

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Nominal columns are one-hot encoded; the first level of each is dropped as the baseline.
one_hot_columns = ['Transaction Type', 'Card Type', 'Transaction Time', 'Week Day']
df_encoded = pd.get_dummies(dataset, columns=one_hot_columns, drop_first=True)

# The amount band is ordinal, so it is mapped to increasing integers instead.
mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}
df_encoded['Transaction_Amount_Bin'] = dataset['Transaction_Amount_Bin'].map(mapping).astype(int)

# Binary target: 1 for 'Yes', 0 for anything else.
df_encoded['Is Fraudulent'] = dataset['Is Fraudulent'].eq('Yes').astype('int64')

# Merchant and Location are not encoded and are left out of the modelling frame.
df_encoded = df_encoded.drop(columns=['Merchant', 'Location'])


## **Feature Selection**

In [None]:
# Pairwise correlations between all encoded columns, then the single column of
# correlations against the target.
correlation_matrix = df_encoded.corr()
correlation_with_label = correlation_matrix.loc[:, 'Is Fraudulent']

In [None]:
# Rank features by the strength (absolute value) of their correlation with the target.
correlation_with_label.abs().sort_values(ascending=False)

Unnamed: 0,Is Fraudulent
Is Fraudulent,1.0
Risk,0.162321
Transaction Time_Mid Night,0.144956
Amount_to_Mean_Ratio,0.132346
Amount_to_Global_StdDev_Ratio,0.132346
Transaction_Amount_Bin,0.129347
Transaction Time_Night,0.122347
Transaction Type_In-Store Purchase,0.106295
Card Type_Discover,0.096519
Week Day_Sat,0.050292


In [None]:
# Feature columns fed to the models.
# NOTE(review): Amount_to_Mean_Ratio and Amount_to_Global_StdDev_Ratio are both the raw
# amount divided by a constant, so they carry the same information — consider keeping one.
cols = [
    "Risk",
    "Transaction Time_Mid Night",
    "Amount_to_Mean_Ratio",
    "Amount_to_Global_StdDev_Ratio",
    "Transaction_Amount_Bin",
    "Transaction Time_Night",
    "Transaction Type_In-Store Purchase",
    "Card Type_Discover",
    "Week Day_Thu",
    "Week Day_Mon",
    "Transaction Time_Morning",
]

## **Model Development**

In [None]:
# Feature matrix and binary target, split 80/20 with the class ratio preserved.
X = df_encoded[cols]
y = df_encoded['Is Fraudulent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The upstream 'Risk' column sums per-category fraud rates that were measured on every
# row, so the labels of the held-out rows are baked into a model input (target leakage),
# which can make the test metrics look better than they are. Rebuild the feature from
# the training rows only and use it for both splits.
if 'Risk' in cols:
    train_rows = dataset.loc[X_train.index]
    train_is_fraud = train_rows['Is Fraudulent'].eq('Yes')
    leak_free_risk = 0
    for category_col in ('Card Type', 'Transaction Type'):
        train_rate = train_is_fraud.groupby(train_rows[category_col]).mean()
        # A category never seen in the training rows contributes 0, matching the
        # default used when the feature was first built.
        leak_free_risk = leak_free_risk + dataset[category_col].map(train_rate).fillna(0)
    X_train = X_train.assign(Risk=leak_free_risk.loc[X_train.index])
    X_test = X_test.assign(Risk=leak_free_risk.loc[X_test.index])

# List of models to evaluate
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    #'SVM': SVC(random_state=42),
    #'K-Nearest Neighbors' : KNeighborsClassifier(),
    'Naive Bayes' : GaussianNB(),
}

# Fit each model on the training split and report its metrics on the held-out split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 30)

Logistic Regression Performance:
Accuracy: 0.6712
Precision: 0.6970
Recall: 0.6216
F1 Score: 0.6571
------------------------------
Naive Bayes Performance:
Accuracy: 0.6849
Precision: 0.7188
Recall: 0.6216
F1 Score: 0.6667
------------------------------
