In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Step 1: Load the dataset
data = pd.read_csv('edited_dataset.csv')

# Step 2: Preprocess the data

# Features are every column except the last two; the target is the
# second-to-last column.
# NOTE(review): assumes 'isFraud' is the second-to-last column of the CSV and
# that the last column is intentionally dropped -- confirm against the file.
# The explicit .copy() makes X an independent frame, so the in-place column
# encoding below never writes into (or warns about) a slice of `data`.
X = data.iloc[:, :-2].copy()  # Features
y = data.iloc[:, -2]          # Target (0 for non-fraud, 1 for fraud)

# Identify categorical (object-dtype) features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Replace each categorical column with integer codes. A fresh encoder is
# fitted per column, so codes are only meaningful within their own column.
for feature in categorical_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])

# Split the dataset into training and testing halves.
# NOTE(review): the split is not stratified; with a rare positive class the
# fraud ratio can differ between the halves. Consider stratify=y (this would
# change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Step 3: Train the Random Forest model

# class_weight='balanced' weights classes inversely to their frequency, so
# the minority class (fraud cases) gets more importance during training.
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)
rf_model.fit(X_train, y_train)


# Step 4: Make predictions on the test set

# predict() would label a sample positive only when its probability exceeds
# 0.5. To improve recall, the decision threshold is lowered from 0.5 to 0.3
# and applied to the predicted probability of the positive class.
# Column 1 of predict_proba is the second entry of rf_model.classes_
# (class 1 when the labels are 0/1).
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
threshold = 0.3
y_pred = (y_pred_prob >= threshold).astype(int)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print(f'The Accuracy is {accuracy:.4f}')
print(f'The Precision is {precision:.4f}')
print(f'The Recall is {recall:.4f}')
print(f'The F1 Score is {f1:.4f}')
print('The Confusion Matrix is')
print(conf_matrix)



The Accuracy is 0.9948
The Precision is 0.8667
The Recall is 0.5417
The F1 Score is 0.6667
The Confusion Matrix is
[[2474    2]
 [  11   13]]
