In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load the dataset

In [None]:
data = pd.read_csv(r"C:\Users\USER\Downloads\credit_card_fraud_detection\creditcard.csv")

# Inspect the first few rows


In [None]:
print("First 5 rows of the dataset:")
print(data.head())

First 5 rows of the dataset:
   Time        V1        V2        V3  ...       V27       V28  Amount  Class
0   0.0 -1.359807 -0.072781  2.536347  ...  0.133558 -0.021053  149.62      0
1   0.0  1.191857  0.266151  0.166480  ... -0.008983  0.014724    2.69      0
2   1.0 -1.358354 -1.340163  1.773209  ... -0.055353 -0.059752  378.66      0
3   1.0 -0.966272 -0.185226  1.792993  ...  0.062723  0.061458  123.50      0
4   2.0 -1.158233  0.877737  1.548718  ...  0.219422  0.215153   69.99      0

[5 rows x 31 columns]


# Summary of the dataset

In [None]:
print("\nDataset summary:")
print(data.info())


Dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float

# Check for missing values

In [None]:
print("\nMissing values in each column:")
print(data.isnull().sum())


Missing values in each column:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


# Class distribution

In [None]:
class_counts = data['Class'].value_counts()
print("\nClass distribution:")
print(class_counts)


Class distribution:
Class
0    284315
1       492
Name: count, dtype: int64


# Plot class distribution

In [None]:
plt.figure(figsize=(6, 4))
sns.barplot(x=class_counts.index, y=class_counts.values, palette="viridis")
plt.title('Class Distribution (0 = Genuine, 1 = Fraudulent)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# Basic statistics for numerical features

In [None]:
print("\nBasic statistics:")
print(data.describe())


Basic statistics:
                Time            V1  ...         Amount          Class
count  284807.000000  2.848070e+05  ...  284807.000000  284807.000000
mean    94813.859575  1.168375e-15  ...      88.349619       0.001727
std     47488.145955  1.958696e+00  ...     250.120109       0.041527
min         0.000000 -5.640751e+01  ...       0.000000       0.000000
25%     54201.500000 -9.203734e-01  ...       5.600000       0.000000
50%     84692.000000  1.810880e-02  ...      22.000000       0.000000
75%    139320.500000  1.315642e+00  ...      77.165000       0.000000
max    172792.000000  2.454930e+00  ...   25691.160000       1.000000

[8 rows x 31 columns]


# Separate majority and minority classes

In [None]:
df_majority = data[data['Class'] == 0]
df_minority = data[data['Class'] == 1]

# Upsample minority class

In [None]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # Sample with replacement
                                 n_samples=len(df_majority),  # Match the number of majority class samples
                                 random_state=42)  # For reproducibility

# Combine the upsampled minority class with the majority class

In [None]:
# Combine the upsampled minority class with the majority class
data_balanced = pd.concat([df_majority, df_minority_upsampled])

# Check new class distribution

In [None]:
print("\nBalanced class distribution:")
print(data_balanced['Class'].value_counts())


Balanced class distribution:
Class
0    284315
1    284315
Name: count, dtype: int64


# Separate features and target variable

In [None]:
# Separate features and target variable
X = data_balanced.drop('Class', axis=1) # Features (drop the 'Class' column)
y = data_balanced['Class'] # Target variable

# Perform Feature scaling

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80% train, 20% test)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model with class balancing

In [None]:
model = LogisticRegression(class_weight='balanced', random_state=42)

# Train the model

In [None]:
model.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', random_state=42)


# Perform Prediction on Test Sets

In [None]:
y_pred = model.predict(X_test)

# Evaluation - Classification report

In [None]:
# Generate the classification report
report = classification_report(y_test, y_pred, target_names=['Genuine', 'Fraudulent'])
print("Classification Report:")
print(report)

# Explanation:
# The classification report includes the precision, recall, f1-score, and support for each class.
# - Precision: The ratio of correctly predicted positive observations to the total predicted positives.
# - Recall: The ratio of correctly predicted positive observations to the all observations in actual class.
# - F1-Score: The weighted average of Precision and Recall.
# - Support: The number of actual occurrences of the class in the specified dataset.

Classification Report:
              precision    recall  f1-score   support

     Genuine       0.92      0.98      0.95     56746
  Fraudulent       0.98      0.92      0.95     56980

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726



# Confusion Matrix

In [None]:
# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Genuine', 'Fraudulent'], yticklabels=['Genuine', 'Fraudulent'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

Confusion Matrix:
[[55503  1243]
 [ 4521 52459]]


# Training a Random Forest Classifier

In [None]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Perform Prediction on Test Sets
y_pred_rf = rf_model.predict(X_test)


In [None]:

# Generate the classification report
rf_report = classification_report(y_test, y_pred_rf, target_names=['Genuine', 'Fraudulent'])
print("Random Forest Classification Report:")
print(rf_report)


In [None]:

# Generate the confusion matrix
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Confusion Matrix:")
print(rf_conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(rf_conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Genuine', 'Fraudulent'], yticklabels=['Genuine', 'Fraudulent'])
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()