<a href="https://colab.research.google.com/github/Dan-Analyst/DATA_ANALYSIS_-PROJECTS/blob/main/Frauddetect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
path = "/content/drive/MyDrive/credit card fraud detection/creditcard.csv"
data = pd.read_csv(path)



In [5]:
# Basic information about the dataset
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
# Check for missing values
print("Missing values in each column:\n", data.isnull().sum())

Missing values in each column:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [7]:
# Check for duplicates
duplicates = data.duplicated().sum()
print("Number of duplicate entries:", duplicates)

Number of duplicate entries: 1081


In [8]:
# Drop duplicates if any
if duplicates > 0:
    data = data.drop_duplicates()


In [9]:
# Statistical summary to check for anomalies
print(data.describe())

                Time             V1             V2             V3  \
count  283726.000000  283726.000000  283726.000000  283726.000000   
mean    94811.077600       0.005917      -0.004135       0.001613   
std     47481.047891       1.948026       1.646703       1.508682   
min         0.000000     -56.407510     -72.715728     -48.325589   
25%     54204.750000      -0.915951      -0.600321      -0.889682   
50%     84692.500000       0.020384       0.063949       0.179963   
75%    139298.000000       1.316068       0.800283       1.026960   
max    172792.000000       2.454930      22.057729       9.382558   

                  V4             V5             V6             V7  \
count  283726.000000  283726.000000  283726.000000  283726.000000   
mean       -0.002966       0.001828      -0.001139       0.001801   
std         1.414184       1.377008       1.331931       1.227664   
min        -5.683171    -113.743307     -26.160506     -43.557242   
25%        -0.850134      -0.6898

In [10]:
# Feature scaling
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])  # Scaling the 'Amount' feature

In [11]:
# Split the data into features and target
X = data.drop(['Class', 'Time'], axis=1)  # Dropping 'Time' as it may not be relevant
y = data['Class']

In [12]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_predictions = log_reg.predict(X_test)


In [14]:
# Evaluation
print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_predictions))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.88      0.54      0.67        90

    accuracy                           1.00     56746
   macro avg       0.94      0.77      0.84     56746
weighted avg       1.00      1.00      1.00     56746



In [15]:
# Decision Tree model for comparison
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_predictions = tree_model.predict(X_test)


In [16]:
# Evaluation
print("Decision Tree Classification Report:")
print(classification_report(y_test, tree_predictions))


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.72      0.73      0.73        90

    accuracy                           1.00     56746
   macro avg       0.86      0.87      0.86     56746
weighted avg       1.00      1.00      1.00     56746



In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.ensemble import RandomForestClassifier


In [18]:
data['Amount'] = StandardScaler().fit_transform(data[['Amount']])
data['Time'] = data['Time'].apply(lambda x: np.cos(x * 2 * np.pi / 86400))


In [22]:
# Feature Engineering: Interaction terms (example)
data['Amount_to_mean'] = data['Amount'] / data.groupby('Class')['Amount'].transform('mean')


In [23]:
# Splitting the dataset
X = data.drop(['Class'], axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:

from tensorflow.keras.layers import Dense, Dropout, Input

# Neural Network Model using an Input layer
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Use Input layer to specify input shape
    Dense(16, activation='relu'),
    Dropout(0.1),
    Dense(16, activation='relu'),
    Dropout(0.1),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1, validation_split=0.2)


Epoch 1/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8419 - loss: 25.1838 - val_accuracy: 0.9981 - val_loss: 0.0031
Epoch 2/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9966 - loss: 0.0419 - val_accuracy: 0.9982 - val_loss: 0.0027
Epoch 3/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9974 - loss: 0.0162 - val_accuracy: 0.9981 - val_loss: 0.0025
Epoch 4/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9981 - loss: 0.0093 - val_accuracy: 0.9982 - val_loss: 0.0022
Epoch 5/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9987 - loss: 0.0040 - val_accuracy: 0.9996 - val_loss: 0.0018
Epoch 6/10
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9993 - loss: 0.0033 - val_accuracy: 0.9997 - val_loss: 0.0015
Epoch 7/10
[1m710/710[0m 

<keras.src.callbacks.history.History at 0x7ff513dd6310>

In [26]:
# Predictions and evaluation
nn_predictions = (model.predict(X_test) > 0.5).astype(int)
print("Neural Network Classification Report:")
print(classification_report(y_test, nn_predictions))

[1m1774/1774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step  
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.97      0.86      0.91        90

    accuracy                           1.00     56746
   macro avg       0.99      0.93      0.96     56746
weighted avg       1.00      1.00      1.00     56746

