In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [4]:
df = pd.read_csv('creditcard.csv')

In [8]:
# Explore the dataset
print(df.head())
print(df.tail())
print(df.info())
print(df.describe())
print(df.isnull().sum())

# Handle missing values (if any)
df.fillna(df.mean(), inplace=True)

# Normalize the data using StandardScaler
scaler = StandardScaler()
df[['Amount']] = scaler.fit_transform(df[['Amount']])

# Convert the 'Class' column to binary (0 for genuine, 1 for fraudulent)
df['Class'] = df['Class'].map({0: 0, 1: 1})

# Split the dataset into features (X) and target (y)
X = df.drop('Class', axis=1)
y = df['Class']

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [12]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [14]:
legit = df[df.Class == 0]
fraud = df[df.Class == 1]

In [16]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [18]:
legit.Amount.describe()

count    284315.000000
mean         -0.000234
std           0.999942
min          -0.353229
25%          -0.330640
50%          -0.265271
75%          -0.045177
max         102.362243
Name: Amount, dtype: float64

In [20]:
fraud.Amount.describe()

count    492.000000
mean       0.135382
std        1.026242
min       -0.353229
25%       -0.349231
50%       -0.316247
75%        0.070128
max        8.146182
Name: Amount, dtype: float64

In [22]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,-0.000234
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,0.135382


In [24]:
legit_sample = legit.sample(n=492)

In [26]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [28]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
101817,67952.0,1.237358,-0.475906,-0.900726,-1.031389,1.594505,3.390775,-0.878635,0.866235,0.307419,...,0.050671,-0.062168,-0.091436,1.084677,0.37971,1.081356,-0.066042,0.011158,-0.161921,0
37012,38767.0,0.783044,-0.949829,0.794007,0.628609,-0.993433,0.602556,-0.6754,0.380413,0.788266,...,0.244274,0.381782,-0.219324,-0.263672,0.154086,0.590405,-0.026416,0.035432,0.406407,0
43700,41618.0,-0.938812,0.654799,2.657016,0.407784,-0.679043,0.676993,-0.136137,0.468805,0.08377,...,0.017777,0.270653,-0.161078,0.262082,-0.006623,0.45696,0.200413,0.128998,-0.2474,0
195237,130963.0,1.301017,-2.555217,-0.852939,-0.677952,-1.397279,0.443054,-0.820247,-0.014419,0.332403,...,0.49182,0.435598,-0.207033,0.083415,-0.479595,-0.272441,-0.048666,0.046872,1.534427,0
222339,142935.0,2.018794,0.116487,-1.730398,0.546102,0.120628,-1.44952,0.241287,-0.341999,0.674318,...,0.204054,0.710348,-0.020169,-0.091803,0.213339,-0.09803,-0.005774,-0.03027,-0.278985,0


In [30]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,1.206024,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,-0.350191,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,-0.041818,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,0.626302,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,-0.183191,1


In [32]:
new_dataset['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [34]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93726.414634,0.012338,-0.053789,-0.006049,-0.015881,-0.047503,0.115137,-0.069346,0.07314,-0.012538,...,0.07766,0.000531,-0.020627,-0.009285,0.017704,0.00798,-0.04135,-0.003648,0.026436,0.063455
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,0.135382


In [38]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [40]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
101817   67952.0  1.237358 -0.475906 -0.900726 -1.031389  1.594505  3.390775   
37012    38767.0  0.783044 -0.949829  0.794007  0.628609 -0.993433  0.602556   
43700    41618.0 -0.938812  0.654799  2.657016  0.407784 -0.679043  0.676993   
195237  130963.0  1.301017 -2.555217 -0.852939 -0.677952 -1.397279  0.443054   
222339  142935.0  2.018794  0.116487 -1.730398  0.546102  0.120628 -1.449520   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [44]:
print(Y)

101817    0
37012     0
43700     0
195237    0
222339    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [48]:
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


In [50]:
model = LogisticRegression()

In [52]:
model.fit(X_train, Y_train)

In [10]:
# Check the class distribution
print(y.value_counts())

# Use SMOTE to oversample the minority class (fraudulent transactions)
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check the new class distribution
print(y_res.value_counts())

Class
0    284315
1       492
Name: count, dtype: int64
Class
0    284315
1    284315
Name: count, dtype: int64


In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [None]:
# Train a logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Train a random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Evaluate the logistic regression model
y_pred_log = log_reg.predict(X_test)
print("Logistic Regression:")
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1-score:", f1_score(y_test, y_pred_log))
print("Classification Report:")
print(classification_report(y_test, y_pred_log))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log))

# Evaluate the random forest classifier
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1-score:", f1_score(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

In [32]:
import joblib
df = pd.read_csv('creditcard.csv')

# Preprocess the data
X = df.drop(['Class'], axis=1)
y = df['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Model accuracy:", accuracy_score(y_test, y_pred))

# Save the trained model to a file
joblib.dump(model, 'model.pkl', protocol=4)

Model accuracy: 0.9995611109160493


['model.pkl']

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Load the dataset
# Assuming 'creditcard.csv' is the dataset file
df = pd.read_csv('creditcard.csv')

# Define features and target
X = df.drop(columns='Class')
y = df['Class']

# Preprocessing pipelines for numerical data
numerical_features = X.columns  # Assuming all columns are numerical
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ])

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Handle class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model to a file
joblib.dump(model, 'fraud_detection_model.pkl', protocol=4)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56750
           1       1.00      1.00      1.00     56976

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726



['fraud_detection_model.pkl']

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the dataset
data = pd.read_csv('creditcard.csv')

# Handle missing values (if any)
data.fillna(method='ffill', inplace=True)

# Normalize the data
scaler = StandardScaler()
data[['Amount', 'Time']] = scaler.fit_transform(data[['Amount', 'Time']])

# Handle class imbalance using SMOTE
X = data.drop('Class', axis=1)
y = data['Class']
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)


  data.fillna(method='ffill', inplace=True)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Train logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Train random forest classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)


In [5]:
from sklearn.metrics import classification_report

# Evaluate logistic regression model
y_pred_log_reg = log_reg.predict(X_test)
log_reg_report = classification_report(y_test, y_pred_log_reg)

# Evaluate random forest classifier
y_pred_rf_clf = rf_clf.predict(X_test)
rf_clf_report = classification_report(y_test, y_pred_rf_clf)


In [7]:
import streamlit as st

# Title of the web app
st.title("Credit Card Fraud Detection")

# Display data
st.subheader("Data Sample")
st.write(data.sample(5))

# Display model evaluation
st.subheader("Logistic Regression Model Evaluation")
st.text(log_reg_report)

st.subheader("Random Forest Classifier Model Evaluation")
st.text(rf_clf_report)

# Allow users to input new transaction data
st.subheader("Predict New Transaction")
amount = st.number_input('Transaction Amount')
time = st.number_input('Transaction Time')
# Add inputs for other features as needed

if st.button('Predict'):
    input_data = [[amount, time]]  # Extend with other features
    input_data = scaler.transform(input_data)
    prediction = log_reg.predict(input_data)
    st.write('Fraudulent' if prediction[0] else 'Genuine')
