In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE

In [29]:
# Load the dataset
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [30]:
print(credit_card_data.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [31]:
print(credit_card_data.tail())

        Time        V1        V2        V3        V4        V5        V6  \
85254  60679  1.258467  0.407363  0.160962  1.034153  0.175374 -0.344634   
85255  60680  1.170321 -0.791258  0.341092 -0.498724 -1.234283 -1.024154   
85256  60681  0.826449 -0.246364 -0.220287  1.144915  0.119542  0.013537   
85257  60682  1.201303 -0.309688 -0.314367  0.167426  1.540306  4.059730   
85258  60684 -1.035410  1.278230  1.652904  0.574609 -0.222304  0.410386   

             V7        V8        V9  ...       V21       V22       V23  \
85254  0.227623 -0.227186 -0.203630  ... -0.081358 -0.162500 -0.154661   
85255 -0.282613 -0.248967 -0.867952  ... -0.577634 -1.495287  0.148887   
85256  0.454082 -0.030595 -0.481902  ...  0.142593  0.029994 -0.281769   
85257 -1.136824  1.090075  0.760974  ... -0.113604 -0.280918 -0.043147   
85258 -0.100852  0.859145 -0.699598  ... -0.104574 -0.408998 -0.128501   

            V24       V25       V26       V27       V28  Amount  Class  
85254 -0.429501  0.697308

In [32]:
print(credit_card_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85259 entries, 0 to 85258
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    85259 non-null  int64  
 1   V1      85259 non-null  float64
 2   V2      85259 non-null  float64
 3   V3      85259 non-null  float64
 4   V4      85259 non-null  float64
 5   V5      85259 non-null  float64
 6   V6      85259 non-null  float64
 7   V7      85259 non-null  float64
 8   V8      85259 non-null  float64
 9   V9      85259 non-null  float64
 10  V10     85259 non-null  float64
 11  V11     85259 non-null  float64
 12  V12     85259 non-null  float64
 13  V13     85259 non-null  float64
 14  V14     85259 non-null  float64
 15  V15     85259 non-null  float64
 16  V16     85259 non-null  float64
 17  V17     85259 non-null  float64
 18  V18     85259 non-null  float64
 19  V19     85259 non-null  float64
 20  V20     85259 non-null  float64
 21  V21     85259 non-null  float64
 22

In [33]:
print(credit_card_data.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       1
V28       1
Amount    1
Class     1
dtype: int64


In [34]:
print(credit_card_data['Class'].value_counts())

Class
0.0    85054
1.0      204
Name: count, dtype: int64


In [35]:
# Check for and handle missing values in the target column
if credit_card_data['Class'].isnull().sum() > 0:
    credit_card_data = credit_card_data.dropna(subset=['Class'])

In [36]:
# Separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [37]:
print(f"Legit transactions shape: {legit.shape}")
print(f"Fraudulent transactions shape: {fraud.shape}")

Legit transactions shape: (85054, 31)
Fraudulent transactions shape: (204, 31)


In [38]:
# Statistical measures of the data
print(legit.Amount.describe())
print(fraud.Amount.describe())

count    85054.000000
mean        98.365392
std        268.470093
min          0.000000
25%          7.690000
50%         27.000000
75%         89.800000
max      19656.530000
Name: Amount, dtype: float64
count     204.000000
mean      101.986814
std       229.008970
min         0.000000
25%         1.000000
50%         7.550000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64


In [39]:
# Compare the values for both transactions
print(credit_card_data.groupby('Class').mean())

               Time        V1        V2       V3        V4        V5  \
Class                                                                  
0.0    38709.224046 -0.247847 -0.049900  0.70098  0.151762 -0.267214   
1.0    34136.960784 -6.403734  4.412429 -8.46741  5.101621 -4.614515   

             V6        V7        V8        V9  ...       V20       V21  \
Class                                          ...                       
0.0    0.099795 -0.095774  0.047207 -0.013915  ...  0.041531 -0.031152   
1.0   -1.964040 -6.731357  2.942715 -3.057064  ...  0.391757  0.754409   

            V22       V23       V24       V25       V26       V27       V28  \
Class                                                                         
0.0   -0.105502 -0.037276  0.009236  0.133378  0.026126  0.000087  0.001840   
1.0   -0.150411 -0.240026 -0.093307  0.225443  0.088375  0.569295  0.040876   

           Amount  
Class              
0.0     98.365392  
1.0    101.986814  

[2 rows x 30 col

In [40]:
# Create a balanced dataset using SMOTE
X = credit_card_data.drop(columns='Class', axis=1)
Y = credit_card_data['Class']

In [41]:
# Check for and handle missing values in the feature columns
if X.isnull().sum().sum() > 0:
    X = X.fillna(X.mean())

In [42]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [43]:
# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, stratify=Y, random_state=2)

In [44]:
# Handle imbalance using SMOTE
smote = SMOTE(random_state=2)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

In [45]:
print(f"Original X shape: {X.shape}, Resampled X shape: {X_train_resampled.shape}")
print(f"Original Y shape: {Y.shape}, Resampled Y shape: {Y_train_resampled.shape}")

Original X shape: (85258, 30), Resampled X shape: (136086, 30)
Original Y shape: (85258,), Resampled Y shape: (136086,)


In [46]:
# Logistic Regression model
model = LogisticRegression(max_iter=1000)

In [47]:
# Train the model
model.fit(X_train_resampled, Y_train_resampled)

In [48]:
# Accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print(f'Accuracy on Training data: {training_data_accuracy}')

Accuracy on Training data: 0.9793126704395507


In [49]:
# Accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print(f'Accuracy score on Test Data: {test_data_accuracy}')

Accuracy score on Test Data: 0.9794745484400657


In [50]:
# Additional evaluation metrics
precision = precision_score(Y_test, X_test_prediction)
recall = recall_score(Y_test, X_test_prediction)
f1 = f1_score(Y_test, X_test_prediction)
conf_matrix = confusion_matrix(Y_test, X_test_prediction)

In [51]:
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Precision: 0.10077519379844961
Recall: 0.9512195121951219
F1 Score: 0.1822429906542056
Confusion Matrix:
[[16663   348]
 [    2    39]]


In [65]:
import joblib
# Save the model to disk
joblib.dump(model, 'credit_card_fraud_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [66]:
import joblib

# Load the scaler
scaler = joblib.load('scaler.pkl')

# Check the number of features the scaler expects
print(scaler.n_features_in_)

# Load the model
model = joblib.load('credit_card_fraud_model.pkl')

# Check the number of features the model expects
print(model.n_features_in_)


30
30


In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

In [70]:
# Load the dataset
data = pd.read_csv('/content/creditcard.csv')

In [71]:
# Select only the necessary features
features = ['V1', 'V2', 'V3', 'V4', 'Amount']
X = data[features]
y = data['Class']

In [72]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [73]:
# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [75]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

In [77]:
# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9987


In [78]:
# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Print confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80      0.33      0.46        98

    accuracy                           1.00     56962
   macro avg       0.90      0.66      0.73     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56856     8]
 [   66    32]]


In [79]:
# Save the model and scaler
joblib.dump(model, 'credit_card_fraud.pkl')
joblib.dump(scaler, 'model.pkl')

['model.pkl']