In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import joblib

In [3]:
# Read the transactions CSV from the current working directory, then print
# the frame's structural summary (columns, non-null counts, dtypes).
source_csv = 'creditcard.csv'
data = pd.read_csv(source_csv)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# Keep only the rows whose Amount lies inside the 1.5 * IQR fences (both
# fences inclusive); rows outside that range are removed from `data`.
Q1, Q3 = data['Amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
amount_in_range = data['Amount'].between(lower_bound, upper_bound)
data = data[amount_in_range]

In [5]:
# Drop every row that contains a missing value.  The result is rebound
# rather than applied with `inplace=True`: `data` may itself be a filtered
# selection of an earlier frame, and mutating such a selection in place can
# raise pandas' SettingWithCopyWarning.  Rebinding also keeps the cell safe
# to re-run.
data = data.dropna()

In [6]:
# Cut-off compared against the absolute correlations when selecting features.
correlation_threshold = 0.1

# Take the `Class` column of the pairwise correlation matrix and discard the
# sign, leaving one non-negative strength value per column of `data`.
class_column_corr = data.corr()['Class']
correlation_with_class = class_column_corr.abs()

In [7]:
# Columns whose absolute correlation with `Class` is strictly below the
# threshold are removed from `data`; everything else is kept.
is_weak_feature = correlation_with_class < correlation_threshold
low_correlation_features = correlation_with_class.loc[is_weak_feature].index
data = data.drop(low_correlation_features, axis='columns')

In [10]:
# Print a summary of the reduced frame, then separate the `Class` target (y)
# from the remaining feature columns (X).
data.info()
y = data['Class']
X = data.drop(columns=['Class'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252903 entries, 0 to 284805
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V1      252903 non-null  float64
 1   V2      252903 non-null  float64
 2   V3      252903 non-null  float64
 3   V4      252903 non-null  float64
 4   V5      252903 non-null  float64
 5   V7      252903 non-null  float64
 6   V9      252903 non-null  float64
 7   V10     252903 non-null  float64
 8   V11     252903 non-null  float64
 9   V12     252903 non-null  float64
 10  V14     252903 non-null  float64
 11  V16     252903 non-null  float64
 12  V17     252903 non-null  float64
 13  V18     252903 non-null  float64
 14  Class   252903 non-null  int64  
dtypes: float64(14), int64(1)
memory usage: 30.9 MB


In [16]:
# Write the current `data` frame to CSV; the row index is not written.
data.to_csv(path_or_buf='treated_data.csv', index=False)

In [11]:
# Hold out 20% of the rows for evaluation.  `stratify=y` keeps the proportion
# of each `Class` value the same in the train and test partitions; without it
# a rare class can end up under- or over-represented in the 20% test slice.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Save each partition with its label appended as the last column.
# concat(axis=1) aligns features and labels on their shared row index, and
# the index itself is not written to the files.
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('train_credit_card_transactions.csv', index=False)

test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('test_credit_card_transactions.csv', index=False)

In [13]:
# Fit the scaler on the training rows only, then standardise those same rows
# with the statistics it just learned.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [14]:
# Fit a logistic-regression classifier on the standardised training features.
# `fit` returns the estimator itself, and the trailing bare name keeps the
# estimator repr as the cell's displayed output.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
model

LogisticRegression(random_state=42)

In [15]:
# The classifier was trained on features standardised by `scaler`, so new
# data must go through the same fitted scaler before `model.predict`.
# Persist the scaler next to the model so the preprocessing can be reproduced
# at inference time.
joblib.dump(scaler, 'fraud_detection_scaler.pkl')

# Kept as the last expression so the cell still displays the model file name.
joblib.dump(model, 'fraud_detection_model.pkl')

['fraud_detection_model.pkl']