In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the Kaggle creditcard.csv file into a DataFrame and preview it.
DATA_PATH = '/kaggle/input/credircard/creditcard.csv'
df = pd.read_csv(DATA_PATH)

# First five rows, rendered as the cell output.
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Descriptive statistics for the frame's columns, rendered as the cell output.
df.describe()

In [None]:
# One 30-bin histogram per column, laid out on a single 12x10 figure.
hist_options = {'bins': 30, 'figsize': (12, 10), 'grid': False}
df.hist(**hist_options)

# df.hist leaves its figure as the current one, so the super-title lands on it.
plt.gcf().suptitle('Histograms of Columns')
plt.show()

In [None]:
# Pie chart of the target class balance.
#
# Wedge sizes and wedge labels must come from the SAME value_counts() result:
# unique() returns values in order of first appearance while value_counts()
# sorts by descending frequency, so pairing the two only matches by
# coincidence and can silently attach the wrong label to a wedge.
class_counts = df.Class.value_counts()
labels = class_counts.index.to_numpy()
sizes = class_counts.values

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.3f%%')
ax.set_title('Target Variable Value Counts')
plt.show()

# Exact counts behind the percentages shown in the chart.
print(class_counts)

In [None]:
# Letter-value (boxen) plot comparing transaction amounts between the two classes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxenplot(data=df, x='Class', y='Amount', ax=ax)
ax.set_title('Transaction Amount by Class ')
ax.set_xlabel('Class')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Coarse (6-bin) histogram of transaction amounts, followed by a few summary numbers.
amounts = df.Amount.values

plt.hist(amounts, 6, histtype='bar', facecolor='g')
plt.show()

print("Minimum amount value is ", amounts.min())
print("Maximum amount value is ", amounts.max())
print("90% of the transactions have an amount less or equal than ", np.percentile(amounts, 90))

In [None]:
# Scatter of amount against time, restricted to rows labelled Class == 1.
fraud = df.loc[df['Class'] == 1]

fig, ax = plt.subplots()
ax.scatter(fraud['Time'], fraud['Amount'], alpha=0.5)
ax.set_title('Fraudulent Transactions Over Time')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Amount distribution split by class; the y-axis is logarithmic so the much
# smaller class remains visible next to the dominant one.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Amount', hue='Class', bins=50, ax=ax)
ax.set_title('Distribution of Transaction Amount by Class')
ax.set_yscale('log')
plt.show()

In [None]:
# Distribution of the Time column, coloured by class (50 bins, linear y-axis).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Time', hue='Class', bins=50, ax=ax)
ax.set_title('Distribution of Time by Class')
plt.show()

In [None]:
# Pairwise correlations for a fixed subset of columns: Amount, Time, Class
# and the first four V-features.
selected_columns = ['Amount', 'Time', 'Class', 'V1', 'V2', 'V3', 'V4']
correlation_matrix = df[selected_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Selected Features')
plt.show()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Build the train/test matrices used by every model cell below.
#
# Order matters: the data is split FIRST and the scaler is fitted on the
# training rows only. Fitting StandardScaler on the full dataset before
# splitting leaks test-set statistics into training. `df` itself is left
# untouched so earlier displays of it stay valid and this cell is idempotent.
from sklearn.preprocessing import StandardScaler

# Separate features and target by position: columns 1..29 are the inputs
# (column 0 is skipped) and column 30 is the target.
X = df.iloc[:, 1:30].to_numpy(dtype=float)
y = df.iloc[:, 30].to_numpy(dtype=float)

# Split the data. stratify=y keeps the class ratio identical in both splits,
# which matters because the positive class is extremely rare.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features using statistics learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# The model cells read X_train / X_test, so bind those names to the scaled
# arrays; otherwise the scaling above would never reach the models.
X_train, X_test = X_train_scaled, X_test_scaled

# Class counts in the training split (label -> number of rows).
print("Training set class counts:", dict(zip(*np.unique(y_train, return_counts=True))))
print('\n')
print('X.shape=', X.shape, 'y.shape=', y.shape)

In [None]:
# Report the feature/target shapes of each split.
for split_name, features, target in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'X_{split_name}.shape=', features.shape, f'Y_{split_name}.shape=', target.shape)

In [None]:
# Decision tree limited to depth 4, seeded for repeatable results.
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=4).fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred = dt_model.predict(X_test)

# Mean accuracy on the held-out split.
accuracy = dt_model.score(X_test, y_test)
print(f"DecisionTreeClassifier Accuracy: {accuracy:.5f}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the decision-tree predictions (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred)

# plot() returns the display object, so build and draw in one step, then
# title the axes it drew on.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix for Decision Tree Classifier')
plt.show()

In [None]:
# Random forest with 100 trees, seeded for repeatable results.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_rf = rf_model.predict(X_test)

# Mean accuracy on the held-out split.
accuracy_rf = rf_model.score(X_test, y_test)
print(f"RandomForestClassifier Accuracy: {accuracy_rf:.5f}")

In [None]:
# Support-vector classifier with default kernel settings and C=1.0.
# NOTE: SVC has no `dual` option - that parameter belongs to LinearSVC.
# Kernel SVC training and prediction are costly on a dataset of this size,
# so expect this to be the slowest model cell.
svm_model = SVC(C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_svm = svm_model.predict(X_test)

# Accuracy is the fraction of matching labels. Computing it from the
# predictions above avoids .score(), which would run the expensive
# predict() pass over X_test a second time.
accuracy_svm = float(np.mean(y_pred_svm == y_test))

print("SVC Accuracy: {0:.5f}".format(accuracy_svm))

In [None]:
# Confusion matrix of the SVM predictions (rows: actual, columns: predicted).
# The variable is named for the model it describes; it previously carried a
# decision-tree name even though it holds the SVM results.
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)

# Plot the confusion matrix as an annotated heatmap with readable class names.
class_labels = ['Legitimate', 'Fraudulent']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_svm,
    annot=True,
    fmt='d',  # integer counts, not scientific notation
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('SVM Confusion Matrix')
plt.show()

In [None]:
# Logistic regression with the iteration cap raised to 1000, seeded for
# repeatable results. fit() returns the estimator itself, so construction
# and training are chained.
logreg_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Hard class predictions on the held-out split (reused by later cells).
y_pred_logreg = logreg_model.predict(X_test)

# Mean accuracy on the held-out split.
accuracy_logreg = logreg_model.score(X_test, y_test)
print(f"LogisticRegression Accuracy: {accuracy_logreg:.5f}")


In [None]:
# Bar chart comparing held-out accuracy across the four models.
# NOTE: with a heavily imbalanced target, accuracy alone says little about
# how well the rare class is detected; read it alongside the per-class
# reports and confusion matrices.
model_names = ['Decision Tree', 'SVM', 'Random Forest', 'Logistic Regression']
accuracies = [accuracy, accuracy_svm, accuracy_rf, accuracy_logreg]

# Plotting the accuracies
plt.figure(figsize=(10, 6))
plt.bar(model_names, accuracies, color=['blue', 'green', 'red', 'purple'])
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Models')

# Zoom the y-axis so small differences are visible, but never cut off a bar:
# start at 0.99 when every model clears it, otherwise just below the weakest
# model (clamped at 0).
y_lower = max(0.0, min(0.99, min(accuracies) - 0.001))
plt.ylim(y_lower, 1.0)
plt.show()

In [None]:
# Rank the model inputs by the random forest's impurity-based importances.
# df.columns[1:30] are the same column positions the feature matrix was built from.
importance_table = pd.DataFrame(
    {
        'feature': df.columns[1:30],
        'importance': rf_model.feature_importances_,
    }
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bars for the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Most Important Features')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for every model, using readable class names.
class_names = ['Legitimate', 'Fraudulent']

# Generate one report per model, in the same order as the names on the left.
report_dt, report_svm, report_rf, report_logreg = (
    classification_report(y_test, predictions, target_names=class_names)
    for predictions in (y_pred, y_pred_svm, y_pred_rf, y_pred_logreg)
)

# Print the reports, each under its own heading.
for heading, report in (
    ("Decision Tree Classifier Report:\n", report_dt),
    ("SVM Classifier Report:\n", report_svm),
    ("Random Forest Classifier Report:\n", report_rf),
    ("Logistic Regression Classifier Report:\n", report_logreg),
):
    print(heading, report)