<a href="https://colab.research.google.com/github/Aijsheb/ytta/blob/main/Prak4_G_211_22_0020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Make sure the dataset is present in the working directory.
# The Colab upload dialog is interactive and blocks "Run All", so it is only
# opened when "creditcard.csv" (the file the loading cell reads) is missing.
# `uploaded` is always defined: the upload result, or an empty dict when the
# prompt was skipped.
import os

if os.path.exists("creditcard.csv"):
    uploaded = {}
else:
    from google.colab import files
    uploaded = files.upload()

In [None]:
# Library
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the transactions CSV from the working directory into a DataFrame.
df = pd.read_csv("creditcard.csv")
# Preview the first rows as a quick sanity check of the columns.
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count the missing entries per column (isna is the canonical name for isnull).
df.isna().sum()

In [None]:
# Display every row that has an exact duplicate elsewhere in the frame.
# keep=False flags all members of a duplicate group, not only the repeats.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
df.loc[df.duplicated(keep=False)]

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
# (keep='first' is the pandas default, so it is not spelled out here).
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
df = df.drop_duplicates()

In [None]:
# Count how many transactions carry each value of the 'Class' column.
# The resulting Series is reused by the bar chart in the next cell.
class_proportion = df['Class'].value_counts()
class_proportion

In [None]:
# Bar chart of the per-class transaction counts held in `class_proportion`.
# pyplot (plt) and seaborn (sns) come from the library cell at the top.

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8'
# and later dropped the plain 'seaborn' alias, so use whichever name exists.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Text styling: font1 for the title, font2 for axis labels, font3 for the legend.
font1 = {'family': 'serif',
         'fontstyle': 'italic',
         'fontsize': 16,
         'fontweight': 'bold',
         'color': 'DarkBlue'}
font2 = {'weight': 'bold', 'size': 12}
font3 = {"weight":"normal", "size": 12}

fig, ax = plt.subplots(figsize=(12, 8))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x=class_proportion.index, y=class_proportion.values, palette='Set2', ax=ax)
ax.set_title('Distribution of Credit Card Fraud Class', fontdict=font1)
ax.set_xlabel('Fraud class', fontdict=font2)
ax.set_xticklabels(ax.get_xticklabels(), rotation=8)
ax.set_ylabel('Number of transactions', fontdict=font2)
# Log scale keeps both bars readable when the class counts differ by orders of magnitude.
ax.set_yscale('log')
# The bars carry no labels, so get_legend_handles_labels() returns nothing;
# use the bar patches themselves as handles (one per class, in x-axis order).
ax.legend(handles=list(ax.patches), labels=['0: Legitimate', '1: Fraudulent'], prop=font3,
          title='Fraud Class:', title_fontsize=14,
          frameon=True, facecolor="white")
plt.show()

In [None]:
# Check the proportion of each class to quantify the imbalance.
# normalize=True returns relative frequencies instead of raw counts.
df['Class'].value_counts(normalize=True)

In [None]:
# Split the frame into the target vector and the feature matrix.
# 'Time' is excluded from the features together with the 'Class' label itself.
y = df['Class']
X = df.drop(['Time', 'Class'], axis='columns')

In [None]:
# Oversample the minority class with SMOTE (imported in the library cell) and
# obtain the resampled features and target in a single fit_resample call.
# NOTE(review): this resamples the FULL dataset. Splitting X_oversampled into
# train/test afterwards lets synthetic points interpolated from held-out rows
# reach the training set, which inflates test metrics. Prefer splitting first
# and resampling only the training portion.
sm = SMOTE(k_neighbors=5, random_state=3, sampling_strategy='auto')
X_oversampled, y_oversampled = sm.fit_resample(X, y)


In [None]:
# Compare the class counts before and after SMOTE.
unique_original, counts_original = np.unique(y, return_counts=True)
unique_oversampled, counts_oversampled = np.unique(y_oversampled, return_counts=True)

# Pair each class label with its count so the printout reads as {label: count}.
original_distribution = dict(zip(unique_original, counts_original))
oversampled_distribution = dict(zip(unique_oversampled, counts_oversampled))

print('Original fraud class distribution:', original_distribution)
print('New transformed fraud class distribution:', oversampled_distribution)


In [None]:
# Visualize the class balance of the SMOTE-transformed target variable.

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8'
# and later dropped the plain 'seaborn' alias, so use whichever name exists.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Text styling: font1 for the title, font2 for the axis labels.
font1 = {'family': 'serif',
         'fontstyle': 'italic',
         'fontsize': 16,
         'fontweight': 'bold',
         'color': 'DarkBlue'}
font2 = {'weight': 'bold', 'size': 12}

fig, ax = plt.subplots(figsize=(12, 8))
# Pass the labels as `x=`: current seaborn treats the first positional
# argument as `data`, so a bare Series is no longer counted per class.
sns.countplot(x=y_oversampled, palette='Set2', ax=ax)
ax.set_title('Distribution of Imbalanced Fraud Class Transformed by SMOTE', fontdict=font1)
ax.set_xlabel('Fraud class', fontdict=font2)
# Replace the raw 0/1 tick labels with readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Legitimate', 'Fraudulent'])

ax.set_ylabel('Number of transactions', fontdict=font2)
plt.show()

In [None]:
# Build the train and test subsets without leaking test information.
# Splitting the already-oversampled data would put SMOTE points interpolated
# from test rows into the training set and make the test set synthetic.
# So: hold out a stratified test set from the original X / y, then oversample
# ONLY the training portion. X_test / y_test keep the real class ratio, so
# read accuracy alongside sensitivity, precision and AUC.
# train_test_split and SMOTE are imported in the library cell at the top.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, stratify=y, random_state=3
)
X_train, y_train = SMOTE(
    sampling_strategy='auto', random_state=3, k_neighbors=5
).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Instantiate and fit the random forest (class imported in the library cell).
# random_state pins the bootstrap / feature sampling so that a fresh
# "Restart & Run All" reproduces the same model and metrics; it matches the
# seed used by the other stochastic steps in this notebook.
# n_jobs=-1 builds the trees on all cores and does not change the result.
rfc = RandomForestClassifier(n_estimators=150, random_state=3, n_jobs=-1)
rfc.fit(X_train, y_train)

In [None]:
# Model evaluation - classification accuracy on both subsets.
# A large gap between the two numbers would point to overfitting.
training_rfc_accuracy, testing_rfc_accuracy = (
    rfc.score(X_train, y_train),
    rfc.score(X_test, y_test),
)

print("Training RFC Accuracy:", training_rfc_accuracy)
print("Testing RFC Accuracy:", testing_rfc_accuracy)

In [None]:
# Plot the random forest's confusion matrix on the test subset as a heatmap.
fig, ax = plt.subplots(figsize=(8, 8))
# Title styling.
font1 = {'family': 'serif',
         'fontstyle': 'italic',
         'fontsize': 16,
         'fontweight': 'bold',
         'color': 'DarkRed'}

# Rows are actual classes and columns are predicted classes, matching
# confusion_matrix's [[TN, FP], [FN, TP]] layout for labels 0 and 1.
sns.heatmap(confusion_matrix(y_test, rfc.predict(X_test)),
            cmap='Blues',
            square=True,
            annot=True,
            fmt='d',
            cbar_kws={'shrink': 0.8},
            xticklabels=['Predicted 0s', 'Predicted 1s'],
            yticklabels=['Actual 0s', 'Actual 1s'],
            ax=ax)
ax.set_title('RFC Confusion Matrix', fontdict=font1)
plt.show()

In [None]:
# Model evaluation - sensitivity, specificity and precision of the random forest.
# For binary labels the 2x2 confusion matrix unrolls row by row as TN, FP, FN, TP.
rfc_confusion = confusion_matrix(y_test, rfc.predict(X_test))
TN, FP, FN, TP = rfc_confusion.ravel()

print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)
print("True Positives:", TP)

# sensitivity: share of actual positives that were caught
# specificity: share of actual negatives that were cleared
# precision:   share of predicted positives that were right
actual_positives = TP + FN
actual_negatives = TN + FP
predicted_positives = TP + FP

sensitivity = TP / actual_positives
specificity = TN / actual_negatives
precision = TP / predicted_positives

print("\nSensitivity:", sensitivity)
print("Specificity:", specificity)
print("Precision:", precision)

In [None]:
# Predicted class probabilities for every observation in the test subset.
# predict_proba returns one column per class; predict() picks the class with
# the highest probability, which for two classes is a 0.5 threshold.

testing_probabilities= rfc.predict_proba(X_test)
testing_probabilities

In [None]:
# Wrap the probability array in a DataFrame with one named column per class:
# first the probability of class 0, then the probability of class 1.
testing_probabilities_df = pd.DataFrame(
    data=testing_probabilities,
    columns=['1 - p(x_test)', 'p(x_test)'],
)
testing_probabilities_df.head(n=5)

In [None]:
# Hard class predictions (0 or 1) of the random forest for the test subset.
rfc.predict(X_test)

In [None]:
# Model evaluation - ROC AUC of the random forest on both subsets.
# roc_auc_score needs the score of the positive class only, which is the
# second column of predict_proba.
rfc_train_positive_proba = rfc.predict_proba(X_train)[:, 1]
rfc_test_positive_proba = rfc.predict_proba(X_test)[:, 1]

training_rfc_AUC = roc_auc_score(y_train, rfc_train_positive_proba)
testing_rfc_AUC = roc_auc_score(y_test, rfc_test_positive_proba)

print("Training RFC AUC:", training_rfc_AUC)
print("Testing RFC AUC:", testing_rfc_AUC)

In [None]:
# Build the train and test subsets for the logistic regression without
# leaking test information: hold out a stratified test set from the original
# X / y first, then apply SMOTE ONLY to the training portion. Splitting the
# already-oversampled data instead would place synthetic points interpolated
# from test rows in the training set and inflate the test metrics.
# X_test / y_test keep the real class ratio.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, stratify=y, random_state=3
)
X_train, y_train = SMOTE(
    sampling_strategy='auto', random_state=3, k_neighbors=5
).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Hyperparameter grid for the logistic regression.
params = {
    'penalty': ['l1', 'l2'],  # try both l1 and l2 regularization
    'C': [0.0001, 0.001, 0.01, 10, 50, 100],  # inverse regularization strength
    'class_weight': [None, 'balanced'],
}
# N.B.: the default solver does not support l1 regularization, hence liblinear.
# liblinear shuffles the data internally, so random_state is pinned to keep the
# search reproducible (same seed as the other stochastic steps in this notebook).
lgr = LogisticRegression(solver='liblinear', random_state=3)
# 5-fold cross-validated grid search; by default the best model is refit on
# the full training subset, so `gs` can be used directly for predictions.
gs = GridSearchCV(lgr, params, cv=5)
gs.fit(X_train, y_train)

In [None]:
# Model evaluation - accuracy of the tuned logistic regression.
# GridSearchCV.score delegates to the refit best estimator.
training_lgr_accuracy, testing_lgr_accuracy = (
    gs.score(X_train, y_train),
    gs.score(X_test, y_test),
)

print("Training LGR Accuracy:", training_lgr_accuracy)
print("Testing LGR Accuracy:", testing_lgr_accuracy)

In [None]:
# Plot the confusion matrix of the tuned logistic regression on the test subset.
# confusion_matrix and ConfusionMatrixDisplay come from the library cell.
font1 = {'family': 'serif',
         'fontstyle': 'italic',
         'fontsize': 16,
         'fontweight': 'bold',
         'color': 'DarkRed'}

# The ConfusionMatrixDisplay constructor expects a precomputed matrix; building
# the plot from an estimator plus data goes through from_estimator.
ConfusionMatrixDisplay.from_estimator(gs, X_test, y_test, values_format="d")
plt.title('LGR Confusion Matrix', fontdict=font1)
plt.grid(False)
plt.show()

# Model evaluation - Sensitivity, Specificity and Precision
# Derive the counts from the fitted model instead of pasting numbers from an
# earlier run, so the printout always matches the current fit.
# For binary labels the 2x2 matrix unrolls row by row as TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_test, gs.predict(X_test)).ravel()
print("True Negatives:", TN)
print("False Positives:", FP)
print("False Negatives:", FN)
print("True Positives:", TP)

# Plain divisions yield floats; a literal written with a decimal comma
# (e.g. `0,93`) would silently create a tuple.
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
precision = TP / (TP + FP)
print("\nSensitivity:", sensitivity)
print("Specificity:", specificity)
print("Precision:", precision)


In [None]:
# Model evaluation - ROC AUC of the tuned logistic regression.
# Compute AUC for both the training and the testing subsets from the fitted
# search object rather than hard-coding values from an earlier run.
# Only the positive-class probability is needed: the second column of predict_proba.
training_lgr_AUC = roc_auc_score(y_train, gs.predict_proba(X_train)[:, 1])
testing_lgr_AUC = roc_auc_score(y_test, gs.predict_proba(X_test)[:, 1])

print("Training LGR AUC:", training_lgr_AUC)
print("Testing LGR AUC:", testing_lgr_AUC)

Training LGR AUC: (0, 9915963855793377)
