<a href="https://colab.research.google.com/github/thomaschen01/NTCU-Machine-Learning/blob/main/ACS111106_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from google.colab import drive

In [9]:
# Mount Google Drive so the CSV stored there can be read from this runtime.
drive.mount('/content/drive')
path = "/content/drive/MyDrive/creditcard.csv"
# Read the file and cast the 'Class' column to int in one chained expression.
data = pd.read_csv(path).assign(Class=lambda frame: frame['Class'].astype(int))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Drop the 'Time' column. errors='ignore' keeps this cell re-runnable: a second
# execution no longer raises KeyError because 'Time' is already gone.
data = data.drop(columns=['Time'], errors='ignore')

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

print(f'Fraudulent: {len(fraud)}, non-fraudulent: {len(nonfraud)}')
print(f'The positive class (frauds) percentage: {len(fraud)/(len(data))*100:.3f}%')

feature_frame = data.drop(columns=['Class'])
X = feature_frame.values
Y = data['Class'].values

TEST_SIZE = 0.3
RANDOM_SEED = 42

# Stratify on Y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)

# Standardize 'Amount' AFTER splitting, fitting the scaler on the training rows
# only. Fitting on the full dataset would leak test-set statistics (mean/std)
# into the training features.
amount_idx = feature_frame.columns.get_loc('Amount')
amount_scaler = StandardScaler().fit(X_train[:, [amount_idx]])
X_train[:, amount_idx] = amount_scaler.transform(X_train[:, [amount_idx]]).ravel()
X_test[:, amount_idx] = amount_scaler.transform(X_test[:, [amount_idx]]).ravel()

Fraudulent: 492, non-fraudulent: 284315
The positive class (frauds) percentage: 0.173%


In [11]:
# Hyper-parameters of the supervised baseline, gathered in one place.
rf_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'class_weight': 'balanced',
    'random_state': RANDOM_SEED,
}
# fit() returns the fitted estimator, so construction and training can be chained.
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Label used in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    # zero_division=0 keeps the score at 0.0 when a class is never predicted
    # but silences the UndefinedMetricWarning scikit-learn would otherwise
    # emit once per metric.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Score the held-out split with the fitted forest and print its metrics.
y_pred = rf_model.predict(X_test)
evaluation(y_true=y_test, y_pred=y_pred, model_name="Random Forest (Supervised)")



Random Forest (Supervised) Evaluation:
         Accuracy: 0.9992743700478681
  Precision Score: 0.7986111111111112
     Recall Score: 0.777027027027027
         F1 Score: 0.7876712328767124

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.80      0.78      0.79       148

    accuracy                           1.00     85443
   macro avg       0.90      0.89      0.89     85443
weighted avg       1.00      1.00      1.00     85443



In [13]:
# Standardize all features using statistics learned from the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

# Cluster on (at most) the first N_NORMAL_SAMPLES non-fraud training rows.
# NOTE(review): because the clusters are fit on class-0 rows only, the
# majority-vote label alignment applied afterwards tends to map every cluster
# to class 0 (the recorded run shows precision/recall of 0.0) -- consider a
# distance-to-centroid anomaly score instead.
N_NORMAL_SAMPLES = 3000
n_x_train = x_train[y_train == 0][:N_NORMAL_SAMPLES]

# Candidate cluster counts; silhouette score needs at least 2 clusters.
K_CANDIDATES = range(2, 10)

scores = []
fitted_models = []
for k in K_CANDIDATES:
    candidate = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
    candidate.fit(n_x_train)
    scores.append(silhouette_score(n_x_train, candidate.labels_))
    fitted_models.append(candidate)

# Pick the first k with the highest silhouette score and reuse the model that
# was already fitted for it: refitting with the same data and seed would only
# repeat the work. Indexing K_CANDIDATES avoids a hard-coded "+ 2" offset.
best_idx = int(np.argmax(scores))
optimal_k = K_CANDIDATES[best_idx]
kmeans = fitted_models[best_idx]
y_pred_test = kmeans.predict(x_test)

def align_labels(y_true, y_pred, n_clusters):
    """Map each cluster id to the most frequent true label among its members.

    Args:
        y_true: Non-negative integer ground-truth labels (array-like).
        y_pred: Cluster ids, aligned element-wise with ``y_true`` (array-like).
        n_clusters: Number of cluster ids to consider (``0 .. n_clusters - 1``).

    Returns:
        numpy.ndarray with the same shape and dtype as ``y_pred``, where every
        element is replaced by the majority true label of its cluster. Ids
        outside ``range(n_clusters)`` are left at 0.
    """
    # Coerce to arrays: with plain lists `y_pred == i` collapses to a single
    # bool, so no cluster would match and every label would silently become 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = y_pred == cluster_id
        # Empty clusters need no work: their (nonexistent) members stay 0.
        if mask.any():
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Translate cluster ids into class labels, then report the same metrics as above.
y_pred_aligned = align_labels(y_true=y_test, y_pred=y_pred_test, n_clusters=optimal_k)
evaluation(y_test, y_pred_aligned, "KMeans (Unsupervised)")


KMeans (Unsupervised) Evaluation:
         Accuracy: 0.9982678510820079
  Precision Score: 0.0
     Recall Score: 0.0
         F1 Score: 0.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.00      0.00      0.00       148

    accuracy                           1.00     85443
   macro avg       0.50      0.50      0.50     85443
weighted avg       1.00      1.00      1.00     85443



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
