<a href="https://www.kaggle.com/code/gmgoes/notebook20bd4f0fb2-creditcardfraud?scriptVersionId=136131694" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Libraries
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from collections import Counter
from statistics import mean;
import pandas as pd
import numpy as np
import random
import os
# Sanity marker: only printed when every import above has succeeded.
print('OK');

In [None]:
# Load the credit-card dataset and split it into a feature matrix and a label vector.
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv', sep=',')

# X: every column except 'Class', as a NumPy array.
X = df.drop(columns='Class').to_numpy()
# y: the 'Class' column, as a NumPy array.
y = df['Class'].to_numpy()

print('OK')

### TODO:
   - Do tests with OverSampler
        - 'minority': resample only the minority class; ✅
        - 'not minority': resample all classes but the minority class; ✅
        - 'not majority': resample all classes but the majority class; ✅
        - 'all': resample all classes; ✅
        - 'auto': equivalent to 'not majority'. ✅
   - Do tests with UnderSampler
        - 'majority': resample only the majority class; ✅
        - 'not minority': resample all classes but the minority class; ✅
        - 'not majority': resample all classes but the majority class; ✅
        - 'all': resample all classes; ✅
        - 'auto': equivalent to 'not minority'. ✅
        
   - Do tests with SMOTE
        - 'minority': resample only the minority class; ✅
        - 'not minority': resample all classes but the minority class; ✅
        - 'not majority': resample all classes but the majority class; ✅
        - 'all': resample all classes; ✅
        - 'auto': equivalent to 'not majority'. ✅

In [None]:
# RandomOverSampler - grows the data set so the classes end up balanced.
# Swap the active line for one of the commented alternatives to try another
# sampling strategy.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
# ros = RandomOverSampler(sampling_strategy='not minority', random_state=42)
# ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
# ros = RandomOverSampler(sampling_strategy='all', random_state=42)
X_over, y_over = ros.fit_resample(X, y)

# Per-class counts before and after resampling, sorted by class label.
freq_class = sorted(Counter(y).items())
freq_classO = sorted(Counter(y_over).items())
print(freq_class)
print(freq_classO)

In [None]:
# RandomUnderSampler - shrinks the data set so the classes end up balanced.
# Swap the active line for one of the commented alternatives to try another
# sampling strategy.
# rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
# rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
# rus = RandomUnderSampler(sampling_strategy='not majority', random_state=42)
rus = RandomUnderSampler(sampling_strategy='all', random_state=42)
X_under, y_under = rus.fit_resample(X, y)

# Per-class counts before and after resampling, sorted by class label.
freq_class = sorted(Counter(y).items())
freq_classU = sorted(Counter(y_under).items())
print(freq_class)
print(freq_classU)

In [None]:
# SMOTE - Synthetic Minority Over-sampling Technique.
# Swap the active line for one of the commented alternatives to try another
# sampling strategy.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# sm = SMOTE(sampling_strategy='not minority', random_state=42)
# sm = SMOTE(sampling_strategy='not majority', random_state=42)
# sm = SMOTE(sampling_strategy='all', random_state=42)
X_smote, y_smote = sm.fit_resample(X, y)

# Per-class counts before and after resampling, sorted by class label.
freq_class = sorted(Counter(y).items())
freq_classS = sorted(Counter(y_smote).items())
print(freq_class)
print(freq_classS)

In [None]:
# Scatter the first two feature columns of the SMOTE-resampled data:
# class 0 as red dots, class 1 as blue dots.
plt.figure(figsize=(40, 40))
for class_label, marker in ((0, '.r'), (1, '.b')):
    class_mask = y_smote == class_label
    plt.plot(X_smote[class_mask, 0], X_smote[class_mask, 1], marker)

# To inspect the other resamplers instead, plot these in place of the loop above:
# plt.plot(X_over[y_over == 0, 0], X_over[y_over == 0, 1], '.r')
# plt.plot(X_over[y_over == 1, 0], X_over[y_over == 1, 1], '.b')
# plt.plot(X_under[y_under == 0, 0], X_under[y_under == 0, 1], '.r')
# plt.plot(X_under[y_under == 1, 0], X_under[y_under == 1, 1], '.b')

# Overlay the class-1 rows of the original (un-resampled) data as yellow diamonds.
original_class1 = y == 1
plt.plot(X[original_class1, 0], X[original_class1, 1], 'dy')
plt.show()

In [None]:
def aleatory_generator(n_samples=500, data=None, target=None):
    """Draw random rows (with replacement) from a balanced data set.

    Args:
        n_samples: Number of rows to draw. Defaults to 500.
        data: Indexable collection of feature rows. Defaults to the
            notebook-level ``X_smote``.
        target: Indexable collection of labels aligned with ``data``.
            Defaults to the notebook-level ``y_smote``.

    Returns:
        A tuple ``(aleatory_data, aleatory_target)`` of two lists of length
        ``n_samples``; entry ``i`` of each list comes from the same position.

    Raises:
        ValueError: If ``data`` is empty (raised by ``random.randrange``).

    Note:
        The draw uses the global ``random`` state, so results differ between
        runs unless ``random.seed`` is called beforehand.
    """
    # Resolve the notebook globals lazily so the function can also be used
    # (and tested) with explicitly supplied data.
    if data is None:
        data = X_smote
    if target is None:
        target = y_smote

    aleatory_data = []
    aleatory_target = []
    for _ in range(n_samples):
        # randrange(n) yields 0..n-1. The previous randint(1, n) is inclusive
        # on both ends, so it could return n (IndexError) and never returned 0.
        aleatory_position = random.randrange(len(data))
        aleatory_data.append(data[aleatory_position])
        aleatory_target.append(target[aleatory_position])
    return aleatory_data, aleatory_target

# Draw the random sample once; the KNN cell scores the classifier against it.
aleatoryd, aleatoryt = aleatory_generator()
print('OK')

In [None]:
# KNN - classifier implementing the k-nearest-neighbours vote (k = 3 here).
neigh = KNeighborsClassifier(n_neighbors=3)

# Fit on exactly one of the resampled data sets.
# neigh.fit(X_over, y_over)    # with RandomOverSampler
# neigh.fit(X_under, y_under)  # with RandomUnderSampler
neigh.fit(X_smote, y_smote)    # with SMOTE

# Optional inspection of the individual predictions:
# print(neigh.predict(aleatoryd))        # predicted class for each sample
# print(neigh.predict_proba(aleatoryd))  # class probabilities for each sample

# NOTE(review): aleatoryd/aleatoryt are drawn from X_smote/y_smote, i.e. from
# the data the model was just fitted on, so this score is not a held-out
# estimate. Consider a train/test split before resampling.
accuracy = neigh.score(aleatoryd, aleatoryt)
print(f"{accuracy:.1%}")

## Results of tests with each type of balancing method (KNN accuracy, k = 3, on 500 randomly drawn samples):
    - RandomOverSampler
        - Minority = 70.0%
        - Not Minority = 61.2%
        - Not Majority = 70.0%
        - All = 70.0%
        
    - RandomUnderSampler
        - Majority = 71.4%
        - Not Minority = 71.4%
        - Not Majority = 61.2%
        - All = 71.4%
        
    - SMOTE
        - Minority = 98.4%
        - Not Minority = 61.2%
        - Not Majority = 98.4%
        - All = 98.4%

References: <br>
https://www.kaggle.com/code/rajkumarpandey02/ml-credit-card-fraud-detection <br>
https://imbalanced-learn.org/stable/index.html <br>
https://scikit-learn.org/stable/ <br>
https://builtin.com/data-science/train-test-split <br>
https://www.youtube.com/watch?v=kZNkaNATmd8 <br>
https://didatica.tech/o-que-e-e-como-funciona-o-algoritmo-knn/