In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import numpy as np

In [2]:
credit_card_data = pd.read_csv('Creditcard_data.csv')


features = credit_card_data.drop(columns=["Class"])
labels = credit_card_data["Class"]


smote_resampler = SMOTE(random_state=42)
balanced_features, balanced_labels = smote_resampler.fit_resample(features, labels)

In [3]:
def random_sample(features, labels, sample_size):
    return resample(features, labels, n_samples=sample_size, random_state=42)

def stratified_sample(features, labels, sample_size):
    from sklearn.model_selection import StratifiedShuffleSplit
    strat_split = StratifiedShuffleSplit(n_splits=1, test_size=sample_size / len(labels), random_state=42)
    for train_idx, _ in strat_split.split(features, labels):
        return features.iloc[train_idx], labels.iloc[train_idx]

In [4]:
sample_sizes = [int(len(balanced_features) * 0.1 * i) for i in range(1, 6)]


sampling_methods = {
    "Sample1": random_sample(balanced_features, balanced_labels, sample_sizes[0]),
    "Sample2": random_sample(balanced_features, balanced_labels, sample_sizes[1]),
    "Sample3": stratified_sample(balanced_features, balanced_labels, sample_sizes[2]),
    "Sample4": random_sample(balanced_features, balanced_labels, sample_sizes[3]),
    "Sample5": stratified_sample(balanced_features, balanced_labels, sample_sizes[4]),
}

In [5]:
classifiers = {
    "LogReg": LogisticRegression(),
    "RandForest": RandomForestClassifier(),
    "DecTree": DecisionTreeClassifier(),
    "NaiveBayes": GaussianNB(),
    "SVM": SVC()
}

In [6]:
evaluation_results = pd.DataFrame(columns=["Sample", "Classifier", "Accuracy"])


for sample_label, (X_sample, y_sample) in sampling_methods.items():

    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    for model_label, model in classifiers.items():

        model.fit(X_train, y_train)


        predictions = model.predict(X_test)
        acc = accuracy_score(y_test, predictions)


        evaluation_results = pd.concat([
            evaluation_results,
            pd.DataFrame([{"Sample": sample_label, "Classifier": model_label, "Accuracy": acc}])
        ], ignore_index=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  evaluation_results = pd.concat([
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the docum

In [7]:
evaluation_results.to_csv('sampling_model_results.csv', index=False)


print("Results saved to 'sampling_model_results.csv'")

Results saved to 'sampling_model_results.csv'
