In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
import numpy as np


credit_card_data = pd.read_csv('Creditcard_data.csv')


features = credit_card_data.drop(columns=["Class"])
labels = credit_card_data["Class"]


smote_resampler = SMOTE(random_state=42)
balanced_features, balanced_labels = smote_resampler.fit_resample(features, labels)


def systematic_sample(features, labels, sample_size):
    step = len(features) // sample_size
    indices = np.arange(0, len(features), step)[:sample_size]
    return features.iloc[indices], labels.iloc[indices]

def cluster_sample(features, labels, sample_size, n_clusters=10):
    cluster_size = len(features) // n_clusters
    clusters = [features.iloc[i:i + cluster_size] for i in range(0, len(features), cluster_size)]
    label_clusters = [labels.iloc[i:i + cluster_size] for i in range(0, len(labels), cluster_size)]
    selected_clusters = np.random.choice(len(clusters), size=n_clusters // 2, replace=False)
    sampled_features = pd.concat([clusters[i] for i in selected_clusters])
    sampled_labels = pd.concat([label_clusters[i] for i in selected_clusters])

    return sampled_features.iloc[:sample_size], sampled_labels.iloc[:sample_size]

# Define sample sizes
sample_sizes = [500, 1000, 1500, 2000, 2500]

# Generate samples using different methods
sampling_methods = {
    "Systematic Sample 500": systematic_sample(balanced_features, balanced_labels, sample_sizes[0]),
    "Cluster Sample 1000": cluster_sample(balanced_features, balanced_labels, sample_sizes[1]),
    "Systematic Sample 1500": systematic_sample(balanced_features, balanced_labels, sample_sizes[2]),
    "Cluster Sample 2000": cluster_sample(balanced_features, balanced_labels, sample_sizes[3]),
    "Cluster Sample 2500": cluster_sample(balanced_features, balanced_labels, sample_sizes[4]),
}

#Define Classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gradient Boosting": GradientBoostingClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

results = pd.DataFrame()

for sample_label, (X_sample, y_sample) in sampling_methods.items():
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    sample_results = {}
    for model_label, model in classifiers.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        acc = accuracy_score(y_test, predictions)
        sample_results[model_label] = acc

    sample_results["Sampling Applied"] = sample_label
    results = pd.concat([results, pd.DataFrame([sample_results])], ignore_index=True)

formatted_results = results.set_index("Sampling Applied").T
formatted_results.index = formatted_results.index.str.replace("_", " ")
formatted_results = formatted_results.reset_index().rename(columns={"index": "Model"})

formatted_results.to_csv('sampling_model_results_formatted.csv', index=False)
print("Formatted results saved to 'sampling_model_results_formatted.csv'")
best_sampling_per_model = formatted_results.set_index("Model").idxmax(axis=1)
print("Best Sampling Technique for Each Model:")
print(best_sampling_per_model)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Formatted results saved to 'sampling_model_results_formatted.csv'
Best Sampling Technique for Each Model:
Model
Logistic Regression    Cluster Sample 2500
Gradient Boosting      Cluster Sample 2500
K-Nearest Neighbors    Cluster Sample 2000
Decision Tree          Cluster Sample 2000
Naive Bayes            Cluster Sample 1000
dtype: object
