In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load dataset
data = pd.read_csv('Creditcard_data.csv')

# Separate features and target variable
X = data.drop(columns=['Class', 'Time'], errors='ignore')
y = data['Class']

# Parameters for sample size formula
Z = 1.96  # Z-score for 95% confidence
p = 0.5   # Assumed proportion
e = 0.05  # Margin of error
S = 2     # Number of strata
C = 10    # Average size of clusters

N = len(X)  # Total population size

# Calculate sample sizes
simple_random_sample_size = int((Z**2 * p * (1 - p)) / e**2)
stratified_sample_size = int((Z**2 * p * (1 - p)) / (e / S)**2)
cluster_sample_size = int((Z**2 * p * (1 - p)) / (e / C)**2)

print(f"Simple Random Sample Size: {simple_random_sample_size}")
print(f"Stratified Sample Size: {stratified_sample_size}")
print(f"Cluster Sample Size: {cluster_sample_size}")

# 1. Simple Random Sampling
simple_random_sample = data.sample(n=simple_random_sample_size, random_state=42)

# 2. Stratified Sampling
def stratified_sampling(data, stratified_sample_size):
    stratified_sample = data.groupby('Class', group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), stratified_sample_size // S), random_state=42))
    return stratified_sample

stratified_sample = stratified_sampling(data, stratified_sample_size)

# 3. Cluster Sampling
def cluster_sampling(data, num_clusters, num_selected_clusters):
    data['Cluster'] = np.random.randint(0, num_clusters, size=len(data))
    selected_clusters = np.random.choice(range(num_clusters), size=num_selected_clusters, replace=False)
    cluster_sample = data[data['Cluster'].isin(selected_clusters)].drop(columns=['Cluster'], errors='ignore')
    return cluster_sample

num_clusters = 10
num_selected_clusters = 3
cluster_sample = cluster_sampling(data, num_clusters, num_selected_clusters)

# 4. Systematic Sampling
def systematic_sampling(data, sample_size):
    step = len(data) // sample_size
    indices = np.arange(0, len(data), step)
    return data.iloc[indices[:sample_size]]

systematic_sample = systematic_sampling(data, simple_random_sample_size)

# 5. Bootstrap Sampling
bootstrap_sample = data.sample(n=N, replace=True, random_state=42)

# 6. SMOTE Sampling (Synthetic Oversampling)
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
smote_sample = pd.concat([pd.DataFrame(X_smote), pd.Series(y_smote, name='Class')], axis=1)

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}

# Store sampling techniques
samples = {
    "Simple Random": simple_random_sample,
    "Stratified": stratified_sample,
    "Cluster": cluster_sample,
    "Systematic": systematic_sample,
    "Bootstrap": bootstrap_sample,
    "SMOTE": smote_sample,
}

# Store results
results = []

for sample_name, sample_data in samples.items():
    X_sample = sample_data.drop(columns=['Class'], errors='ignore')
    y_sample = sample_data['Class']
    if len(y_sample.unique()) < 2:  # Ensure at least two classes are present
        print(f"Skipping {sample_name} due to single class issue.")
        continue
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
    )
    for model_name, model in models.items():
        # Check for at least two classes in training and testing sets
        if len(y_train.unique()) > 1 and len(y_test.unique()) > 1:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            results.append({"Sampling": sample_name, "Model": model_name, "Accuracy": acc})
        else:
            results.append({"Sampling": sample_name, "Model": model_name, "Accuracy": "Single class issue"})

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)

# Pivot the DataFrame to organize results by sampling techniques and models
pivot_results_df = results_df.pivot(index="Model", columns="Sampling", values="Accuracy")
pivot_results_df.reset_index(inplace=True)

# Display the resulting DataFrame
#import ace_tools as tools; tools.display_dataframe_to_user(name="Sampling Techniques and Model Accuracy", dataframe=pivot_results_df)

display(pivot_results_df)

# Find the best sampling technique for each model
def find_best_sampling(results_df):
    best_results = {}
    for model in results_df['Model']:
        # For each model, find the sampling technique with the highest accuracy
        model_results = results_df[results_df['Model'] == model]
        best_sampling = model_results.iloc[:, 1:].idxmax(axis=1).values[0]  # Column with the highest value
        best_accuracy = model_results.iloc[:, 1:].max(axis=1).values[0]     # Highest accuracy value
        best_results[model] = (best_sampling, best_accuracy)
    return best_results

# Analyze and find the best sampling techniques
best_sampling_results = find_best_sampling(pivot_results_df)

# Display results
print("Best Sampling Technique for Each Model:")
for model, (sampling, accuracy) in best_sampling_results.items():
    print(f"Model: {model}, Best Sampling: {sampling}")


Simple Random Sample Size: 384
Stratified Sample Size: 1536
Cluster Sample Size: 38415


  stratified_sample = data.groupby('Class', group_keys=False).apply(


Sampling,Model,Bootstrap,Cluster,SMOTE,Simple Random,Stratified,Systematic
0,KNN,0.980645,0.979167,0.915033,0.987013,0.987097,0.987013
1,Logistic Regression,0.987097,0.979167,0.921569,0.987013,0.987097,0.987013
2,Naive Bayes,0.929032,0.9375,0.728758,0.831169,0.96129,0.987013
3,Random Forest,1.0,0.979167,0.990196,0.987013,0.987097,0.987013
4,SVM,0.987097,0.979167,0.722222,0.987013,0.987097,0.987013


Best Sampling Technique for Each Model:
Model: KNN, Best Sampling: Stratified
Model: Logistic Regression, Best Sampling: Bootstrap
Model: Naive Bayes, Best Sampling: Systematic
Model: Random Forest, Best Sampling: Bootstrap
Model: SVM, Best Sampling: Bootstrap
