In [48]:
import pandas as pd
from sklearn.utils import resample
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [5]:
raw_data = pd.read_csv(r"../Downloads/Creditcard_data.csv")

In [19]:
raw_data['Class'].value_counts()

Class
0    763
1      9
Name: count, dtype: int64

In [27]:
majority_class = raw_data[raw_data['Class'] == 0]
minority_classes = raw_data[raw_data['Class'] != 0]

# Upsample the minority classes
minority_upsampled = resample(minority_classes,
                              replace=True, 
                              n_samples=len(majority_class))  

# Combine majority and minority and thus shufle the result
df_upsampled = pd.concat([majority_class, minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1).reset_index(drop=True)

In [31]:
df_upsampled['Class'].value_counts()

Class
0    763
1    763
Name: count, dtype: int64

In [35]:
def calculate_sample_size(population_size, proportion=0.5):
    Z = 1.96
    E = 0.05
    p = proportion
    n = (Z**2 * p * (1 - p)) / (E**2)
    # Adjust for finite population
    n = n / (1 + (n - 1) / population_size)
    return int(np.ceil(n))

In [58]:
# Sampling methods with calculated sample size
def simple_random_sampling(X, y):
    size= calculate_sample_size(len(x))
    population_size=len(X)
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=size/population_size, random_state=42)
    return X_sample, y_sample

def systematic_sampling(X, y):
    size= calculate_sample_size(len(x))
    population_size=len(X)
    step = len(X) // size
    indices = np.arange(len(X))
    systematic_indices = indices[::step][:size]
    return X.iloc[systematic_indices], y.iloc[systematic_indices]

def stratified_sampling(X, y):
    size= calculate_sample_size(len(x))
    population_size=len(X)
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=size/population_size, stratify=y, random_state=42)
    return X_sample, y_sample

def cluster_sampling(X, y,n_clusters=3):
    clusters = np.array_split(X.index, n_clusters)
    chosen_cluster = clusters[np.random.choice(len(clusters))]
    return X.loc[chosen_cluster], y.loc[chosen_cluster]

def bootstrap_sampling(X, y):
    size= calculate_sample_size(len(x))
    population_size=len(X)
    X_resampled, y_resampled = resample(X, y, replace=True, n_samples=size, random_state=42)
    return X_resampled, y_resampled

In [60]:
# Split into features and labels
x = df_upsampled.drop(columns=["Class"])  # Replace 'target' with your label column name
y = df_upsampled["Class"]

samples=create_samples(x,y)

# Models to Evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "XGBoost": XGBClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Apply each sampling method
sampling_methods = {
    "Simple Random": simple_random_sampling,
    "Systematic": systematic_sampling,
    "Stratified": stratified_sampling,
    "Cluster": cluster_sampling,
    "Bootstrap": bootstrap_sampling
}

results = []

for model_name, model in models.items():
    for sampling_name, sampling_func in sampling_methods.items():
        # Apply sampling
        X_sample, y_sample = sampling_func(x, y)
        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Store results
        results.append({"Model": model_name, "Sampling": sampling_name, "Accuracy": accuracy})

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [68]:
pivot_table = results_df.pivot(index="Sampling", columns="Model", values="Accuracy")
print(pivot_table)

Model          K-Nearest Neighbors  Logistic Regression  Random Forest  \
Sampling                                                                 
Bootstrap                 0.935484             0.913978       1.000000   
Cluster                   0.973856             0.888889       1.000000   
Simple Random             0.903226             0.913978       0.989247   
Stratified                0.946237             0.903226       0.978495   
Systematic                0.956989             0.967742       1.000000   

Model          Support Vector Machine   XGBoost  
Sampling                                         
Bootstrap                    0.698925  1.000000  
Cluster                      0.692810  0.993464  
Simple Random                0.698925  0.967742  
Stratified                   0.677419  0.989247  
Systematic                   0.763441  0.978495  
