In [4]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# ---------------------------------------------------------
# 1. LOAD DATASET
# ---------------------------------------------------------
# Using the raw URL to download directly into pandas
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
try:
    data = pd.read_csv(url)
    print("Dataset loaded successfully.")
except:
    print("Error loading data. Please ensure the URL is correct or upload the CSV manually.")

# ---------------------------------------------------------
# 2. BALANCE THE DATASET
# ---------------------------------------------------------
# The dataset is highly imbalanced. We need to balance it as per instructions.
# We will undersample the majority class (0) to match the minority class (1).

class_0 = data[data['Class'] == 0]
class_1 = data[data['Class'] == 1]

# Undersample Class 0
class_0_balanced = class_0.sample(n=len(class_1), random_state=42)

# Concatenate to create balanced dataset
balanced_df = pd.concat([class_0_balanced, class_1]).reset_index(drop=True)

print(f"Original Dataset Size: {len(data)}")
print(f"Balanced Dataset Size: {len(balanced_df)}")
print(f"Class Distribution in Balanced Data:\n{balanced_df['Class'].value_counts()}")
print("-" * 50)

# ---------------------------------------------------------
# 3. DEFINE SAMPLING TECHNIQUES
# ---------------------------------------------------------
# We need 5 different sampling techniques.
# NOTE: Formulas for sample size (n) can vary. We will use the Slovin's formula
# or standard heuristic sizes for demonstration to ensure code runs easily.

# Formula for sample size (Slovin's): n = N / (1 + N*e^2)
# Calculating a generic sample size 'n' to use for the samples
N = len(balanced_df)
e = 0.05 # Margin of error
n_slovin = int(N / (1 + N * (e**2)))

def sampling_1_simple_random(df, n):
    """Simple Random Sampling"""
    return df.sample(n=n, random_state=42)

def sampling_2_systematic(df, n):
    """Systematic Sampling"""
    step = len(df) // n
    indices = np.arange(0, len(df), step)[:n]
    return df.iloc[indices]

def sampling_3_stratified(df, n):
    """Stratified Sampling (maintaining class ratio)"""
    # Ensure n is even for splitting between two classes
    n_per_class = int(n/2) if n % 2 == 0 else int(n/2) + (n % 2)
    return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n_per_class, random_state=42) if len(x) >= n_per_class else x.sample(len(x), random_state=42))

def sampling_4_cluster(df, n):
    """Cluster Sampling"""
    # Create fake clusters for demonstration
    df_temp = df.copy()
    num_clusters = 5
    df_temp['cluster'] = np.random.randint(0, num_clusters, size=len(df))

    # Select random clusters until we reach rough sample size
    # To ensure we get a sample, we might select more than one cluster if the first is too small
    sample_size_achieved = 0
    selected_clusters = pd.DataFrame()
    cluster_ids = np.random.permutation(num_clusters) # Randomize cluster selection order

    for cluster_id in cluster_ids:
        current_cluster_sample = df_temp[df_temp['cluster'] == cluster_id]
        selected_clusters = pd.concat([selected_clusters, current_cluster_sample])
        sample_size_achieved = len(selected_clusters)
        if sample_size_achieved >= n:
            break

    # If after selecting all clusters, still less than n, just return all selected
    if len(selected_clusters) == 0:
        # Fallback if no clusters were large enough to contribute a sample (very unlikely)
        return df.sample(n=n, random_state=42) # Revert to simple random if cluster fails

    return selected_clusters.head(n).drop(columns=['cluster'])

def sampling_5_bootstrap(df, n):
    """Bootstrap Sampling (Sampling with replacement)"""
    return df.sample(n=n, replace=True, random_state=42)

# Dictionary of sampling functions
sampling_techniques = {
    "Simple Random": lambda df: sampling_1_simple_random(df, n_slovin),
    "Systematic": lambda df: sampling_2_systematic(df, n_slovin),
    "Stratified": lambda df: sampling_3_stratified(df, n_slovin),
    "Cluster": lambda df: sampling_4_cluster(df, n_slovin),
    "Bootstrap": lambda df: sampling_5_bootstrap(df, n_slovin)
}

# ---------------------------------------------------------
# 4. DEFINE MODELS
# ---------------------------------------------------------
models = {
    "M1 (Logistic Regression)": LogisticRegression(max_iter=1000),
    "M2 (Decision Tree)": DecisionTreeClassifier(random_state=42),
    "M3 (Random Forest)": RandomForestClassifier(random_state=42),
    "M4 (SVM)": SVC(probability=True, random_state=42), # Added probability=True and random_state for consistency
    "M5 (KNN)": KNeighborsClassifier()
}

# ---------------------------------------------------------
# 5. EXECUTE AND EVALUATE
# ---------------------------------------------------------
results = {}

print("Starting Model Evaluation...")

for method_name, sampler in sampling_techniques.items():
    print(f"Running {method_name} Sampling...")

    # Generate Sample
    sample_df = sampler(balanced_df)

    # Handle cases where sample_df might be empty or too small for train_test_split
    if len(sample_df) < 2: # Need at least 2 samples for split
        print(f"Skipping {method_name} due to insufficient sample size ({len(sample_df)}).")
        method_accuracies = {model_name: np.nan for model_name in models.keys()}
        results[method_name] = method_accuracies
        continue

    # Split features and target
    X = sample_df.drop('Class', axis=1)
    y = sample_df['Class']

    # Ensure there are enough unique classes in y for stratification if used by train_test_split
    if len(y.unique()) < 2: # Need at least two classes for classification
        print(f"Skipping {method_name} due to insufficient classes in sample.")
        method_accuracies = {model_name: np.nan for model_name in models.keys()}
        results[method_name] = method_accuracies
        continue

    # Train/Test Split
    # Adjust test_size if sample is too small to ensure both train/test sets are not empty
    test_size_val = 0.2
    if len(sample_df) * test_size_val < 1: # If test set would be less than 1, adjust
        test_size_val = 0.5 if len(sample_df) > 1 else 0.0

    # Use stratify=y if possible to maintain class distribution
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_val, random_state=42, stratify=y)
    except ValueError as e:
        print(f"Warning: Could not stratify split for {method_name}: {e}. Trying without stratification.")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_val, random_state=42)

    # Store accuracy for this sampling method
    method_accuracies = {}

    for model_name, model in models.items():
        try:
            # Ensure train and test sets are not empty before fitting and predicting
            if len(X_train) == 0 or len(X_test) == 0:
                method_accuracies[model_name] = np.nan
                print(f"Warning: Not enough data for training/testing with {method_name} and {model_name}.")
                continue

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            method_accuracies[model_name] = acc
        except Exception as e:
            method_accuracies[model_name] = np.nan # Store as NaN instead of string

    results[method_name] = method_accuracies

# ---------------------------------------------------------
# 6. OUTPUT RESULTS
# ---------------------------------------------------------
# Convert results to DataFrame for the final table
final_df = pd.DataFrame(results)

print("\n" + "="*50)
print("FINAL ACCURACY TABLE")
print("="*50)
print(final_df)

# Determine which technique gave the highest accuracy for each model
print("\n" + "="*50)
print("BEST SAMPLING TECHNIQUE PER MODEL")
print("="*50)
for model_name in models.keys(): # Renamed 'model' to 'model_name' to avoid conflict with the model object
    # Convert the row to numeric, coercing errors to NaN
    model_results = pd.to_numeric(final_df.loc[model_name], errors='coerce')

    if not model_results.isnull().all(): # Check if there are any valid numeric results
        best_technique = model_results.idxmax()
        best_acc = model_results.max()
        print(f"{model_name}: Highest Accuracy ({best_acc:.4f}) with {best_technique}")
    else:
        print(f"{model_name}: No valid accuracy results to determine best technique.")

print("\nExecution Complete.")

Dataset loaded successfully.
Original Dataset Size: 772
Balanced Dataset Size: 18
Class Distribution in Balanced Data:
Class
0    9
1    9
Name: count, dtype: int64
--------------------------------------------------
Starting Model Evaluation...
Running Simple Random Sampling...
Running Systematic Sampling...
Running Stratified Sampling...


  return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n_per_class, random_state=42) if len(x) >= n_per_class else x.sample(len(x), random_state=42))


Running Cluster Sampling...
Running Bootstrap Sampling...

FINAL ACCURACY TABLE
                          Simple Random  Systematic  Stratified  Cluster  \
M1 (Logistic Regression)           0.75        0.50        0.50     0.50   
M2 (Decision Tree)                 0.50        1.00        0.75     0.25   
M3 (Random Forest)                 0.50        0.75        0.50     0.25   
M4 (SVM)                           0.75        0.50        0.75     0.50   
M5 (KNN)                           0.75        0.50        0.25     0.50   

                          Bootstrap  
M1 (Logistic Regression)        1.0  
M2 (Decision Tree)              1.0  
M3 (Random Forest)              1.0  
M4 (SVM)                        0.5  
M5 (KNN)                        0.5  

BEST SAMPLING TECHNIQUE PER MODEL
M1 (Logistic Regression): Highest Accuracy (1.0000) with Bootstrap
M2 (Decision Tree): Highest Accuracy (1.0000) with Systematic
M3 (Random Forest): Highest Accuracy (1.0000) with Bootstrap
M4 (SVM): 