In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load the dataset
data_url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(data_url)

# Separate features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 1: Apply 5 different sampling techniques
# Sampling 1: Downsample the majority class
class_0 = data[data['Class'] == 0]
class_1 = data[data['Class'] == 1]
class_0_downsampled = class_0.sample(len(class_1), random_state=42)
data_downsampled = pd.concat([class_0_downsampled, class_1])

# Sampling 2: Upsample the minority class
class_1_upsampled = class_1.sample(len(class_0), replace=True, random_state=42)
data_upsampled = pd.concat([class_0, class_1_upsampled])

# Sampling 3: Mixed sampling (downsample majority, upsample minority)
data_mixed = pd.concat([
    class_0.sample(len(class_1), random_state=42),
    class_1.sample(len(class_0), replace=True, random_state=42)
])

# Sampling 4: Random sampling (subset of data for quick testing)
data_random = data.sample(frac=0.5, random_state=42)

# Sampling 5: Original data (no balancing)
data_original = data

sampling_methods = {
    "Sampling1": data_downsampled,
    "Sampling2": data_upsampled,
    "Sampling3": data_mixed,
    "Sampling4": data_random,
    "Sampling5": data_original
}

# Models to test
models = {
    "M1": LogisticRegression(max_iter=2000, random_state=42),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": SVC(kernel='linear', random_state=42),
    "M5": SVC(kernel='rbf', random_state=42)
}

# Store results
results = []

# Step 2: Train and evaluate models for each sampling technique
for sampling_name, sampled_data in sampling_methods.items():
    X_sampled = sampled_data.drop('Class', axis=1)
    y_sampled = sampled_data['Class']

    # Standardize the sampled data
    X_sampled_scaled = scaler.fit_transform(X_sampled)

    X_train, X_test, y_train, y_test = train_test_split(
        X_sampled_scaled, y_sampled, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate accuracy
        acc = accuracy_score(y_test, y_pred)

        # Store the results
        results.append({"Sampling": sampling_name, "Model": model_name, "Accuracy": acc})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Step 3: Display the results
print(results_df)

# Step 4: Find the best model for each sampling method
best_results = results_df.loc[results_df.groupby(["Sampling"])['Accuracy'].idxmax()]
print("\nBest Results for Each Sampling Method:")
print(best_results)


     Sampling Model  Accuracy
0   Sampling1    M1  0.333333
1   Sampling1    M2  0.666667
2   Sampling1    M3  0.333333
3   Sampling1    M4  0.666667
4   Sampling1    M5  0.166667
5   Sampling2    M1  0.908297
6   Sampling2    M2  0.995633
7   Sampling2    M3  1.000000
8   Sampling2    M4  0.914847
9   Sampling2    M5  0.975983
10  Sampling3    M1  0.987069
11  Sampling3    M2  0.987069
12  Sampling3    M3  0.987069
13  Sampling3    M4  0.995690
14  Sampling3    M5  0.987069
15  Sampling4    M1  0.965517
16  Sampling4    M2  0.948276
17  Sampling4    M3  0.965517
18  Sampling4    M4  0.965517
19  Sampling4    M5  0.965517
20  Sampling5    M1  0.982759
21  Sampling5    M2  0.969828
22  Sampling5    M3  0.987069
23  Sampling5    M4  0.982759
24  Sampling5    M5  0.987069

Best Results for Each Sampling Method:
     Sampling Model  Accuracy
1   Sampling1    M2  0.666667
7   Sampling2    M3  1.000000
13  Sampling3    M4  0.995690
15  Sampling4    M1  0.965517
22  Sampling5    M3  0.987069
