In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
!pip install -q imbalanced-learn

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
import numpy as np

# Step 1: Load dataset
def load_dataset():
    url = "https://github.com/AnjulaMehto/Sampling_Assignment/raw/main/Creditcard_data.csv"
    data = pd.read_csv(url)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return X, y

# Step 2: Apply sampling techniques
def apply_sampling_techniques(X, y):
    techniques = {}

    # Random OverSampling
    ros = RandomOverSampler()
    X_ros, y_ros = ros.fit_resample(X, y)
    techniques["RandomOverSampler"] = (X_ros, y_ros)

    # Random UnderSampling
    rus = RandomUnderSampler()
    X_rus, y_rus = rus.fit_resample(X, y)
    techniques["RandomUnderSampler"] = (X_rus, y_rus)

    # Custom Proportional Sampling (e.g., 60% of minority, 40% of majority)
    minority_class = y.value_counts().idxmin()
    majority_class = y.value_counts().idxmax()

    minority_indices = y[y == minority_class].index
    majority_indices = y[y == majority_class].index

    sampled_minority = np.random.choice(minority_indices, size=int(0.6 * len(minority_indices)), replace=False)
    sampled_majority = np.random.choice(majority_indices, size=int(0.4 * len(majority_indices)), replace=False)

    sampled_indices = np.concatenate([sampled_minority, sampled_majority])
    X_custom = X.loc[sampled_indices]
    y_custom = y.loc[sampled_indices]
    techniques["ProportionalSampling"] = (X_custom, y_custom)

    # Stratified Sampling
    stratified_sample_indices = np.random.choice(X.index, size=int(0.5 * len(X)), replace=False)
    X_stratified = X.loc[stratified_sample_indices]
    y_stratified = y.loc[stratified_sample_indices]
    techniques["StratifiedSampling"] = (X_stratified, y_stratified)

    # NearMiss (Under-sampling technique)
    nm = NearMiss()
    X_nm, y_nm = nm.fit_resample(X, y)
    techniques["NearMiss"] = (X_nm, y_nm)

    return techniques

# Step 3: Define and evaluate models
def evaluate_models(techniques):
    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "SVM": SVC()
    }

    results = {}
    for technique_name, (X_sample, y_sample) in techniques.items():
        results[technique_name] = {}
        for model_name, model in models.items():
            X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            results[technique_name][model_name] = accuracy

    return results

# Step 4: Main execution
def main():
    X, y = load_dataset()
    techniques = apply_sampling_techniques(X, y)
    results = evaluate_models(techniques)

    print("Evaluation Results:")
    for technique, model_results in results.items():
        print(f"\n{technique}:")
        for model, accuracy in model_results.items():
            print(f"  {model}: {accuracy:.4f}")

if __name__ == "__main__":
    main()


Evaluation Results:

RandomOverSampler:
  Random Forest: 1.0000
  Logistic Regression: 0.9192
  Decision Tree: 0.9978
  KNN: 0.9847
  SVM: 0.6856

RandomUnderSampler:
  Random Forest: 0.5000
  Logistic Regression: 0.8333
  Decision Tree: 0.8333
  KNN: 0.6667
  SVM: 0.1667

ProportionalSampling:
  Random Forest: 0.9892
  Logistic Regression: 0.9785
  Decision Tree: 0.9785
  KNN: 0.9892
  SVM: 0.9892

StratifiedSampling:
  Random Forest: 0.9828
  Logistic Regression: 0.9828
  Decision Tree: 0.9828
  KNN: 0.9828
  SVM: 0.9828

NearMiss:
  Random Forest: 0.1667
  Logistic Regression: 0.5000
  Decision Tree: 0.1667
  KNN: 0.8333
  SVM: 0.1667
