<a href="https://colab.research.google.com/github/DeoraHarleen/Machine-Learning/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

# Default location of the credit-card dataset (raw CSV hosted on GitHub).
DATASET_URL = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"


def load_dataset(url=DATASET_URL):
    """Load the credit-card dataset into a DataFrame.

    Args:
        url: Location of the CSV file. Anything ``pd.read_csv`` accepts works
            (URL, local path, or open file-like object). Defaults to the raw
            GitHub URL of ``Creditcard_data.csv``.

    Returns:
        pd.DataFrame: The parsed CSV contents.
    """
    return pd.read_csv(url)

# Accumulates one result row (dict) per model / sampling-technique combination
results = []

# Load the dataset
dataset = load_dataset()

# Candidate estimators, keyed by the name reported in the results table.
# The stochastic estimators are seeded so that repeated runs produce the same
# accuracies, consistent with the seeded sampling and splitting
# (random_state=42) used in the rest of this script.
models = {
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    # n_init is pinned explicitly so the result does not depend on the
    # scikit-learn version's default for this parameter.
    'Clustering': KMeans(n_clusters=2, n_init=10, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

def systematic_sample(data, sample_size):
    """Return a systematic sample of at most ``sample_size`` rows of ``data``.

    Rows are taken at a fixed stride of ``len(data) // sample_size`` starting
    from the first row. The stride is floored at 1, and the selection is
    truncated to ``sample_size`` rows because integer division of the stride
    would otherwise yield more rows than requested.

    Args:
        data: DataFrame to sample from.
        sample_size: Number of rows requested; must be positive.

    Returns:
        DataFrame with ``min(sample_size, len(data))`` rows.

    Raises:
        ValueError: If ``sample_size`` is not positive.
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer")
    # A stride of 0 (sample_size > len(data)) would make np.arange fail.
    step = max(len(data) // sample_size, 1)
    positions = np.arange(0, len(data), step)[:sample_size]
    return data.iloc[positions]


# Each entry maps a technique name to a callable taking the sample size and
# returning the sampled DataFrame (drawn from the module-level ``dataset``).
sampling_techniques = {
    # replace=False: sklearn's resample defaults to sampling WITH replacement,
    # which would make this entry identical to 'Bootstrap Sampling' below.
    'Simple Random Sampling': lambda sample_size: resample(dataset, n_samples=sample_size, replace=False, random_state=42),
    'Systematic Sampling': lambda sample_size: systematic_sample(dataset, sample_size),
    'Bootstrap Sampling': lambda sample_size: resample(dataset, n_samples=sample_size, replace=True, random_state=42),
    # NOTE(review): despite its label, this draws a bootstrap sample stratified
    # on 'Class' (i.e. stratified sampling), not a cluster sample. The label is
    # kept so existing result tables stay comparable.
    'Cluster Sampling': lambda sample_size: resample(dataset, n_samples=sample_size, stratify=dataset['Class'], random_state=42),
    # Placeholder: cross-validation uses the full dataset and ignores sample_size.
    'Cross-Validation': lambda sample_size: dataset
}

samples = [100, 200, 300]  # Sample sizes requested from each sampling technique

# For every (model, sampling technique) pair, measure accuracy at each sample
# size and record the mean across sizes as one row in ``results``.
for model_name, model in models.items():
    for sampling_name, sampling_function in sampling_techniques.items():
        accuracies = []
        for sample in samples:
            sampled_data = sampling_function(sample)
            X = sampled_data.drop('Class', axis=1)
            y = sampled_data['Class']

            if sampling_name == 'Cross-Validation':
                # Special case: 5-fold cross-validation instead of a single
                # train/test split.
                # NOTE: the 'Cross-Validation' sampler returns the full dataset
                # regardless of ``sample``, so every pass scores the same data.
                cv = KFold(n_splits=5, shuffle=True, random_state=42)
                # scoring must be explicit: without it cross_val_score falls
                # back to estimator.score, which for KMeans is the negative
                # inertia rather than an accuracy.
                model_cv = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
                accuracy = np.mean(model_cv)
            else:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

                # Train the model
                model.fit(X_train, y_train)

                # Evaluate on the held-out 30% test split
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)

            # NOTE(review): for the unsupervised 'Clustering' model the
            # predicted cluster ids are arbitrary and not aligned with the
            # 'Class' labels, so its accuracy should be interpreted with care.
            accuracies.append(accuracy)

        average_accuracy = np.mean(accuracies)
        result_entry = {'Model': model_name, 'Sampling Technique': sampling_name, 'Average Accuracy': average_accuracy}
        results.append(result_entry)

# Persist the per-combination results table to disk
results_df = pd.DataFrame(results)
results_df.to_csv('sampling_results.csv', index=False)

# Report the row with the highest average accuracy
best_index = results_df['Average Accuracy'].idxmax()
best_combination = results_df.loc[best_index]
best_model = best_combination['Model']
best_sampling = best_combination['Sampling Technique']
best_accuracy = best_combination['Average Accuracy']
print(f"The best combination is {best_model} - {best_sampling} with average accuracy: {best_accuracy}")




The best combination is KNN - Cross-Validation with average accuracy: 0.9883619606200252
