# Assignment-Sampling

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Step 1: Load the dataset

In [None]:
# Read the raw table, then separate the 'Class' column (used as the target
# by every later cell) from the remaining feature columns.
data = pd.read_csv('Creditcard_data.csv')
y = data['Class']
X = data.drop(columns=['Class'])

Step 2: Balance the dataset

In [None]:
# Over-sampled copy of the data; this is what the sampling cells draw from.
oversampler = RandomOverSampler(random_state=42)
X_over, y_over = oversampler.fit_resample(X, y)

# Under-sampled copy of the data.
# NOTE(review): X_under / y_under are not referenced by any later visible cell.
undersampler = RandomUnderSampler(random_state=42)
X_under, y_under = undersampler.fit_resample(X, y)


Step 3: Sample size determination formula

In [None]:
def calculate_sample_size(total, margin_of_error=0.05, confidence=0.95):
    """Return the sample size for estimating a proportion, capped at ``total``.

    Uses n = z^2 * p * (1 - p) / e^2 with p = 0.5 (maximum variability) and
    truncates the result to an integer.

    Parameters
    ----------
    total : int
        Number of rows available; the returned size never exceeds it.
    margin_of_error : float, default 0.05
        Desired margin of error ``e``.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1. The z-score is
        derived from this value (previously the argument was ignored and
        z was fixed at 1.96).

    Returns
    -------
    int
        ``min(int(n), total)``.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from statistics import NormalDist

    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    # Two-sided critical value: e.g. confidence=0.95 -> z ~= 1.96.
    z = NormalDist().inv_cdf(0.5 + confidence / 2)
    p = 0.5  # Maximum variability
    n = (z**2 * p * (1 - p)) / (margin_of_error**2)
    return min(int(n), total)

# Target sample size, computed from the row count of the original
# (un-resampled) feature table X with the default margin and confidence.
sample_size = calculate_sample_size(total=len(X))


Step 4: Sampling techniques

In [None]:

# Simple random sampling: keep the "train" side of a shuffled split of the
# over-sampled data as the sample and discard the remainder.
X_simple, _, y_simple, _ = train_test_split(
    X_over,
    y_over,
    train_size=sample_size,
    random_state=1,
)


In [None]:
# Stratified sampling: draw `sample_size` rows while preserving the class
# proportions of the source labels.
# The sample is taken from the over-sampled data (X_over, y_over), like the
# other three techniques, so the accuracies are comparable. Sampling from the
# raw X, y kept the original class imbalance in this sample only.
X_stratified, _, y_stratified, _ = train_test_split(
    X_over,
    y_over,
    train_size=sample_size,
    stratify=y_over,
    random_state=3,
)


In [None]:
# Cluster Sampling
def cluster_sample(X, y, clusters, size, random_state=None):
    """Select whole clusters of rows until roughly ``size`` rows are covered.

    The index of ``X`` is cut into ``clusters`` contiguous, near-equal groups
    and ``size // (len(X) // clusters)`` of those groups are returned in full,
    so the result can be smaller or larger than ``size``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its index labels define the clusters.
    y : pandas.Series
        Labels, selected with the same index labels as ``X``.
    clusters : int
        Number of contiguous groups to cut the index into. If it exceeds the
        number of rows, one group per row is used instead.
    size : int
        Approximate number of rows wanted. Any positive ``size`` yields at
        least one cluster; a non-positive ``size`` yields empty results.
    random_state : int, numpy.random.Generator or None, default None
        ``None`` keeps the original deterministic behaviour of taking the
        leading groups. Any other value seeds a generator that picks the
        groups at random without replacement.

    Returns
    -------
    tuple
        ``(X_selected, y_selected)`` restricted to the chosen clusters.

    Raises
    ------
    ValueError
        If ``clusters`` is not positive.
    """
    if clusters <= 0:
        raise ValueError("clusters must be a positive integer")

    n_rows = len(X)
    if n_rows == 0 or size <= 0:
        return X.iloc[0:0], y.iloc[0:0]

    # Never create more groups than rows; otherwise len(X) // clusters is 0
    # and the division below raises ZeroDivisionError.
    n_clusters = min(clusters, n_rows)
    groups = np.array_split(X.index.to_numpy(), n_clusters)

    rows_per_cluster = n_rows // n_clusters
    # At least one cluster, so np.concatenate never receives an empty list.
    n_selected = min(n_clusters, max(1, size // rows_per_cluster))

    if random_state is None:
        chosen = range(n_selected)
    else:
        rng = np.random.default_rng(random_state)
        # Sorted so the selected rows keep their original relative order.
        chosen = np.sort(rng.choice(n_clusters, size=n_selected, replace=False))

    selected = np.concatenate([groups[i] for i in chosen])
    return X.loc[selected], y.loc[selected]

# Cluster sample of the over-sampled data: 10 index clusters, aiming for
# about `sample_size` rows.
X_cluster, y_cluster = cluster_sample(X=X_over, y=y_over, clusters=10, size=sample_size)

In [None]:
# Bootstrap Sampling
def bootstrap_sample(X, y, size, random_state=None):
    """Draw ``size`` rows from ``X`` and ``y`` with replacement.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to sample from (positional indexing is used).
    y : pandas.Series
        Labels; the same positions are taken as from ``X``.
    size : int
        Number of rows to draw. Rows can repeat.
    random_state : int, numpy.random.Generator or None, default None
        ``None`` draws from NumPy's global RNG (the original behaviour, so a
        prior ``np.random.seed`` call is still honoured). Any other value
        seeds a dedicated generator, making this call reproducible on its own.

    Returns
    -------
    tuple
        ``(X_sample, y_sample)`` with ``size`` rows each.
    """
    n_rows = len(X)
    if random_state is None:
        indices = np.random.choice(n_rows, size=size, replace=True)
    else:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(n_rows, size=size, replace=True)
    return X.iloc[indices], y.iloc[indices]

# bootstrap_sample draws from NumPy's global RNG when no explicit seed is
# passed, so seed it here; otherwise this sample (and every score computed
# from it) changes on each run.
np.random.seed(42)
X_bootstrap, y_bootstrap = bootstrap_sample(X_over, y_over, sample_size)

Step 5: Train models

In [None]:

# Candidate classifiers, keyed by display name.
# Logistic Regression, SVM and KNN are wrapped in a pipeline that standardises
# the features first. On the raw features Logistic Regression hit its
# iteration limit (see the convergence warning it emitted), and the distance-
# and margin-based models are dominated by whichever column has the largest
# scale. The scaler is fitted on the training split only, inside the pipeline,
# so no test-set information leaks into it. Tree-based models are
# scale-invariant and are left as they were.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    "SVM": make_pipeline(StandardScaler(), SVC()),
    "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier()),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Samples produced by the sampling cells, keyed by technique name.
samples = {
    "Simple Random": (X_simple, y_simple),
    "Stratified": (X_stratified, y_stratified),
    "Cluster": (X_cluster, y_cluster),
    "Bootstrap": (X_bootstrap, y_bootstrap)
}

# One record per (model, sampling technique) pair: hold out 30% of the sample,
# fit on the rest, and score accuracy on the held-out part.
# Later cells read the loop variables `model` and `X_test` after this loop
# finishes, so those names are kept deliberately.
results = []
for model_name, model in models.items():
    for sample_name, (X_samp, y_samp) in samples.items():
        X_train, X_test, y_train, y_test = train_test_split(
            X_samp, y_samp, test_size=0.3, random_state=42
        )
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        results.append({"Model": model_name, "Sample": sample_name, "Accuracy": accuracy})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Step 6: Results

In [None]:
# Collect every (model, sample) score into one table.
results_df = pd.DataFrame(results)

# For each model keep the row with the highest accuracy (idxmax returns the
# first such row label when several rows tie).
best_row_labels = results_df.groupby("Model")['Accuracy'].idxmax()
best_results = results_df.loc[best_row_labels]

print("All Results:", results_df, sep="\n")
print("\nBest Results:", best_results, sep="\n")

# Persist both tables as CSV files without the row index.
for frame, csv_path in ((results_df, 'all_results.csv'), (best_results, 'best_results.csv')):
    frame.to_csv(csv_path, index=False)


All Results:
                  Model         Sample  Accuracy
0         Random Forest  Simple Random  0.991379
1         Random Forest     Stratified  0.991379
2         Random Forest        Cluster  0.978261
3         Random Forest      Bootstrap  1.000000
4   Logistic Regression  Simple Random  0.853448
5   Logistic Regression     Stratified  0.991379
6   Logistic Regression        Cluster  0.956522
7   Logistic Regression      Bootstrap  0.913793
8                   SVM  Simple Random  0.750000
9                   SVM     Stratified  0.991379
10                  SVM        Cluster  0.978261
11                  SVM      Bootstrap  0.724138
12                  KNN  Simple Random  0.956897
13                  KNN     Stratified  0.991379
14                  KNN        Cluster  0.978261
15                  KNN      Bootstrap  0.939655
16        Decision Tree  Simple Random  0.982759
17        Decision Tree     Stratified  0.982759
18        Decision Tree        Cluster  0.978261
19     

In [None]:
import pickle

# NOTE(review): `model` is not assigned in this cell. In the visible notebook
# its last binding is the loop variable of the training loop, i.e. the last
# entry of `models` after its final fit. Confirm that is the model meant to
# be saved.
filename = 'sampling.pkl'  # Destination file for the pickled model
with open(filename, mode='wb') as pkl_out:
    pickle.dump(model, pkl_out)

print(f"Model saved as {filename}")

Model saved as sampling.pkl


In [None]:
# Offer the pickle as a browser download. `google.colab` only exists inside
# Colab, so the import is guarded to keep "Run All" working elsewhere.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; {filename} was not downloaded.")
else:
    files.download(filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Read the model back from the pickle written earlier in this notebook.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(filename, mode='rb') as pkl_in:
    loaded_model = pickle.load(pkl_in)

# NOTE(review): `X_test` is not defined in this cell. In the visible notebook
# it is left over from the last iteration of the training loop.
predictions = loaded_model.predict(X_test)
print("Predictions:", predictions)

Predictions: [1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 1 0
 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0
 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 1 1 1 1
 0 1 1 0 1]


In [None]:
# Destination of the pickle inside the mounted Google Drive.
pickle_path = '/content/drive/My Drive/sampling.pkl'

# `google.colab` only exists inside Colab, so the import is guarded to keep
# "Run All" working elsewhere.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive export.")
else:
    drive.mount('/content/drive')

    # Save the pickle file to Google Drive
    # NOTE(review): `model` is the variable left over from the training loop.
    with open(pickle_path, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {pickle_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to /content/drive/My Drive/sampling.pkl
