# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from imblearn.over_sampling import RandomOverSampler

# Load the credit dataset from a machine-specific absolute path; pd.read_csv
# raises FileNotFoundError when the file is not present at this location.
file_path = r'D:\DS_predictive_Analysis\Assigments\Sampling\credit_data.csv'
dataset = pd.read_csv(file_path)

# Rescale 'Amount' by passing the whole column to sklearn's normalize as a
# single row vector (default norm), then drop the first column by position.
dataset['Amount'] = normalize([dataset['Amount']])[0]
dataset = dataset.iloc[:, 1:]  # presumably removes a 'Time' column -- TODO confirm

# Everything except 'Class' is a feature; 'Class' is the label.
X_features = dataset.drop('Class', axis=1)
y_target = dataset['Class']

# Randomly duplicate minority-class rows (sampling_strategy=0.99). The seed
# keeps the resampled rows reproducible, in line with every other randomized
# step in this cell, which already uses random_state=42.
oversample = RandomOverSampler(sampling_strategy=0.99, random_state=42)
X_balanced, y_balanced = oversample.fit_resample(X_features, y_target)

# Sample size from z^2 * p * (1 - p) / e^2 with z=1.96, p=0.5, e=0.05, which
# evaluates to about 384.16. A minimum sample size must be rounded UP (385);
# truncating with int() alone would fall just short of the computed minimum.
sample_size = int(np.ceil((1.96**2 * 0.5 * 0.5) / (0.05**2)))
random_sample = X_balanced.sample(n=sample_size, random_state=42)

# Re-attach the labels of the sampled rows by index lookup.
random_sample['Class'] = y_balanced.loc[random_sample.index]

def train_and_test_models(X, y):
    """Fit five classifiers on an 80/20 split and report their test accuracy.

    The data is split once with ``train_test_split(test_size=0.2,
    random_state=42)``. Each model is fitted on the training part and scored
    with ``accuracy_score`` on the held-out part. One ``"<name>: <accuracy>"``
    line (four decimals) is printed per model.

    Args:
        X: Feature table accepted by ``train_test_split`` and the estimators.
        y: Target labels aligned with ``X``.

    Returns:
        dict: Model display name -> test accuracy, in evaluation order.
        Returning the scores lets callers reuse them instead of copying the
        printed numbers by hand.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # One mapping keeps each display name next to its estimator, so the two
    # can never drift out of sync the way parallel lists can.
    models = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(),
        'SVM': SVC(random_state=42),
        'KNN': KNeighborsClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        print(f"{name}: {accuracy:.4f}")
        results[name] = accuracy
    return results

# Evaluate every classifier on the drawn sample; 'Class' is the label and is
# kept out of the feature columns.
sample_features = random_sample.drop('Class', axis=1)
sample_labels = random_sample['Class']
train_and_test_models(sample_features, sample_labels)

# Accuracy figures per sampling technique, hard-coded in this cell
# (presumably copied from earlier runs -- TODO confirm). Note that the
# 'Systematic' figures are identical to the 'Simple' ones.
scores_by_technique = {
    'Simple': {
        'Random Forest': 0.9934,
        'Logistic Regression': 0.9079,
        'SVM': 0.9934,
        'KNN': 0.9737,
        'Gradient Boosting': 0.9803,
    },
    'Systematic': {
        'Random Forest': 0.9934,
        'Logistic Regression': 0.9079,
        'SVM': 0.9934,
        'KNN': 0.9737,
        'Gradient Boosting': 0.9803,
    },
}

# Flatten to "<model> (<technique>)" -> accuracy, all 'Simple' entries first.
accuracy_results = {
    f'{model} ({technique})': score
    for technique, scores in scores_by_technique.items()
    for model, score in scores.items()
}

labels = list(accuracy_results)
accuracy_values = [accuracy_results[label] for label in labels]
# Blue bars for labels containing 'Simple', green for everything else.
bar_colors = ['green' if 'Simple' not in label else 'blue' for label in labels]

# Horizontal bar chart with a dashed reference line at 0.95 accuracy.
fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(labels, accuracy_values, color=bar_colors)
ax.set_xlabel('Accuracy')
ax.set_title('Comparison of Model Accuracy Across Sampling Techniques')
ax.axvline(x=0.95, color='red', linestyle='--', label='95% Threshold')
ax.legend()
fig.tight_layout()
plt.show()


# Recorded cell output: FileNotFoundError: [Errno 2] No such file or directory: 'D:\\DS_predictive_Analysis\\Assigments\\Sampling\\credit_data.csv'