<a href="https://colab.research.google.com/github/Aryansood18/Assignments-predictive-analytics/blob/main/predictive_analytics1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [21]:
df = pd.read_csv("Creditcard_data.csv")

features = df.drop("Class", axis=1)
labels = df["Class"]


In [22]:
combined = pd.concat([features, labels], axis=1)

major_class = combined[combined["Class"] == 0]
minor_class = combined[combined["Class"] == 1]

minor_resampled = minor_class.sample(
    n=len(major_class),
    replace=True,
    random_state=21
)

balanced_df = pd.concat([major_class, minor_resampled]).sample(
    frac=1,
    random_state=21
)

X_data = balanced_df.drop("Class", axis=1)
y_data = balanced_df["Class"]


In [25]:
model_bank = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_KNN": KNeighborsClassifier(),
    "M3_DecisionTree": DecisionTreeClassifier(random_state=9),
    "M4_RandomForest": RandomForestClassifier(random_state=9),
    "M5_SVM": SVC()
}


In [24]:
def random_holdout():
    return train_test_split(
        X_data, y_data,
        test_size=0.25,
        random_state=11,
        shuffle=True
    )

def stratified_holdout():
    return train_test_split(
        X_data, y_data,
        test_size=0.25,
        random_state=12,
        stratify=y_data
    )

def bootstrap_holdout():
    boot_df = pd.concat([X_data, y_data], axis=1).sample(
        frac=1,
        replace=True,
        random_state=13
    )
    return train_test_split(
        boot_df.drop("Class", axis=1),
        boot_df["Class"],
        test_size=0.25,
        random_state=13
    )

def kfold_holdout():
    kf = KFold(n_splits=5, shuffle=True, random_state=14)
    tr, te = next(kf.split(X_data))
    return X_data.iloc[tr], X_data.iloc[te], y_data.iloc[tr], y_data.iloc[te]

def stratified_kfold_holdout():
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
    tr, te = next(skf.split(X_data, y_data))
    return X_data.iloc[tr], X_data.iloc[te], y_data.iloc[tr], y_data.iloc[te]


In [26]:
sampling_methods = [
    random_holdout,
    stratified_holdout,
    bootstrap_holdout,
    kfold_holdout,
    stratified_kfold_holdout
]

sampling_labels = [
    "Sampling1",
    "Sampling2",
    "Sampling3",
    "Sampling4",
    "Sampling5"
]


In [27]:
results = pd.DataFrame(
    index=model_bank.keys(),
    columns=sampling_labels
)


In [28]:
for idx, sampler in enumerate(sampling_methods):
    X_train, X_test, y_train, y_test = sampler()

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in model_bank.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results.loc[model_name, sampling_labels[idx]] = round(
            accuracy_score(y_test, predictions) * 100, 2
        )


In [29]:
results = results.astype(float)
results


Unnamed: 0,Sampling1,Sampling2,Sampling3,Sampling4,Sampling5
M1_LogisticRegression,92.67,94.24,92.93,93.79,90.85
M2_KNN,98.17,98.43,97.91,99.02,95.75
M3_DecisionTree,100.0,99.74,98.69,99.35,99.35
M4_RandomForest,100.0,100.0,100.0,100.0,100.0
M5_SVM,98.69,98.95,99.21,98.69,96.41


In [30]:
best_sampling_each_model = results.idxmax(axis=1)

best_sampling_df = pd.DataFrame({
    "Best_Sampling_Technique": best_sampling_each_model
})

best_sampling_df


Unnamed: 0,Best_Sampling_Technique
M1_LogisticRegression,Sampling2
M2_KNN,Sampling4
M3_DecisionTree,Sampling1
M4_RandomForest,Sampling1
M5_SVM,Sampling3


In [31]:
average_scores = results.mean(axis=0)
overall_best_sampling = average_scores.idxmax()

overall_best_sampling


'Sampling2'