# KK2 – Kreditkortsbedrägerier (Agnesa Bashota)
Minimal kod, auto-install, autosave av resultat till `logs/` och figurer till `images/`.

In [None]:

# Auto-install required packages if they are missing.
import importlib
import sys, subprocess

packages = ["scikit-learn", "pandas", "numpy", "matplotlib", "seaborn", "imbalanced-learn"]
# pip distribution names that differ from the importable module name.
# Probing with the pip name can never succeed for these, which would
# trigger a reinstall on every single run.
import_names = {"scikit-learn": "sklearn", "imbalanced-learn": "imblearn"}

for p in packages:
    try:
        importlib.import_module(import_names.get(p, p))
    except ImportError:
        print(f"Installerar {p} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", p])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()


In [None]:

import os, warnings, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                             roc_curve, precision_recall_fscore_support, accuracy_score)
# SMOTE is an optional dependency: record whether it could be imported and
# warn (instead of failing) when it could not.
try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE_AVAILABLE = False
    warnings.warn("SMOTE saknas: använder klassvikter i modellerna istället.")
else:
    SMOTE_AVAILABLE = True

# Single seed shared by NumPy's global generator and the estimators.
RNG = 42
np.random.seed(RNG)

# Output folders for saved figures and log files.
for out_dir in ("images", "logs"):
    os.makedirs(out_dir, exist_ok=True)


In [None]:

from datetime import datetime, timezone

DATA_PATH = "data/creditcard.csv"
# Fall back to generated data when the CSV is not present on disk.
USE_SYNTH = not os.path.exists(DATA_PATH)

if USE_SYNTH:
    from sklearn.datasets import make_classification
    # Strongly imbalanced two-class problem (class weights 0.995 / 0.005).
    X, y = make_classification(n_samples=6000, n_features=30, n_informative=12, n_redundant=4,
                               weights=[0.995, 0.005], random_state=RNG)
    cols = [f"V{i}" for i in range(1, 31)]
    df = pd.DataFrame(X, columns=cols)
    # Amount and Time are drawn from NumPy's global generator, independently
    # of the label.
    # NOTE(review): the synthetic frame has 30 V-columns plus Amount and Time;
    # confirm against the real CSV's schema if the column count matters.
    df["Amount"] = np.abs(np.random.lognormal(mean=3.5, sigma=1.0, size=len(df)))
    df["Time"] = np.random.randint(0, 172800, size=len(df))
    df["Class"] = y
    print("Använder SYNTETISKT dataset (lägg Kaggle-filen i data/creditcard.csv för riktiga data).")
else:
    df = pd.read_csv(DATA_PATH)
    print("Laddade Kaggle-dataset.")

# Save run metadata. A timezone-aware "now" replaces the deprecated
# datetime.utcnow(); the "+00:00" suffix is rewritten to "Z" so the stored
# timestamp keeps its previous format.
created_utc = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
meta = {
    "created_utc": created_utc,
    "use_synthetic": bool(USE_SYNTH),
    "rows": int(df.shape[0]),
    "cols": int(df.shape[1]),
}
with open("logs/meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# Class distribution as relative shares, ordered by class label.
class_share = df['Class'].value_counts(normalize=True).sort_index()
ax = class_share.plot(kind='bar', title='Klassfördelning (andel)')
plt.tight_layout()
plt.savefig("images/class_distribution.png", dpi=150)
plt.show()
# Last expression of the cell: shown as the cell output in a notebook.
df.shape


In [None]:

# Target is the integer-cast 'Class' column; every other column is a feature.
y = df['Class'].astype(int)
X = df.drop(columns=['Class'])

# Stratified 80/20 split so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RNG,
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Small overview table of the split sizes (shown as the cell output).
split_sizes = {"train": len(y_train), "test": len(y_test)}
pd.DataFrame({"set": list(split_sizes), "rows": list(split_sizes.values())})


In [None]:

# Choose how class imbalance is handled and record the choice in BAL.
if not SMOTE_AVAILABLE:
    # No resampling: the scaled training data is used as-is and BAL tells the
    # model-building step to rely on class weights.
    X_tr, y_tr = X_train_s, y_train
    BAL = "class_weight"
else:
    # Resample the scaled training data with SMOTE (test data is untouched).
    sm = SMOTE(random_state=RNG)
    X_tr, y_tr = sm.fit_resample(X_train_s, y_train)
    BAL = "SMOTE"
print("Balanseringsmetod:", BAL)


In [None]:

# Class weights are only switched on when the data was not resampled;
# otherwise the estimators keep their default (class_weight=None).
use_class_weights = BAL == "class_weight"

logreg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced' if use_class_weights else None,
    random_state=RNG,
)
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=RNG,
    n_jobs=-1,
    class_weight='balanced_subsample' if use_class_weights else None,
)
# The MLP is configured identically in both modes.
mlp = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=120, random_state=RNG)

# Fit every model on the (possibly resampled) training data.
models = {"LogReg": logreg, "RandomForest": rf, "NeuralNet": mlp}
for label, estimator in models.items():
    estimator.fit(X_tr, y_tr)
    print("Tränad:", label)


In [None]:

def eval_model(model, X_te, y_te):
    """Evaluate a fitted binary classifier on a held-out set.

    Args:
        model: Fitted estimator with ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_te: Test features passed unchanged to the estimator.
        y_te: True labels for ``X_te``.

    Returns:
        Tuple ``(acc, prec, rec, f1, auc, pred, proba, rep)``: accuracy,
        binary precision/recall/F1, ROC AUC, the hard predictions, the
        scores used for the AUC, and the text classification report.
    """
    pred = model.predict(X_te)

    # Pick the best available continuous score for the ROC AUC.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_te)[:,1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_te)
    else:
        # No score available: reuse the hard predictions instead of
        # calling predict() a second time.
        proba = pred

    acc = accuracy_score(y_te, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_te, pred, average='binary', zero_division=0)
    auc = roc_auc_score(y_te, proba)
    # zero_division=0 matches the call above, so a model that predicts no
    # positives yields 0.0 in the report without an undefined-metric warning.
    rep = classification_report(y_te, pred, digits=3, zero_division=0)
    return acc, prec, rec, f1, auc, pred, proba, rep

# Evaluate every trained model on the scaled test set.
summary = []   # one row of metrics per model
store = {}     # per-model predictions and scores, reused by later plots
reports = {}   # per-model text classification report
for name, m in models.items():
    acc, prec, rec, f1, auc, pred, proba, rep = eval_model(m, X_test_s, y_test)
    summary.append([name, acc, prec, rec, f1, auc])
    store[name] = {"pred": pred, "proba": proba}
    reports[name] = rep
    # Explicit encoding, consistent with how logs/meta.json is written, so
    # the output does not depend on the platform's default encoding.
    with open(f"logs/classification_report_{name}.txt", "w", encoding="utf-8") as f:
        f.write(rep)

import pandas as pd
# Best F1 first; the first row is what later cells treat as the best model.
summary_df = pd.DataFrame(summary, columns=["Modell","Accuracy","Precision","Recall","F1","AUC"]).sort_values("F1", ascending=False)
summary_df.to_csv("logs/metrics.csv", index=False)
summary_df


In [None]:

from sklearn.metrics import roc_curve, roc_auc_score

# One ROC curve per model, drawn from the scores stored during evaluation.
plt.figure(figsize=(6,4))
for name in models:
    scores = store[name]["proba"]
    fpr, tpr, _ = roc_curve(y_test, scores)
    plt.plot(fpr, tpr, label=f"{name}")

# Dashed diagonal as the chance-level reference.
plt.plot([0,1],[0,1],'--')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC-kurvor")
plt.legend()
plt.tight_layout()
plt.savefig("images/roc_curves.png", dpi=150)
plt.show()


In [None]:

# summary_df is sorted by F1 descending, so its first row is the best model.
best = summary_df.iloc[0]['Modell']
from sklearn.metrics import confusion_matrix
import seaborn as sns
cm = confusion_matrix(y_test, store[best]["pred"])

# Start a fresh figure: sns.heatmap draws on the current axes, so without
# this the matrix is painted over any earlier plot that is still open
# (e.g. when the code runs outside an inline notebook backend).
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', xticklabels=["Ej bedrägeri","Bedrägeri"], yticklabels=["Ej bedrägeri","Bedrägeri"])
# confusion_matrix puts true labels on rows and predictions on columns.
plt.xlabel("Predikterad klass")
plt.ylabel("Sann klass")
plt.title(f"Confusion Matrix – {best}")
plt.tight_layout()
plt.savefig("images/confusion_matrix.png", dpi=150)
plt.show()

best


In [None]:

from datetime import datetime, timezone

# A single timezone-aware timestamp, used for both the log file and the
# console message. It replaces the deprecated datetime.utcnow(); "+00:00"
# is rewritten to "Z" to keep the previous timestamp format.
finished_utc = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
with open("logs/run_info.json", "w", encoding="utf-8") as f:
    json.dump({"finished_utc": finished_utc, "balancing": BAL}, f, indent=2)
print("Klar:", finished_utc)
