<a href="https://colab.research.google.com/github/AnwarMohamedhyphen/datascience-notebooks/blob/main/Credit_card_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Credit Card Fraud Detection (CRISP-DM) - Google Colab Version\n",
    "---\n",
    "This notebook is fully formatted for Google Colab with separate cells for each CRISP-DM stage."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Install Optional Libraries (if not installed)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "!pip install numpy pandas matplotlib scikit-learn joblib\n",
    "!pip install xgboost imbalanced-learn --quiet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "import joblib\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Load or Generate Data"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def load_data(csv_path='creditcard.csv', n_samples=120000):\n",
    "    if os.path.exists(csv_path):\n",
    "        print(f'Loading real dataset: {csv_path}')\n",
    "        df = pd.read_csv(csv_path)\n",
    "        return df\n",
    "    else:\n",
    "        print('No CSV found. Generating synthetic dataset...')\n",
    "        from sklearn.datasets import make_classification\n",
    "        X, y = make_classification(n_samples=n_samples, n_features=30, n_informative=15, n_redundant=5, weights=[0.995,0.005], class_sep=1.6, random_state=42)\n",
    "        cols = [f'V{i}' for i in range(1,29)] + ['Amount','Time']\n",
    "        df = pd.DataFrame(X, columns=cols)\n",
    "        df['Class'] = y.astype(int)\n",
    "        return df\n",
    "\n",
    "df = load_data()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Explore Data (EDA)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "print('Dataset shape:', df.shape)\n",
    "print('Class distribution:\n', df['Class'].value_counts(normalize=True))\n",
    "print('Missing values:\n', df.isna().sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "X = df.drop('Class', axis=1)\n",
    "y = df['Class']\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Model Training"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Logistic Regression\n",
    "logreg = LogisticRegression(max_iter=2000, class_weight='balanced')\n",
    "logreg.fit(X_train_scaled, y_train)\n",
    "\n",
    "# Random Forest\n",
    "rf = RandomForestClassifier(n_estimators=400, class_weight='balanced', n_jobs=-1, random_state=42)\n",
    "rf.fit(X_train_scaled, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Model Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def evaluate_model(model, X_test, y_test, name):\n",
    "    y_pred = model.predict(X_test)\n",
    "    y_scores = model.predict_proba(X_test)[:,1] if hasattr(model,'predict_proba') else model.decision_function(X_test)\n",
    "    print('---', name, '---')\n",
    "    print('Confusion Matrix:\n', confusion_matrix(y_test,y_pred))\n",
    "    print('Classification Report:\n', classification_report(y_test,y_pred,digits=4))\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_scores)\n",
    "    plt.figure()\n",
    "    plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc_score(y_test,y_scores):.4f}')\n",
    "    plt.plot([0,1],[0,1],'--')\n",
    "    plt.xlabel('False Positive Rate')\n",
    "    plt.ylabel('True Positive Rate')\n",
    "    plt.title(f'ROC Curve - {name}')\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "    precision, recall, _ = precision_recall_curve(y_test,y_scores)\n",
    "    plt.figure()\n",
    "    plt.plot(recall, precision, label=f'PR AUC = {average_precision_score(y_test,y_scores):.4f}')\n",
    "    plt.xlabel('Recall')\n",
    "    plt.ylabel('Precision')\n",
    "    plt.title(f'Precision-Recall Curve - {name}')\n",
    "    plt.legend()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "evaluate_model(logreg, X_test_scaled, y_test, 'Logistic Regression')\n",
    "evaluate_model(rf, X_test_scaled, y_test, 'Random Forest')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Save Best Model"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "best_model = rf  # choose the best by PR-AUC\n",
    "joblib.dump(best_model, 'best_fraud_model.joblib')\n",
    "print('Saved best model as best_fraud_model.joblib')"
   ]
  }
 ],
 "metadata": {"kernelspec": {"name": "python3", "display_name": "Python 3"}, "language_info": {"name": "python"}},
 "nbformat": 4,
 "nbformat_minor": 5
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Credit Card Fraud Detection (CRISP-DM) - Google Colab Version\n',
    '---\n',
    'This notebook is fully formatted for Google Colab with separate cells for each CRISP-DM stage.']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['## 1. Install Optional Libraries (if not installed)']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['!pip install numpy pandas matplotlib scikit-learn joblib\n',
    '!pip install xgboost imbalanced-learn --quiet']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['## 2. Import Libraries']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['import os\n',
    'import numpy as np\n',
    'import pandas as pd\n',
    'import matplotlib.pyplot as plt\n',
    'from sklearn.model_selection import train_test_split\n',
    'from sklearn.preprocessing import StandardScaler\n',
    'from sklearn.metrics import classification_report, confusion_matrix,