In [1]:
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from google.colab import files
import zipfile
import logging
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, learning_curve
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score, confusion_matrix, precision_recall_curve
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.manifold import TSNE
import google.colab as colab
from google.colab import drive



import xgboost as xgb
import shap
import os
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [2]:
#Create directory for saving plots.
def setup_plot_dir():
  plt_dir = 'plots'
  if not os.path.exists(plt_dir):
    os.makedirs(plt_dir)
  return plt_dir


In [3]:
#Plot class distribution (fraud vs non-fraud).
def plot_class_distribution(y, plot_dir):
   plt.figure(figsize=(6,4))
   sns.countplot(x=y,palette='Set2')
   plt.title('Class Distribution', fontsize=14)
   plt.xlabel('Class', fontsize=14)
   plt.ylabel('Count', fontsize=14)
   plt.savefig(f"{plot_dir}/class_distribution.png", bbox_inches='tight')
   plt.close()
   logging.info("Class distribution plot saved.")

In [4]:
#Plot boxplots for specified features by class.
def plot_feature_boxplots(df, features, plot_dir):
  plt.figure(figsize=(10, 6))
  for i, feature in enumerate(features, 1):
    plt.subplot(1, len(features), i)
    sns.boxplot(x='Class', y=feature, data=df, palette='Set2')
    plt.title(f"{feature} Distribution")
  plt.tight_layout()
  plt.savefig(f"{plot_dir}/feature_boxplots.png", bbox_inches='tight')
  plt.close()
  logging.info("Feature boxplots saved.")


In [5]:
#Plot correlation matrix heatmap.
def plot_correlation_matrix(df, plot_dir):
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), cmap='coolwarm', annot=False)
    plt.title("Correlation Matrix")
    plt.savefig(f"{plot_dir}/correlation_matrix.png", bbox_inches='tight')
    plt.close()
    logging.info("Correlation matrix plot saved.")

In [6]:
#Plot t-SNE for fraud vs non-fraud clustering.
def plot_tsne(X, y, plot_dir):
    start = time.time()
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X.sample(1000, random_state=42))  # Sample for speed
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y.sample(1000, random_state=42), cmap='Set2')
    plt.legend(*scatter.legend_elements(), title="Classes")
    plt.title("t-SNE Clustering")
    plt.savefig(f"{plot_dir}/tsne.png", bbox_inches='tight')
    plt.close()
    logging.info(f"t-SNE plot saved. Time: {time.time() - start:.2f}s")

In [7]:
#Plot precision-recall curve.
def plot_precision_recall(y_test, y_proba, model_name, plot_dir):
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, label=f"PR-AUC: {average_precision_score(y_test, y_proba):.2f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"Precision-Recall Curve ({model_name})")
    plt.legend()
    plt.savefig(f"{plot_dir}/precision_recall_{model_name}.png", bbox_inches='tight')
    plt.close()
    logging.info(f"Precision-recall curve for {model_name} saved.")

In [8]:
#Plot confusion matrix heatmap.
def plot_confusion_matrix(y_test, y_pred, model_name, plot_dir):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f"Confusion Matrix ({model_name})")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig(f"{plot_dir}/confusion_matrix_{model_name}.png", bbox_inches='tight')
    plt.close()
    logging.info(f"Confusion matrix for {model_name} saved.")

In [9]:
# Plot learning curve to check overfitting.
def plot_learning_curve(model, X, y, model_name, plot_dir):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=3, scoring='f1', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)
    )
    plt.figure(figsize=(6, 4))
    plt.plot(train_sizes, train_scores.mean(axis=1), label="Training F1")
    plt.plot(train_sizes, val_scores.mean(axis=1), label="Validation F1")
    plt.xlabel("Training Examples")
    plt.ylabel("F1 Score")
    plt.title(f"Learning Curve ({model_name})")
    plt.legend()
    plt.savefig(f"{plot_dir}/learning_curve_{model_name}.png", bbox_inches='tight')
    plt.close()
    logging.info(f"Learning curve for {model_name} saved.")

In [10]:
# Mount Google Drive
drive.mount('/content/drive')

# Paths
zip_path = "/content/drive/MyDrive/Colab Notebooks/creditcard.csv.zip"
extract_dir = "/content/temp_extract"  # Temporary folder to avoid conflict

# Create extract folder if not exists
os.makedirs(extract_dir, exist_ok=True)

# Extract ZIP into temp folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete! Files are in:", extract_dir)

# Find the CSV file inside the extracted folder
for file_name in os.listdir(extract_dir):
    if file_name.endswith(".csv"):
        csv_path = os.path.join(extract_dir, file_name)
        break

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extraction complete! Files are in: /content/temp_extract


In [11]:
# Load dataset
def load_data(file_path):
    df = pd.read_csv(file_path)
    print("Shape of dataset:", df.shape)
    print(df.head())
    return df

# Call function
df = load_data(csv_path)



Shape of dataset: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

 

In [12]:
df['Class'].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [13]:
def preprocess_data(df, plot_dir):
    scaler = RobustScaler()

    # Create X and y
    X = df.drop('Class', axis=1)
    y = df['Class']

    # Scale 'Amount' and 'Time' columns in X
    X['scaled_amount'] = scaler.fit_transform(X['Amount'].values.reshape(-1, 1))
    X['scaled_time'] = scaler.fit_transform(X['Time'].values.reshape(-1, 1))

    # Drop original 'Amount' and 'Time' columns from X
    X = X.drop(['Amount', 'Time'], axis=1)

    # Now split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Plot class distribution
    print("Plotting class distribution...")
    plot_class_distribution(y, plot_dir)
    print("Done class distribution.")

    # Optional: Plot feature boxplots and correlation matrix
    plot_df = X.copy()
    plot_df['Class'] = y.values
    plot_feature_boxplots(plot_df, ['V10', 'V12', 'V14'], plot_dir)
    plot_correlation_matrix(plot_df, plot_dir)

    # t-SNE plot (optimized for speed)
    print("Starting t-SNE plot...")
    sample_size = min(500, len(X))  # smaller sample size for speed
    sample_X = X.sample(sample_size, random_state=42)
    sample_y = y.loc[sample_X.index]

    from sklearn.manifold import TSNE
    import time

    start_time = time.time()
    tsne = TSNE(
        n_components=2,
        random_state=42,
        n_iter=300,          # fewer iterations for faster run (default is 1000)
        perplexity=30,
        n_jobs=-1            # use all CPUs (if sklearn supports it in your version)
    )
    X_tsne = tsne.fit_transform(sample_X)

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=sample_y, cmap='Set2', alpha=0.7)
    plt.legend(*scatter.legend_elements(), title="Classes")
    plt.title("t-SNE Clustering (sampled)")
    plt.savefig(f"{plot_dir}/tsne.png", bbox_inches='tight')
    plt.close()
    print(f"Completed t-SNE plot in {time.time() - start_time:.2f} seconds.")

    print(f"Train set: {X_train.shape[0]} samples | Test set: {X_test.shape[0]} samples")
    print(f"Fraud distribution: Train {y_train.mean():.4f} | Test {y_test.mean():.4f}")

    return X_train, X_test, y_train, y_test


In [14]:
#Remove outliers using IQR on specified features for fraud class
def remove_outliers(df, features, multiplier=1.5):
    fraud_df = df[df['Class'] == 1]
    for feature in features:
        q25, q75 = np.percentile(fraud_df[feature], 25), np.percentile(fraud_df[feature], 75)
        iqr = q75 - q25
        lower, upper = q25 - (iqr * multiplier), q75 + (iqr * multiplier)
        fraud_df = fraud_df[(fraud_df[feature] >= lower) & (fraud_df[feature] <= upper)]

    df = pd.concat([df[df['Class'] == 0], fraud_df], axis=0)
    print(f"Outliers removed. New shape: {df.shape}")
    return df


In [15]:
#Train model with optional SMOTE and tuning; evaluate with plots
def train_and_evaluate(model, X_train, y_train, X_test, y_test,
                       model_name, use_smote=False, param_grid=None, plot_dir=None):
    start_time = time.time()

    steps = []
    if use_smote:
        steps.append(('smote', SMOTE(random_state=42)))
    steps.append(('model', model))
    pipeline = Pipeline(steps)

    if param_grid:
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        pipeline = grid_search.best_estimator_
        print(f"Best params for {model_name}: {grid_search.best_params_}")
    else:
        pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    if hasattr(pipeline, "predict_proba"):
        y_proba = pipeline.predict_proba(X_test)[:, 1]
    elif hasattr(pipeline, "decision_function"):
        y_proba = pipeline.decision_function(X_test)
    else:
        y_proba = y_pred


    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Precision: {precision:.4f}")

    print(f"\nClassification Report for {model_name}:\n{classification_report(y_test, y_pred)}")
    print(f"PR-AUC for {model_name}: {average_precision_score(y_test, y_proba):.4f}")
    print(f"Confusion Matrix for {model_name}:\n{confusion_matrix(y_test, y_pred)}")

    # Plots
    plot_precision_recall(y_test, y_proba, model_name, plot_dir)
    plot_confusion_matrix(y_test, y_pred, model_name, plot_dir)
    plot_learning_curve(pipeline, X_train, y_train, model_name, plot_dir)

    print(f"Training time for {model_name}: {time.time() - start_time:.2f}s")
    return pipeline, y_proba


In [16]:
import shap
import matplotlib.pyplot as plt
import logging

def explain_model(model, X_test, model_name, plot_dir):
    # Get the trained estimator from the pipeline
    estimator = model.named_steps['model']

    # Sample fewer rows for speed
    sample_X = X_test.sample(min(200, len(X_test)), random_state=42)

    # Use TreeExplainer for speed (works for RF, XGB, LightGBM)
    explainer = shap.TreeExplainer(estimator)
    shap_values = explainer.shap_values(sample_X)

    plt.figure()
    shap.summary_plot(shap_values, sample_X, show=False)
    plt.savefig(f"{plot_dir}/shap_summary_{model_name}.png", bbox_inches='tight')
    plt.close()
    print(f"SHAP summary plot for {model_name} saved at {plot_dir}.")


In [17]:
def main(file_path):
    plot_dir = setup_plot_dir()
    df = load_data(file_path)

    # Outlier removal
    df = remove_outliers(df, features=['V10', 'V12', 'V14'])

    # SAMPLE a smaller subset for faster processing (e.g., 20,000 rows)
    df_small = df.sample(20000, random_state=42)

    # Now preprocess and split the smaller dataset
    X_train, X_test, y_train, y_test = preprocess_data(df_small, plot_dir)

    # Models with class weights and tuning
    models = {
        'XGBoost': (
            xgb.XGBClassifier(
                random_state=42,
                eval_metric='logloss',
                scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1])
            ),
            {
                'model__n_estimators': [50, 100],
                'model__max_depth': [3, 5]
            }
        ),
        'RandomForest': (
            RandomForestClassifier(
                class_weight='balanced',
                random_state=42
            ),
            {
                'model__n_estimators': [100, 200],
                'model__max_depth': [None, 10, 20]
            }
        )
    }

    for name, (model, param_grid) in models.items():
        print(f"\n--- Training {name} with SMOTE ---")
        pipeline, y_proba = train_and_evaluate(
            model, X_train, y_train, X_test, y_test,
            name, use_smote=True, param_grid=param_grid, plot_dir=plot_dir
        )
        explain_model(pipeline, X_test, name, plot_dir)

# Just call main with your CSV file path
main(csv_path)


Shape of dataset: (284807, 31)
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

 

<Figure size 640x480 with 0 Axes>

In [18]:
total = len(df)
fraud_count = df['Class'].sum()
non_fraud_count = total - fraud_count

print(f"Total samples: {total}")
print(f"Fraud cases: {fraud_count} ({fraud_count/total:.6f}%)")
print(f"Non-fraud cases: {non_fraud_count} ({non_fraud_count/total:.6f}%)")


Total samples: 284807
Fraud cases: 492 (0.001727%)
Non-fraud cases: 284315 (0.998273%)


In [21]:
!pip freeze > requirements.txt


In [20]:
!git clone https://github.com/Ayishathesni001a/Custome_card_prediction.git


Cloning into 'Custome_card_prediction'...


In [22]:
%cd Custome_card_prediction


/content/Custome_card_prediction


In [23]:
!git status


On branch main

No commits yet

nothing to commit (create/copy files and use "git add" to track)
