In [1]:
# Import necessary libraries
import os
import shutil
import subprocess
import sys

import pandas as pd
from google.colab import drive
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def load_data_from_kaggle(dataset_name, kaggle_file_path='/content/datasets', kaggle_json_drive_path=None,
                          csv_filename="PS_20174392719_1491204439457_log.csv", nrows=1000000):
    """
    Downloads a Kaggle dataset with the kaggle CLI and loads one CSV from it.

    Parameters:
        dataset_name (str): Kaggle dataset identifier (e.g., 'ealaxi/paysim1').
        kaggle_file_path (str): Path where the dataset should be downloaded and unzipped.
        kaggle_json_drive_path (str): Optional path to kaggle.json, joined onto
            '/content/drive/MyDrive', to avoid repeated uploads. When omitted the
            file is requested through the Colab upload dialog.
        csv_filename (str): Name of the CSV inside the unzipped dataset to load.
            Defaults to the file name that was previously hard-coded.
        nrows (int or None): Maximum number of rows to read (None reads everything).

    Returns:
        df (pd.DataFrame): Loaded DataFrame from the specified dataset.

    Raises:
        FileNotFoundError: If no credentials file was supplied and none is installed.
        RuntimeError: If the kaggle CLI exits with a non-zero status.
    """
    # The kaggle CLI reads its credentials from ~/.kaggle/kaggle.json; the directory
    # does not exist on a fresh runtime, so create it before writing into it.
    kaggle_dir = os.path.expanduser('~/.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    credentials_path = os.path.join(kaggle_dir, 'kaggle.json')

    if kaggle_json_drive_path:
        # Mount Google Drive if needed
        drive.mount('/content/drive')
        kaggle_json_path = os.path.join('/content/drive/MyDrive', kaggle_json_drive_path)
        shutil.copyfile(kaggle_json_path, credentials_path)
    else:
        # Upload kaggle.json manually. The upload dialog returns a mapping of
        # file name -> file content; install that content as the credentials file.
        uploaded = files.upload()
        if uploaded:
            # A re-uploaded file may come back under a different name, so fall
            # back to the first uploaded entry when 'kaggle.json' is not a key.
            chosen = 'kaggle.json' if 'kaggle.json' in uploaded else next(iter(uploaded))
            with open(credentials_path, 'wb') as handle:
                handle.write(uploaded[chosen])
        elif not os.path.exists(credentials_path):
            raise FileNotFoundError(
                "No kaggle.json was uploaded and none exists at " + credentials_path
            )

    # Restrict the credentials file to the current user
    os.chmod(credentials_path, 0o600)

    # Download and unzip the dataset. An argument list is passed instead of a shell
    # string so that the dataset name and target path are never interpreted by a shell.
    os.makedirs(kaggle_file_path, exist_ok=True)
    completed = subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', dataset_name, '-p', kaggle_file_path, '--unzip'],
        capture_output=True,
        text=True,
    )
    if completed.stdout:
        print(completed.stdout)
    if completed.returncode != 0:
        raise RuntimeError(
            f"kaggle download failed with exit code {completed.returncode}: {completed.stderr}"
        )

    print(f"Dataset downloaded to: {kaggle_file_path}")

    # Load the dataset
    file_path = os.path.join(kaggle_file_path, csv_filename)
    df = pd.read_csv(file_path, delimiter=',', nrows=nrows)

    return df

def analyze_and_balance_data(df, target_column='isFraud', sampling_ratio=95/5, random_state=42):
    """
    Reports class sizes and down-samples the non-fraud class to a target ratio.

    All rows where `target_column` equals 1 are kept. Rows where it equals 0 are
    randomly sampled without replacement so that there are
    `int(fraud_count * sampling_ratio)` of them. If fewer non-fraud rows exist
    than requested, every available non-fraud row is used instead of failing.
    The combined rows are shuffled and re-indexed from 0.

    Parameters:
        df (pd.DataFrame): Original DataFrame. It is not modified.
        target_column (str): Target column for fraud classification.
        sampling_ratio (float): Ratio of non-fraud to fraud samples.
        random_state (int): Random seed for reproducibility.

    Returns:
        balanced_df (pd.DataFrame): Shuffled dataset with fraud and sampled non-fraud transactions.
    """
    print(f"Original dataset shape: {df.shape}")

    fraud_df = df[df[target_column] == 1]
    non_fraud_df = df[df[target_column] == 0]

    print(f"Fraud dataset size: {fraud_df.shape}")
    print(f"Non-Fraud dataset size: {non_fraud_df.shape}")

    desired_non_fraud_count = int(len(fraud_df) * sampling_ratio)

    # Sampling without replacement cannot return more rows than exist, so cap the
    # request at the population size rather than letting DataFrame.sample raise.
    available_non_fraud_count = len(non_fraud_df)
    if desired_non_fraud_count > available_non_fraud_count:
        print(
            f"Requested {desired_non_fraud_count} non-fraud rows but only "
            f"{available_non_fraud_count} are available; using all of them."
        )
        desired_non_fraud_count = available_non_fraud_count

    non_fraud_sampled = non_fraud_df.sample(n=desired_non_fraud_count, random_state=random_state)

    # Shuffle so fraud rows are not grouped at the top, then rebuild a clean index
    balanced_df = (
        pd.concat([fraud_df, non_fraud_sampled])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    print(f"Balanced dataset shape: {balanced_df.shape}")

    print("Class balance after sampling:")
    print(balanced_df[target_column].value_counts(normalize=True) * 100)

    return balanced_df

def prepare_data_for_training(balanced_df, target_column='isFraud', top_features=None, test_size=0.2, random_state=42):
    """
    Prepares data for training by encoding, splitting, selecting features, and scaling.

    Steps, in order:
      1. Drop every column whose name contains 'name' and one-hot encode 'type'.
      2. Split into train and test sets.
      3. Fit a RandomForestClassifier on the training part and print the ten
         most important features.
      4. Keep `top_features` (or the three most important features when None).
      5. Fit a StandardScaler on the training part and apply it to both parts.

    The forest and the scaler are fitted on the training rows only so that no
    information from the test rows influences feature selection or scaling.

    NOTE(review): the notebook contains a second, later cell that defines a
    function with this same name; when the notebook is run top to bottom the
    later definition is the one in effect. Keep only one copy.

    Parameters:
        balanced_df (pd.DataFrame): Balanced DataFrame with target and features.
        target_column (str): Target column for classification.
        top_features (list): Feature names to keep; chosen from feature importance when None.
        test_size (float): Proportion of dataset to include in the test split.
        random_state (int): Random seed for reproducibility.

    Returns:
        X_train, X_test, y_train, y_test: Scaled feature arrays and target Series
        for model training.
    """
    # Remove the 'name*' columns BEFORE encoding. Building the mask from the
    # original columns and applying it to the encoded frame does not line up,
    # because encoding removes 'type' and appends new indicator columns.
    name_columns = [
        column for column in balanced_df.columns
        if 'name' in str(column) and column != target_column
    ]
    encoded_df = balanced_df.drop(columns=name_columns)

    # One-hot encode categorical columns
    if 'type' in encoded_df.columns:
        encoded_df = pd.get_dummies(encoded_df, columns=['type'])

    y = encoded_df[target_column]
    X = encoded_df.drop(columns=[target_column])

    # Split dataset first so every fitted step below only sees training rows
    X_train_df, X_test_df, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Feature selection using RandomForest (training rows only)
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train_df, y_train)

    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': X_train_df.columns, 'importance': importances}
    ).sort_values(by='importance', ascending=False)

    print("Top 10 features based on importance:")
    print(feature_importance_df.head(10))

    if top_features is None:
        top_features = feature_importance_df['feature'].head(3).tolist()

    # Scale features: learn mean/variance on train, reuse them for test
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_df[top_features])
    X_test = scaler.transform(X_test_df[top_features])

    print("Training set size:", X_train.shape, y_train.shape)
    print("Testing set size:", X_test.shape, y_test.shape)

    return X_train, X_test, y_train, y_test


In [None]:
def prepare_data_for_training(balanced_df, target_column='isFraud', top_features=None, test_size=0.2, random_state=42):
    """
    Prepares data for training by encoding, splitting, selecting features, and scaling.

    Steps, in order:
      1. Drop every column whose name contains 'name' and one-hot encode 'type'.
      2. Split into train and test sets.
      3. Fit a RandomForestClassifier on the training part and print the ten
         most important features.
      4. Keep `top_features` (or the three most important features when None).
      5. Fit a StandardScaler on the training part and apply it to both parts.

    The forest and the scaler are fitted on the training rows only so that no
    information from the test rows influences feature selection or scaling.

    NOTE(review): an earlier cell of the notebook already defines a function
    with this same name; this definition overrides it when the notebook is run
    top to bottom. Keep only one copy.

    Parameters:
        balanced_df (pd.DataFrame): Balanced DataFrame with target and features.
        target_column (str): Target column for classification.
        top_features (list): Feature names to keep; chosen from feature importance when None.
        test_size (float): Proportion of dataset to include in the test split.
        random_state (int): Random seed for reproducibility.

    Returns:
        X_train, X_test, y_train, y_test: Scaled feature arrays and target Series
        for model training.
    """
    # Remove the 'name*' columns BEFORE encoding. Building the mask from the
    # original columns and applying it to the encoded frame does not line up,
    # because encoding removes 'type' and appends new indicator columns.
    name_columns = [
        column for column in balanced_df.columns
        if 'name' in str(column) and column != target_column
    ]
    encoded_df = balanced_df.drop(columns=name_columns)

    # One-hot encode categorical columns
    if 'type' in encoded_df.columns:
        encoded_df = pd.get_dummies(encoded_df, columns=['type'])

    y = encoded_df[target_column]
    X = encoded_df.drop(columns=[target_column])

    # Split dataset first so every fitted step below only sees training rows
    X_train_df, X_test_df, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Feature selection using RandomForest (training rows only)
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train_df, y_train)

    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': X_train_df.columns, 'importance': importances}
    ).sort_values(by='importance', ascending=False)

    print("Top 10 features based on importance:")
    print(feature_importance_df.head(10))

    if top_features is None:
        top_features = feature_importance_df['feature'].head(3).tolist()

    # Scale features: learn mean/variance on train, reuse them for test
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_df[top_features])
    X_test = scaler.transform(X_test_df[top_features])

    print("Training set size:", X_train.shape, y_train.shape)
    print("Testing set size:", X_test.shape, y_test.shape)

    return X_train, X_test, y_train, y_test