In [62]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [63]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop columns whose share of missing values exceeds a threshold.

    The columns to remove are determined once in ``fit`` and reused unchanged
    in ``transform``, so every frame transformed afterwards loses the same
    columns regardless of its own missing-value pattern.

    Parameters
    ----------
    threshold : float, default=0.5
        A column is dropped when the fraction of null values it has in the
        frame passed to ``fit`` is strictly greater than this value.

    Attributes
    ----------
    columns_to_drop_ : list
        Labels of the columns selected for removal. Created by ``fit``.
    """

    def __init__(self, threshold=0.5):
        # Only hyper-parameters are stored here. The learned attribute
        # ``columns_to_drop_`` is created in ``fit`` so that an unfitted
        # instance is never mistaken for a fitted one.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` exceed the missing-value threshold.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame used to measure the per-column fraction of nulls.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        null_ratio = X.isnull().mean()
        self.columns_to_drop_ = null_ratio[null_ratio > self.threshold].index.tolist()
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``.

        Columns that were selected in ``fit`` but are absent from ``X`` are
        skipped silently (``errors="ignore"``).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # Local import keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "columns_to_drop_")
        return X.drop(columns=self.columns_to_drop_, errors="ignore")
    


In [52]:
# Directory that holds the competition CSV files.
path = "/kaggle/input/ieee-fraud-detection"
# Read the training transactions table into memory.
df = pd.read_csv(f"{path}/train_transaction.csv")

In [53]:
# Rebalance the data by downsampling the majority class (isFraud == 0).
# NOTE: this cell rebinds `df`; re-running it operates on the already balanced frame.

# Separate the classes
df_majority = df[df['isFraud'] == 0]
df_minority = df[df['isFraud'] == 1]

# Set your desired ratio here
desired_ratio = 3  # 3 for 3:1, 5 for 5:1

# Number of majority rows to keep. It is capped at the rows actually available,
# because sampling without replacement raises ValueError when more rows are
# requested than exist (e.g. after raising `desired_ratio`).
n_majority_samples = min(desired_ratio * len(df_minority), len(df_majority))

# Downsample the majority class
df_majority_downsampled = df_majority.sample(n=n_majority_samples, random_state=42)

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Shuffle the resulting dataframe and discard the old row labels
df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("New class distribution:")
print(df_balanced['isFraud'].value_counts())


New class distribution:
isFraud
0    61989
1    20663
Name: count, dtype: int64


In [54]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [55]:
# Target: the fraud label column.
y = df['isFraud']
# Features: every column except the label and TransactionID.
x = df.drop(columns=['isFraud', 'TransactionID'])
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [59]:
from sklearn.compose import make_column_selector

# Column lists computed from the complete feature frame.
# NOTE: `numerical_cols`, `categorical_cols` and `preprocessor` are not used by
# `full_pipeline` below, which selects columns by dtype when it is fitted.
numerical_cols = x.select_dtypes(include=['int64', 'float64']).columns.tolist()

categorical_cols = x.select_dtypes(include=['object']).columns.tolist()

# Numeric branch: fill missing values, then scale.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer()),  # Missing value handling
    ('scaler', StandardScaler())    # Feature scaling
])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ]
)


# End-to-end estimator: drop sparse columns -> preprocess -> classify.
# Dtype-based selectors are evaluated on whatever columns reach the
# ColumnTransformer, i.e. after `column_cleaner` has removed some of them.
full_pipeline = Pipeline([
    ('column_cleaner', ColumnDropper()),

    ('preprocessor', ColumnTransformer([
        ('num', numeric_pipeline, make_column_selector(dtype_include=['int64', 'float64'])),
        ('cat', categorical_pipeline, make_column_selector(dtype_include=['object']))
    ])),

    # Fixed seed so the forest (bootstrap samples, feature subsets) and
    # therefore the search results are reproducible across runs.
    ('model', RandomForestClassifier(random_state=42))
])

In [60]:
import category_encoders as ce
from scipy.stats import randint 
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Search space, grouped by pipeline stage. Keys use the `<step>__<param>`
# convention to address parameters nested inside `full_pipeline`.
cleaner_space = {
    'column_cleaner__threshold': [0.0, 0.25, 0.5, 0.75],
}

numeric_space = {
    'preprocessor__num__imputer__strategy': ['mean', 'median', 'constant'],
    'preprocessor__num__imputer__fill_value': [-999, -1],
    'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
}

categorical_space = {
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
    'preprocessor__cat__imputer__fill_value': ['missing', 'unknown'],
    'preprocessor__cat__encoder': [
        OneHotEncoder(handle_unknown='ignore'),
        ce.TargetEncoder(),
        ce.BinaryEncoder(),
        ce.CountEncoder(),
    ],
}

model_space = {
    'model__n_estimators': [30, 50, 100],
    'model__max_depth': [4, 7, 10, 15, 20],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [10, 15, 20],
    'model__min_samples_leaf': [3, 5, 8],
    'model__max_leaf_nodes': [20, 50, 70],
}

param_dist = {**cleaner_space, **numeric_space, **categorical_space, **model_space}

# Five shuffled folds with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate 10 randomly drawn configurations, scored by ROC-AUC, on all cores.
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)



In [64]:
# Run the hyper-parameter search on the training split.
random_search.fit(X=X_train, y=y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [65]:
# Report the winning configuration and its mean cross-validated score.
for label, value in (
    ("Best Parameters:", random_search.best_params_),
    ("Best Score:", random_search.best_score_),
):
    print(label, value)

Best Parameters: {'preprocessor__num__scaler': MinMaxScaler(), 'preprocessor__num__imputer__strategy': 'constant', 'preprocessor__num__imputer__fill_value': -999, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__cat__imputer__fill_value': 'unknown', 'preprocessor__cat__encoder': TargetEncoder(), 'model__n_estimators': 100, 'model__min_samples_split': 20, 'model__min_samples_leaf': 3, 'model__max_leaf_nodes': 70, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'column_cleaner__threshold': 0.5}
Best Score: 0.8702599206541091


In [66]:
# Predicted probability of class 1 (fraud) for each split, train first, then test.
y_train_proba, y_test_proba = (
    random_search.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC-AUC of those probabilities against the true labels.
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"Training ROC-AUC: {train_auc:.4f}")
print(f"Test ROC-AUC:     {test_auc:.4f}")

Training ROC-AUC: 0.8743
Test ROC-AUC:     0.8764


In [67]:
!pip install mlflow



In [68]:
!pip install dagshub



In [73]:
import mlflow
import dagshub

# Connect to the DagsHub repository with its MLflow integration enabled.
dagshub.init(repo_owner='CarlTeapot', repo_name='ML_Assignment2', mlflow=True)


name = "Random forest with k-fold cross validation for fraud detection2"


# Record the selected hyper-parameters, the best pipeline and its scores in one run.
with mlflow.start_run(run_name=name):
    mlflow.log_params(random_search.best_params_)
    mlflow.sklearn.log_model(random_search.best_estimator_, "best_model")
    # Metric keys use one spelling and no trailing whitespace, so they can be
    # queried and compared by their exact name. The cross-validated score is
    # logged as well, not only printed.
    mlflow.log_metric("CV ROC-AUC", random_search.best_score_)
    mlflow.log_metric("Training ROC-AUC", train_auc)
    mlflow.log_metric("Test ROC-AUC", test_auc)
    print("Best Score:", random_search.best_score_)




Best Score: 0.8702599206541091
🏃 View run Random forest with k-fold cross validation for fraud detection2 at: https://dagshub.com/CarlTeapot/ML_Assignment2.mlflow/#/experiments/0/runs/71805affb66c4f14839fcbf371b8bc9b
🧪 View experiment at: https://dagshub.com/CarlTeapot/ML_Assignment2.mlflow/#/experiments/0
