<a href="https://colab.research.google.com/github/Btere/btereml/blob/main/creditcard_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds train_fraud.csv and test_fraud.csv; passed to read_csv_files below.
# NOTE(review): this looks like a Google Drive location as seen from Colab — presumably
# the drive has to be mounted before the files can be read; confirm.
DATASET_PATH = Path("/content/drive/MyDrive/Colab Notebooks")




In [None]:
def read_csv_files(dataset_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the fraud training and test CSV files from one directory.

    Args:
        dataset_path: Directory containing ``train_fraud.csv`` and
            ``test_fraud.csv``. A plain string path is accepted as well.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of DataFrames.

    Raises:
        FileNotFoundError: If either CSV file does not exist (raised by
            ``pandas.read_csv``).
    """
    base_dir = Path(dataset_path)
    # index_col=False tells pandas not to use the first column as the index.
    train_dataset = pd.read_csv(base_dir / 'train_fraud.csv', index_col=False)
    test_dataset = pd.read_csv(base_dir / 'test_fraud.csv', index_col=False)
    return train_dataset, test_dataset

In [None]:
train_dataset, test_dataset = read_csv_files(DATASET_PATH)


In [None]:
display(train_dataset)

In [None]:
# Dataset overview

train_dataset.shape

In [None]:
test_dataset.shape

In [None]:
train_dataset.columns

In [None]:
train_dataset.info()

In [None]:
train_dataset.isnull().sum()

In [None]:
train_dataset.isna().sum()

In [None]:
display(test_dataset)

In [None]:
test_dataset.nunique()

In [None]:
train_dataset["is_fraud"].value_counts()

In [None]:
test_dataset["is_fraud"].value_counts()

Data cleaning

In [None]:
# Remove the 'Unnamed: 0' column from both sets; the results are new DataFrames,
# so train_dataset / test_dataset keep the column.
# NOTE(review): presumably this is a row index that was written into the CSV files — confirm.
train_data = train_dataset.drop(columns='Unnamed: 0')
test_data = test_dataset.drop(columns='Unnamed: 0')

In [None]:
# Parse the 'dob' and 'trans_date_trans_time' columns of the training set into
# pandas datetime values (they are overwritten in place on train_data).
train_data["dob"] = pd.to_datetime(train_data["dob"])
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])

In [None]:
train_data

In [None]:
# Same datetime parsing as for the training set, applied to the test set.
test_data["dob"] = pd.to_datetime(test_data["dob"])
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])

In [None]:
train_data.shape , test_data.shape

In [None]:
train_data.columns , test_data.columns

First, we want to apply some transformations to the dataset to normalize the feature values before encoding the categorical labels.



The next cell counts the number of occurrences of each job title among the rows of the `test_data` DataFrame where the `is_fraud` column is 1. This helps in understanding the distribution of job titles specifically for the fraudulent cases in the dataset.

In [None]:
test_data[test_data["is_fraud"] == 1]["job"].value_counts()

In [None]:
train_data[train_data["is_fraud"] == 1]["merchant"].value_counts()

In [None]:
# encoding test data
"""
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

test_data['merchant']=encoder.fit_transform(test_data['merchant'])
test_data['category']=encoder.fit_transform(test_data['category'])
test_data['street']=encoder.fit_transform(test_data['street'])
test_data['job']=encoder.fit_transform(test_data['job'])
test_data['trans_num']=encoder.fit_transform(test_data['trans_num'])
test_data['first']=encoder.fit_transform(test_data['first'])
test_data['city']=encoder.fit_transform(test_data['city'])
test_data['state']=encoder.fit_transform(test_data['state'])
test_data['last']=encoder.fit_transform(test_data['last'])
test_data['gender']=encoder.fit_transform(test_data['gender'])
test_data['trans_date_trans_time']=encoder.fit_transform(test_data['trans_date_trans_time'])
test_data['dob']=encoder.fit_transform(test_data['dob'])
"""

In [None]:
test_data.head()

Encoding the training dataset and applying normalization to the train and test sets.

In [None]:
train_data.head()


In [None]:
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """One-hot encode the object columns and scale the numeric columns of ``df``.

    Numeric columns (all except ``target_col``) go through ``MinMaxScaler``
    followed by ``StandardScaler``; object-typed columns go through
    ``OneHotEncoder(drop='first')``. Columns of any other dtype are not handed
    to either transformer.

    Args:
        df: Input data containing features and the target column.
        target_col: Name of the label column; it is excluded from scaling and
            appended unchanged to the result.

    Returns:
        A new DataFrame whose columns are named by the fitted transformer's
        ``get_feature_names_out()``, plus ``target_col``.
    """
    # Split the columns by dtype; the target must not be scaled.
    object_columns = df.select_dtypes(include=['object']).columns
    numeric_columns = df.select_dtypes(include=['number']).columns.drop(target_col)

    # Numeric features: min-max normalisation first, then standardisation.
    scaling_pipeline = Pipeline(steps=[
        ('minmax', MinMaxScaler()),
        ('standard', StandardScaler()),
    ])
    # Categorical features: one-hot encoding without the first level.
    one_hot = OneHotEncoder(drop='first')

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', scaling_pipeline, numeric_columns),
            ('cat', one_hot, object_columns),
        ])

    # Fit on df and transform it in one go, then rebuild a labelled DataFrame.
    transformed = column_transformer.fit_transform(df)
    result = pd.DataFrame(transformed, columns=column_transformer.get_feature_names_out())

    # Re-attach the label column by position.
    result[target_col] = df[target_col].values
    return result



In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

def preprocess_data(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Label-encode, expand datetimes and scale the features of ``df``.

    Steps, in order:
      1. Every object-typed column is replaced by the integer codes of a
         ``LabelEncoder`` fitted on that column.
      2. Every datetime column is replaced by six columns named
         ``<column>_year``, ``_month``, ``_day``, ``_hour``, ``_minute`` and
         ``_second``.
      3. All numeric columns except ``target_col`` are passed through
         ``MinMaxScaler`` and then ``StandardScaler``.

    Args:
        df: Input data. It is copied first, so the caller's DataFrame is left
            unchanged.
        target_col: Name of the label column; it is excluded from scaling.

    Returns:
        A new DataFrame with the transformed features and the unscaled
        target column.

    Raises:
        KeyError: If ``target_col`` is not among the numeric columns.
    """
    # Work on a copy: the steps below assign columns, which would otherwise
    # modify the DataFrame owned by the caller.
    df = df.copy()

    le = LabelEncoder()
    standard_scaler = StandardScaler()
    minmax_scaler = MinMaxScaler()

    # Encode categorical columns first to numerical.
    # NOTE(review): the encoder is re-fitted on whatever data is passed in, so
    # separate calls for train and test data produce independent encodings —
    # confirm this is intended.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for column in categorical_cols:
        df[column] = le.fit_transform(df[column])

    # Expand each datetime column into its numeric components.
    datetime_cols = df.select_dtypes(include=['datetime']).columns
    for column in datetime_cols:
        df[column + '_year'] = df[column].dt.year
        df[column + '_month'] = df[column].dt.month
        df[column + '_day'] = df[column].dt.day
        df[column + '_hour'] = df[column].dt.hour
        df[column + '_minute'] = df[column].dt.minute
        df[column + '_second'] = df[column].dt.second

    # The raw datetime columns are no longer needed once expanded.
    df = df.drop(columns=datetime_cols)

    # Everything left is numeric; scale all of it except the target column.
    numerical_cols = df.select_dtypes(include=['number']).columns.drop(target_col)

    # First, apply MinMaxScaler for normalization
    df[numerical_cols] = minmax_scaler.fit_transform(df[numerical_cols])
    # Then, apply StandardScaler for scaling
    df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

    return df


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

def preprocess_data(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Encode categorical columns, expand datetimes and scale the features.

    Steps, in order:
      1. Every object-typed column is replaced by the integer codes of a
         ``LabelEncoder`` fitted on that column.
      2. Every datetime column is replaced by six columns named
         ``<column>_year``, ``_month``, ``_day``, ``_hour``, ``_minute`` and
         ``_second``.
      3. All numeric columns except ``target_col`` are passed through
         ``MinMaxScaler`` and then ``StandardScaler``.

    Args:
        df: Input data. It is copied first, so the caller's DataFrame is left
            unchanged.
        target_col: Name of the label column; it is excluded from scaling.

    Returns:
        A new DataFrame with the transformed features and the unscaled
        target column.

    Raises:
        KeyError: If ``target_col`` is not among the numeric columns.
    """
    # Work on a copy: the steps below assign columns, which would otherwise
    # modify the DataFrame owned by the caller.
    df = df.copy()

    le = LabelEncoder()
    standard_scaler = StandardScaler()
    minmax_scaler = MinMaxScaler()

    # Encode categorical columns first to numerical by selecting the columns.
    # NOTE(review): the encoder is re-fitted on whatever data is passed in, so
    # separate calls for train and test data produce independent encodings —
    # confirm this is intended.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for column in categorical_cols:
        df[column] = le.fit_transform(df[column])

    # Expand each datetime column into its numeric components.
    datetime_parts = ('year', 'month', 'day', 'hour', 'minute', 'second')
    datetime_cols = df.select_dtypes(include=['datetime']).columns
    for column in datetime_cols:
        for part in datetime_parts:
            df[column + '_' + part] = getattr(df[column].dt, part)

    # The raw datetime columns are no longer needed once expanded.
    df = df.drop(columns=datetime_cols)

    # Select the numeric feature columns *by name*, leaving out the target.
    # (Indexing with the DataFrame returned by df.drop(...) is not a column
    # selection and breaks the scaling step.)
    numerical_cols = df.select_dtypes(include=['number']).columns.drop(target_col)

    # First, apply MinMaxScaler for normalization and then, apply StandardScaler for scaling
    df[numerical_cols] = minmax_scaler.fit_transform(df[numerical_cols])
    df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

    return df

In [None]:
train_set = preprocess_data(train_data, "is_fraud")
#train_set.head(10)
train_set["is_fraud"].value_counts()

In [None]:
test_set = preprocess_data(test_data_copy, "is_fraud")
test_set.head(10)

In [None]:
train_set.shape, test_set.shape

In [None]:
train_set.dtypes

In [None]:
train_set.describe()

In [None]:
test_set.describe()

In [None]:
def corr(df: pd.DataFrame, column: str) -> None:
    """Plot an annotated heatmap of the pairwise correlations in ``df``.

    Args:
        df: DataFrame whose numeric columns are correlated with each other.
        column: Currently unused; kept so existing calls such as
            ``corr(train_data, "is_fraud")`` keep working.
    """
    # Correlate the DataFrame that was passed in rather than the global
    # ``train_data``, and only its numeric columns: text and datetime columns
    # have no meaningful correlation coefficient.
    numeric_df = df.select_dtypes(include=['number'])
    plt.figure(figsize=(10, 10))
    sns.heatmap(numeric_df.corr(), cmap="Reds", annot=True, fmt=".1f")

In [None]:
#corr(train_data, "is_fraud")

In [None]:
def splitting_set(training_set: pd.DataFrame, testing_set: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Separate features and the ``is_fraud`` target for both data sets.

    Args:
        training_set: Training data including the ``is_fraud`` column.
        testing_set: Test data including the ``is_fraud`` column.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``X`` values are
        DataFrames without ``is_fraud`` and the ``y`` values are the
        ``is_fraud`` Series. The inputs are not modified.

    Raises:
        ValueError: If ``is_fraud`` is missing from either input.
    """
    target = 'is_fraud'

    # Validate both inputs up front so the error names the offending set.
    for label, frame in (("training_set", training_set), ("testing_set", testing_set)):
        if target not in frame.columns:
            raise ValueError(f"'{target}' column missing in {label}")

    # Split features and target
    X_train = training_set.drop(columns=target)
    y_train = training_set[target]

    X_test = testing_set.drop(columns=target)
    y_test = testing_set[target]

    # Print shapes for verification
    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)

    return X_train, y_train, X_test, y_test




In [None]:
X_train, y_train, X_test, y_test = splitting_set(train_set, test_set)

In [None]:
print(np.unique(y_test))

In [None]:
test_set.columns.value_counts().sum()

In [None]:
print("Train columns:", train_set.columns)
print("Test columns:", test_set.columns)

To convert a Pandas DataFrame to a NumPy array, you can use the values attribute. This attribute returns a NumPy array containing the underlying data of the DataFrame. When you convert a Pandas DataFrame to a NumPy array, the rows and columns become rows and columns in the NumPy array.

The structure remains the same, but the data type changes from a Pandas Series (for columns) to a NumPy array (for the entire DataFrame). This can be beneficial for certain operations that are more efficient in NumPy, such as numerical computations.


It is good practice to convert your train and test datasets to NumPy arrays before training the model with them.


NumPy arrays are optimized for numerical operations, making them more efficient for training machine learning models compared to Pandas DataFrames.
Many machine learning libraries, such as scikit-learn, expect the data to be in the form of NumPy arrays.
Using NumPy arrays ensures consistency in your data format, making it easier to manage and analyze your data.



In [None]:
#splitting train and test dataset and convert to a numpy array.

#def splitting_set(training_set: pd.DataFrame, testing_set: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:

  #X_train = training_set.loc[:, training_set.columns != 'is_fraud'].values
  #y_train = training_set.loc[:, 'is_fraud'].values

  #X_test = testing_set.loc[:, testing_set.columns != 'is_fraud'].values
  #y_test = testing_set.loc[:, 'is_fraud'].values

  #print(X_train.shape, y_train.shape)
  #print(X_test.shape, y_test.shape)

  #return X_train, y_train, X_test, y_test

In [None]:
#X_train, y_train, X_test, y_test = splitting_set(train_set, test_set)

In [None]:
#print(X_train.shape, y_train.shape)
#print(X_test.shape, y_test.shape)

In [None]:
#model building and training

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score


# Define all candidate classifiers. Each one is created with no arguments,
# i.e. with the library's default hyperparameters, and is not fitted yet.
logistic_regression = LogisticRegression()
random_forest = RandomForestClassifier()
decision_tree = DecisionTreeClassifier()
svc = SVC()
knn = KNeighborsClassifier()
naive_bayes = GaussianNB()
gradient_boosting = GradientBoostingClassifier()


In NumPy, the reshape function is used to change the shape of an array without changing its data. The arguments (-1, 1) and (1, -1) specify how the array should be reshaped. Here’s a detailed explanation of each:

reshape(-1, 1)
-1: This is a special placeholder used in NumPy’s reshape method. It tells NumPy to automatically determine the size of this dimension based on the size of the array and the remaining dimensions.
1: This specifies that the resulting shape should have a single column.


Explanation: For example, given a 1D array with 6 elements, the -1 tells NumPy to infer the number of rows from the total number of elements (6) and the specified number of columns (1). So, the resulting shape is (6, 1).

reshape(1, -1)
1: This specifies that the resulting shape should have a single row.
-1: This tells NumPy to automatically determine the size of this dimension based on the size of the array and the remaining dimensions.


Explanation: For example, given a 1D array with 6 elements, the -1 tells NumPy to infer the number of columns from the total number of elements (6) and the specified number of rows (1). So, the resulting shape is (1, 6).

Summary
reshape(-1, 1) converts a 1D array into a 2D array with one column and as many rows as needed.
reshape(1, -1) converts a 1D array into a 2D array with one row and as many columns as needed.
The -1 in the reshape function is useful for automatically calculating dimensions when you only need to specify one of the dimensions, making it easier to reshape arrays without manually calculating the required sizes.

We want to track the CPU, GPU, and memory usage of the algorithm using a decorator, and then use MLflow to track the logs.

Using different models for training and evaluating the model.

In [None]:
def train_model(model, Xtrain: np.ndarray, ytrain: np.ndarray):
    """Fit ``model`` on the training data and return it.

    Args:
        model: Any estimator exposing ``fit(X, y)``.
        Xtrain: Training features.
        ytrain: Training labels.

    Returns:
        The same ``model`` object, after ``fit`` has been called on it, so
        the call can be used in an assignment or chained.
    """
    # Train the model
    model.fit(Xtrain, ytrain)
    return model


In [None]:
def predict(model, Xtest: np.ndarray) -> np.ndarray:
    """Return the predictions of ``model`` for ``Xtest``.

    Args:
        model: A fitted estimator exposing ``predict(X)``.
        Xtest: Features to predict on.

    Returns:
        Whatever ``model.predict(Xtest)`` returns (one prediction per row
        for scikit-learn style estimators).
    """
    return model.predict(Xtest)

In [None]:
def evaluate_model(model, y_test: np.ndarray, y_pred: np.ndarray):
    """Print classification metrics, plot the confusion matrix, return the metrics.

    Args:
        model: The evaluated estimator; only its class name is used, for the
            printed header and the plot title.
        y_test: True labels.
        y_pred: Predicted labels for the same rows.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` and ``confusion_matrix`` so callers can store or compare
        results instead of only reading the printed output.
    """
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        # NOTE(review): this is computed from hard class predictions, not from
        # predicted probabilities/scores — confirm that is the intended AUC.
        "roc_auc": roc_auc_score(y_test, y_pred),
    }
    model_name = model.__class__.__name__

    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print(f"ROC AUC Score: {metrics['roc_auc']:.4f}")

    # Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
    cm = confusion_matrix(y_test, y_pred)
    metrics["confusion_matrix"] = cm
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    return metrics



In [None]:
def main(model, Xtrain, ytrain, Xtest, ytest):
    """Train ``model``, predict on the test features and evaluate the result.

    Args:
        model: Estimator to fit; it is fitted in place.
        Xtrain: Training features.
        ytrain: Training labels.
        Xtest: Test features.
        ytest: True test labels.

    Returns:
        A ``(trained_model, y_pred, model_performance)`` tuple, where
        ``trained_model`` is the fitted ``model`` object, ``y_pred`` are the
        test predictions and ``model_performance`` is whatever
        ``evaluate_model`` returns.
    """
    train_model(model, Xtrain, ytrain)
    y_pred = predict(model, Xtest)
    model_performance = evaluate_model(model, ytest, y_pred)
    # Callers unpack three values, so all three must be returned.
    return model, y_pred, model_performance

In [None]:
trained_model, y_pred, model_performance  = main(logistic_regression, X_train, y_train, X_test, y_test)

In [None]:
import pickle
#save model as pickle file

def save_model(model, model_name):
    """Serialize ``model`` with pickle into the file ``model_name``.

    Args:
        model: Any picklable object.
        model_name: Path of the file to write; an existing file is
            overwritten.
    """
    with open(model_name, 'wb') as sink:
        pickle.Pickler(sink).dump(model)


In [None]:
#load model

def load_model(model_name):
    """Read a pickled object back from the file ``model_name``.

    Args:
        model_name: Path of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(model_name, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
# Function for Decision Tree
#def decision_tree_model(Xtrain, ytrain, Xtest):
    #train_model(decision_tree, Xtrain, ytrain, Xtest)

In [None]:
#decision_tree_model(Xtrain, ytrain, Xtest)

In [None]:
#decision_tree_model(Xtrain, ytrain, Xtest, ytest)

In [None]:

#