In [None]:
import os
import sys

# Make the course's helper package importable before importing from it.
my_path = "/kaggle/input/mlsecu" # path to utils folder
sys.path.append(my_path)
from mlsecu.anomaly_detection_use_case import *
from mlsecu.data_exploration_utils import *
from mlsecu.data_preparation_utils import *

# Explicit third-party imports stay after the star imports so these bindings
# take precedence over any same-named objects the star imports may bring in.
import matplotlib.pyplot as plt
import numpy as np  # `np` is used further down; import it explicitly instead of relying on a star import
import pandas as pd
from sklearn.ensemble import IsolationForest

### Loading Dataset

In [None]:
# Location of the training transactions CSV inside the Kaggle input directory.
TRAIN_TRANSACTION_FILE_PATH = os.path.join(
    '/kaggle/input/ieee-fraud-detection',
    'train_transaction.csv',
)

In [None]:
# Load the full training transactions CSV into a DataFrame.
transaction_df = pd.read_csv(TRAIN_TRANSACTION_FILE_PATH)

In [None]:
# Preview the first rows of the loaded transactions.
transaction_df.head()

In [None]:
# Take the first 9700 non-fraudulent and the first 300 fraudulent rows, in file order.
non_fraud_df = transaction_df.loc[transaction_df['isFraud'] == 0].head(9700)
fraud_df = transaction_df.loc[transaction_df['isFraud'] == 1].head(300)

In [None]:
# Show the size of the non-fraud sample, then preview its first rows.
print(f'{non_fraud_df.shape = }')
non_fraud_df.head()

In [None]:
# Show the size of the fraud sample, then preview its first rows.
print(f'{fraud_df.shape = }')
fraud_df.head()

In [None]:
# Stack the two samples (non-fraud rows first, then fraud rows). ignore_index=True
# renumbers the rows 0..n-1, so row labels and row positions coincide afterwards.
reduced_transaction_df = pd.concat([non_fraud_df, fraud_df], ignore_index=True)
print(f'{reduced_transaction_df.shape = }')
reduced_transaction_df.head()

In [None]:
# Number of columns in the reduced dataset, followed by a short separator line.
column_number = reduced_transaction_df.shape[1]
print(f'{column_number=}')
print('-' * 5)

In [None]:
# List every column together with its dtype, tab-separated on a single line.
column_names = reduced_transaction_df.columns.tolist()
print("column_names")
for col, types in reduced_transaction_df.dtypes.items():
    print(col, 'has type', types, end="\t")
    

In [None]:
# Number of rows in the reduced dataset.
dataset_length = reduced_transaction_df.shape[0]
print(f'{dataset_length=}')

7. Control the correct extraction of transactions: number of non-fraudulent transactions, number of fraudulent transactions, rate of fraudulent transactions

In [None]:
# Count each class by summing the boolean masks, then derive the fraud share.
non_fradulent_number = int((reduced_transaction_df['isFraud'] == 0).sum())
fradulent_number = int((reduced_transaction_df['isFraud'] == 1).sum())
fraudulent_rate = fradulent_number / (non_fradulent_number + fradulent_number)
print(
    f'Number of  non-frauds {non_fradulent_number}',
    f'Number of frauds {fradulent_number}',
    f'Fraud rate {fraudulent_rate}',
    sep='\n',
)

8. Which columns are categories? List them (show all column names); extract and show existing values.

In [None]:
# Hand-picked categorical columns, grouped by family for readability.
categorical_columns = (
    ['ProductCD']
    + ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
    + ['addr1', 'addr2']
    + ['P_emaildomain', 'R_emaildomain']
    + ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']
)
print(f'{categorical_columns  =  }')

9. Which columns are numeric? List them (show all column names); extract and show min, max, mean, median and standard deviation values.


In [None]:
def is_num(type_):
    """Return True when `type_` is a numeric dtype according to pandas."""
    return pd.api.types.is_numeric_dtype(type_)


# Keep the columns whose dtype is numeric, preserving the original column order.
numeric_columns = [
    col
    for col in column_names
    if is_num(reduced_transaction_df[col].dtypes)
]

print("Numeric columns are", numeric_columns)

10. For each column, print the rate of undefined values (NaN for numeric)

In [None]:
# Share of missing entries per column: the mean of the boolean NaN mask.
undefined_rates = reduced_transaction_df.isna().mean()

# Print each column name next to its missing-value rate, tab-separated.
for col, rate in undefined_rates.items():
    print(col, rate, end='\t')

11. For each numerical column, print the rate of zero (0) value

In [None]:
# The question asks for numerical columns only, so restrict the comparison to
# numeric dtypes instead of comparing every column (including text ones) to 0.
# NaN cells compare unequal to 0 and therefore count as "not zero".
zero_rates = (reduced_transaction_df.select_dtypes(include='number') == 0).mean()
for col, rate in zero_rates.items():
    print(col, rate, end='\t')


## DataSet visualisation

12. Visualise the dataset using dimensions: 'TransactionAmt', 'card1','addr1'

In [None]:

def show3D_transation_data(transac_dataset, x_axis_name, y_axis_name, z_axis_name,title='Data visualisation',color='blue',label=None):
    """Show a 3D scatter plot of three columns of a transaction dataset.

    Parameters
    ----------
    transac_dataset
        Object indexable by column name (used here with a pandas DataFrame).
    x_axis_name, y_axis_name, z_axis_name
        Names of the columns plotted on the x, y and z axes; also used as
        the axis labels.
    title
        Title of the plot.
    color
        Colour passed to the scatter call for every point.
    label
        Legend entry for the plotted points. When None, no legend is drawn.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(10, 10))
    fig.set_facecolor('white')
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    # Use scatter to draw the 3D point cloud.
    ax.scatter(
        transac_dataset[x_axis_name],
        transac_dataset[y_axis_name],
        transac_dataset[z_axis_name],
        color=color,
        label=label,
    )

    # Set the axis labels.
    ax.set_xlabel(x_axis_name)
    ax.set_ylabel(y_axis_name)
    ax.set_zlabel(z_axis_name)

    # Only request a legend when the points carry a label: calling legend()
    # with no labelled artist makes matplotlib emit a warning.
    if label is not None:
        ax.legend()

    ax.set_title(title)
    plt.tight_layout()
    plt.show()
# Plot every transaction of the reduced dataset along amount / card1 / addr1.
show3D_transation_data(reduced_transaction_df,'TransactionAmt', 'card1','addr1')

13. Create an alternate visualisation function for visualising fraud entries only, in red

In [None]:
def show3D_transation_data_fraud_only(transac_dataset, x_axis_name, y_axis_name, z_axis_name):
    """Show a red 3D scatter plot of the fraudulent rows only.

    Rows are selected with ``transac_dataset['isFraud'] == 1`` and plotted on
    the three requested columns (previously the axis arguments were ignored
    and the column names were hard-coded).

    Parameters
    ----------
    transac_dataset
        DataFrame containing an 'isFraud' column and the three plotted columns.
    x_axis_name, y_axis_name, z_axis_name
        Names of the columns plotted on the x, y and z axes.
    """
    fraud_rows = transac_dataset[transac_dataset['isFraud'] == 1]
    # A label is passed so the legend drawn by the base function is not empty.
    show3D_transation_data(
        fraud_rows,
        x_axis_name,
        y_axis_name,
        z_axis_name,
        title='Frauds',
        color='red',
        label='fraud',
    )
    
# Plot only the fraudulent transactions along amount / card1 / addr1.
show3D_transation_data_fraud_only(reduced_transaction_df,'TransactionAmt', 'card1','addr1')

## Data Cleaning

14. Perform one-hot encoding of categorical data

In [None]:
# (rows, columns) of the reduced dataset before any encoding.
reduced_transaction_df.shape

In [None]:
# Reminder of the columns that will be one-hot encoded.
categorical_columns

In [None]:
# Compare the shape of the categorical subset before and after one-hot encoding.
print(f'{reduced_transaction_df[categorical_columns].shape = }')
# get_one_hot_encoded_dataframe is not defined in this notebook; presumably it is
# provided by the mlsecu star imports and returns a DataFrame — TODO confirm.
one_hot_df = get_one_hot_encoded_dataframe(reduced_transaction_df[categorical_columns]) 
print(f'{one_hot_df.shape = }')
one_hot_df.head()

15. Remove NaN (Not a number) values by imputation of the mean of the column
- Do not forget to transform the resulting data structure in Pandas Dataframe again
- Do not forget to keep the column names for the Dataframe

In [None]:
# remove_nan_through_mean_imputation is not defined in this notebook; presumably it
# comes from the mlsecu star imports and returns a DataFrame that keeps the
# column names — TODO confirm against the helper.
transformed_df = remove_nan_through_mean_imputation(one_hot_df)
print(f'{transformed_df.shape = }')
transformed_df.head()

In [None]:
def generate_transformed_df(dataframe: pd.DataFrame,selected_cols:list[str]):
    """One-hot encode the selected columns of `dataframe`, then impute NaN values.

    Parameters
    ----------
    dataframe
        Source DataFrame. It is now actually used; previously the function
        ignored it and always read the global ``reduced_transaction_df``.
    selected_cols
        Names of the columns of `dataframe` to encode.

    Returns
    -------
    The result of ``remove_nan_through_mean_imputation`` applied to the
    one-hot encoded selection.
    """
    one_hot_df = get_one_hot_encoded_dataframe(dataframe[selected_cols])
    transformed_df = remove_nan_through_mean_imputation(one_hot_df)
    return transformed_df

16. Control that no NaN value remains in the dataframe

In [None]:
# Count the remaining missing cells so that a failed check is reported too,
# instead of the cell silently printing nothing.
remaining_nan_count = int(transformed_df.isna().sum().sum())
if remaining_nan_count == 0:
    print("No more Nan value in the data frame")
else:
    print(f"{remaining_nan_count} NaN value(s) remain in the data frame")

## Outlier detection

17. Extract outliers using sklearn.ensemble.IsolationForest, using an outliers_fraction = 0.03. Control the number of outliers.
*Tips:*
- clf.predict() returns outliers marked as -1
- Add an additional column ‘outliers’ to your dataframe as follows to store outlier status:
- How many outliers found in unsupervised manner are labelled as fraudulent transactions (`isFraud`) ?
- Print fraudulent outliers. What do you observe?

How many outliers found in unsupervised manner are labelled as fraudulent transactions (`isFraud`) ?

In [None]:
# Exclude the flag columns written by the detection cells, so that re-running this
# cell does not feed earlier detection results back into the detector.
# NOTE(review): 'isFraud' is still part of the detector input — confirm whether
# the helper is expected to see the label.
if_input_df = reduced_transaction_df.drop(columns=['outliers', 'lof_outliers'], errors='ignore')
outliers_indexes = get_list_of_if_outliers(if_input_df, outlier_fraction=0.03)

# 1 marks an inlier, -1 an outlier.
# Assumes the helper returns positional row indexes — TODO confirm.
outliers = np.ones(len(reduced_transaction_df))
outliers[outliers_indexes] = -1
reduced_transaction_df['outliers'] = outliers

In [None]:
# Rows flagged by IsolationForest that are also labelled as fraud.
fraudulent_outliers = reduced_transaction_df.loc[
    reduced_transaction_df['outliers'].eq(-1)
    & reduced_transaction_df['isFraud'].eq(1)
]
outliers_matches = fraudulent_outliers.shape[0]
print(f'{outliers_matches} outliers found in unsupervised manner are labelled as fraudulent transactions')

- Print fraudulent outliers. What do you observe?

In [None]:
# Display the rows that are both IsolationForest outliers and labelled frauds.
fraudulent_outliers

We can observe that fraudulent outliers are not well detected.

18. Create yet another visualisation function for visualising IsolationForest outliers entries only, in red
`show3D_transation_data_if_outliers_only(transac_dataset, x_axis_name, y_axis_name,z_axis_name)`

In [None]:
def show3D_transation_data_if_outliers_only(transac_dataset, x_axis_name, y_axis_name,z_axis_name):
    """Show a red 3D scatter plot titled 'Isolation forest outliers'.

    The function does not filter rows itself: every row of `transac_dataset`
    is plotted, so the caller passes the already-selected outlier rows.

    Parameters
    ----------
    transac_dataset
        Rows to plot (used here with a DataFrame of IsolationForest outliers).
    x_axis_name, y_axis_name, z_axis_name
        Names of the columns plotted on the x, y and z axes.
    """
    # A label is passed, as in the LOF variant, so the legend is not empty.
    show3D_transation_data(
        transac_dataset,
        x_axis_name,
        y_axis_name,
        z_axis_name,
        title='Isolation forest outliers',
        color='red',
        label='IF outlier',
    )
# Keep only the rows flagged by IsolationForest and plot them.
sub_df = reduced_transaction_df.loc[reduced_transaction_df['outliers'].eq(-1)]
show3D_transation_data_if_outliers_only(
    sub_df,
    'TransactionAmt',
    'card1',
    'addr1',
)

## Local Outlier Factor (LOF)

19. Extract outliers using sklearn.neighbors.LocalOutlierFactor, using an outliers_fraction = 0.03. Control the number of outliers.


`clf.predict()` returns outliers marked as -1
- Add an additional column ‘lof_outliers’ to your dataframe as follows to store outlier status:
- How many outliers found in unsupervised manner are labelled as fraudulent transactions ('isFraud') ?

In [None]:
# Exclude the flag columns written by the detection cells: otherwise the
# IsolationForest result ('outliers') would be passed to LOF as a feature, and
# re-running this cell would feed its own previous output back in.
lof_input_df = reduced_transaction_df.drop(columns=['outliers', 'lof_outliers'], errors='ignore')
outliers_indexes = get_list_of_lof_outliers(lof_input_df, outlier_fraction=0.03)

# Build the flag array first and assign the whole column in one step. The former
# chained assignment (df['col'][idx] = -1) triggers SettingWithCopyWarning and
# does not modify the frame when pandas copy-on-write is enabled.
# 1 marks an inlier, -1 an outlier.
# Assumes the helper returns positional row indexes — TODO confirm.
lof_outliers = np.ones(len(reduced_transaction_df))
lof_outliers[outliers_indexes] = -1
reduced_transaction_df['lof_outliers'] = lof_outliers

In [None]:
# Rows flagged by LOF that are also labelled as fraud.
lof_outliers_matches = len(reduced_transaction_df[(reduced_transaction_df['lof_outliers']== -1) & (reduced_transaction_df['isFraud']== 1)])
# Report the LOF count; the previous version printed the IsolationForest count
# (outliers_matches) by mistake.
print(f'{lof_outliers_matches} lof outliers found in unsupervised manner are labelled as fraudulent transactions')

20. Create yet another visualisation function for visualising LOF outliers entries only, in red `show3D_transation_data_lof_outliers_only(transac_dataset, x_axis_name, y_axis_name,z_axis_name)`



In [None]:
def show3D_transation_data_lof_outliers_only(transac_dataset, x_axis_name, y_axis_name,z_axis_name):
    """Show a red 3D scatter plot titled 'LOF outliers' for the given rows.

    No filtering happens here: every row of `transac_dataset` is plotted, so
    the caller passes the already-selected outlier rows. The return value is
    whatever ``show3D_transation_data`` returns.
    """
    plot_options = dict(title='LOF outliers', color='red', label='lof outlier')
    return show3D_transation_data(
        transac_dataset,
        x_axis_name,
        y_axis_name,
        z_axis_name,
        **plot_options,
    )
# Keep only the rows flagged by LOF and plot them.
sub_df = reduced_transaction_df.loc[reduced_transaction_df['lof_outliers'].eq(-1)]
show3D_transation_data_lof_outliers_only(
    sub_df,
    'TransactionAmt',
    'card1',
    'addr1',
)

21. Control the complementarity between the 2 algorithms <br>
    21.1. How many outliers are common to IsolationForest and LOF? <br>
    21.2. What do you deduce for building unsupervised outlier detectors?

In [None]:
# Rows flagged as outliers by both LOF and IsolationForest.
outliers_mask = (reduced_transaction_df['lof_outliers'] == -1) & (reduced_transaction_df['outliers'] == -1)
# Summing a boolean mask counts its True entries, so no intermediate frame is built.
common_outliers = int(outliers_mask.sum())
common_fraud_outliers = int((outliers_mask & (reduced_transaction_df['isFraud'] == 1)).sum())
# Message typo fixed ("Ther are" -> "There are").
print(f'There are {common_outliers} common outliers between LOF and IF and {common_fraud_outliers} of those are real frauds')