# File Flattening

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from scripts.csvtopaquet import csv_to_parquet_single_file

# Build the CSV input and parquet output locations relative to the directory
# the notebook is executed from.
current_directory = os.getcwd()
csv4_file_path = os.path.join(current_directory, '01-data', 'FD_creditcard_data.csv')
# This second path previously re-assigned csv4_file_path, which discarded the
# credit-card CSV path and left csv5_file_path (used by the conversion call
# below) undefined.
csv5_file_path = os.path.join(current_directory, '01-data', 'FD_02_apl_train.csv')
output_file_path4 = os.path.join(current_directory, '01-data', 'FD_creditcard_data.parquet')
output_file_path5 = os.path.join(current_directory, '01-data', 'FD_02_apl_train.parquet')

# The CSV -> parquet conversion is disabled by wrapping the calls in a string
# literal. Because the literal is the last expression of the cell, it is echoed
# as the cell output. Remove the surrounding quotes to regenerate the parquet files.
'''
csv_to_parquet_single_file(csv_file_path=csv4_file_path, output_file_path=output_file_path4, chunksize=100000, sample_rows=None, drop_columns=None)
csv_to_parquet_single_file(csv_file_path=csv5_file_path, output_file_path=output_file_path5, chunksize=100000, sample_rows=None, drop_columns=None)
'''

'\ncsv_to_parquet_single_file(csv_file_path=csv4_file_path, output_file_path=output_file_path4, chunksize=100000, sample_rows=None, drop_columns=None)\ncsv_to_parquet_single_file(csv_file_path=csv5_file_path, output_file_path=output_file_path5, chunksize=100000, sample_rows=None, drop_columns=None)\n'

# Loading Data

In [3]:
import os
import sys
import pandas as pd
import numpy as np

# Resolve both parquet inputs relative to the directory the notebook runs from.
current_directory = os.getcwd()
parquetFile4, parquetFile5 = (
    os.path.join(current_directory, '01-data', parquet_name)
    for parquet_name in ('FD_creditcard_data.parquet', 'FD_02_apl_train.parquet')
)

# df4 is read from the credit-card parquet file, df5 from the FD_02_apl_train one.
df4 = pd.read_parquet(parquetFile4)
df5 = pd.read_parquet(parquetFile5)

# Model comparisons (FD_creditcard_data)

In [4]:
from scripts.models import logistic_regression_model, train_and_evaluate_decision_tree,train_and_evaluate_random_forest,train_and_evaluate_knn,train_and_evaluate_gaussian_nb
# Run every candidate classifier on the credit-card frame (df4), predicting
# 'Class' with the 'id' column dropped. Each return value is bound to `_`, so
# the cell itself echoes nothing; the accuracy lines in the output presumably
# come from the helpers themselves (confirm in scripts.models).
_ = logistic_regression_model(
    df4,
    target_column='Class',
    drop_columns=['id'],
    add_constant=True,
    return_type='Summary',
)

# Keyword options passed identically to the two tree-based helpers.
credit_tree_options = dict(
    test_size=0.3,
    random_state=42,
    return_accuracy_only=True,
    top_n_features=20,
)
_ = train_and_evaluate_decision_tree(df4, 'Class', ['id'], **credit_tree_options)
_ = train_and_evaluate_random_forest(df4, 'Class', ['id'], **credit_tree_options)

_ = train_and_evaluate_knn(
    df4, 'Class', ['id'],
    n_neighbors=5,
    return_accuracy_only=True,
)
_ = train_and_evaluate_gaussian_nb(
    df4, 'Class', ['id'],
    return_accuracy_only=True,
)

  from .autonotebook import tqdm as notebook_tqdm


Decision Tree Accuracy: 99.80%
Random Forest Accuracy: 99.93%
KNN Accuracy: 99.67%
Gaussian Naive Bayes Accuracy: 98.28%


**Insights**:
- The comparison between different models for the `FD_creditcard_data` dataset shows a clear distinction in model performance, with Random Forest achieving the highest accuracy at 99.93%, followed closely by Decision Tree and KNN models. The Gaussian Naive Bayes model, however, lags behind significantly with an accuracy of 98.28%.
- This performance disparity suggests that ensemble methods like Random Forest are more adept at handling the complexities and nuances of the credit card fraud detection dataset, likely due to their ability to model non-linear relationships and interactions between variables more effectively than simpler models like Gaussian Naive Bayes.
- The logistic regression details further substantiate the importance of feature selection and the impact of different variables on predicting fraudulent transactions. The variables with significant p-values in the summary table below (P>|z| below 0.05) indicate a stronger relationship with the outcome (Class), highlighting the necessity for careful feature engineering and selection in improving model accuracy.



### Logistic Regression details

In [5]:
import pandas as pd
from scripts.models import logistic_regression_model
# Fit the logistic regression on df4 (target 'Class', 'id' dropped, constant
# term added) and keep the requested 'Summary' result for printing.
summary = logistic_regression_model(
    df4,
    target_column='Class',
    drop_columns=['id'],
    add_constant=True,
    return_type='Summary',
)
print(summary)

                           Logit Regression Results                           
Dep. Variable:                  Class   No. Observations:                 5050
Model:                          Logit   Df Residuals:                     5020
Method:                           MLE   Df Model:                           29
Date:                Sun, 31 Mar 2024   Pseudo R-squ.:                  0.8852
Time:                        15:14:52   Log-Likelihood:                -32.196
converged:                       True   LL-Null:                       -280.51
Covariance Type:            nonrobust   LLR p-value:                 1.421e-86
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const           -10.4734      2.538     -4.127      0.000     -15.448      -5.499
age              -0.0421      0.313     -0.134      0.893      -0.656       0.572
maritalstatus    -1.1720      0.623     

# Numeric and Dummy Variables dataset (FD_02_apl_train)


## Clean Data

In [6]:
from scripts.dataclean import preprocess_for_lightgbm
# Clean the FD_02_apl_train frame (df5). The second argument names the
# 'SK_ID_CURR' column; how the helper treats it is defined in
# scripts.dataclean (presumably as the row identifier -- confirm there).
processed_data = preprocess_for_lightgbm(
    df5,
    'SK_ID_CURR',
)

Sample of the processed data saved to: /Users/danramirez/mbs-fraud-detection/02-output/testmerge.csv


## Model comparisons

In [7]:
from scripts.models import train_and_evaluate_decision_tree,train_and_evaluate_random_forest,train_and_evaluate_knn,train_and_evaluate_gaussian_nb
# Two blank lines to separate the results from anything printed earlier.
print('')
print('')

# Run the four classifiers on the cleaned application data, predicting 'TARGET'
# with no extra columns dropped. Return values are bound to `_` so the cell
# echoes nothing itself.
apl_tree_options = dict(
    test_size=0.3,
    random_state=42,
    return_accuracy_only=True,
    top_n_features=20,
)
_ = train_and_evaluate_decision_tree(processed_data, 'TARGET', [], **apl_tree_options)
_ = train_and_evaluate_random_forest(processed_data, 'TARGET', [], **apl_tree_options)
_ = train_and_evaluate_knn(
    processed_data, 'TARGET', [],
    n_neighbors=5,
    return_accuracy_only=True,
)
_ = train_and_evaluate_gaussian_nb(
    processed_data, 'TARGET', [],
    return_accuracy_only=True,
)



Decision Tree Accuracy: 85.20%


## Model Details review

- The `FD_02_apl_train` dataset underwent several model comparisons, showcasing diverse performance across various algorithms. Notably, Random Forest emerged as the leading model with an accuracy of 91.97%, closely followed by Gaussian Naive Bayes and KNN, both demonstrating high accuracy levels above 91%. This indicates a strong ability of these models to generalize well over the dataset.
- The decision tree model displayed a comparatively lower accuracy of 85.20%. This divergence in performance may highlight the decision tree's sensitivity to the specific characteristics of the dataset, such as its feature distribution or the presence of complex, non-linear relationships that ensemble methods like Random Forest can better capture.
- A deeper analysis into the classification reports reveals critical insights. For instance, the decision tree model's classification report illustrates its capacity to achieve high precision and recall for the majority class but faces challenges with the minority class. This pattern suggests a potential overfitting to the majority class or a need for more nuanced feature engineering to improve minority class predictions.
- Random Forest's classification report exhibits high precision for the majority class but a total inability to predict the minority class correctly, as indicated by a recall of 0. This could point to the model's overemphasis on the majority class, likely due to class imbalance. It underscores the importance of employing techniques like class weight adjustment, oversampling, or undersampling to enhance the model's sensitivity towards the minority class.
- The comparative analysis also underlines the significance of model evaluation beyond mere accuracy metrics. It highlights the need to consider a model's performance in terms of precision, recall, and the f1-score to ensure a balanced predictive capability across all classes, which is crucial for applications like fraud detection where the minority class (fraudulent transactions) is of particular interest.
- These insights suggest that while Random Forest offers the highest accuracy for the `FD_02_apl_train` dataset, there remains room for improvement, especially in handling class imbalance and enhancing minority class prediction. Future efforts could explore more sophisticated ensemble techniques, advanced feature engineering, and fine-tuning of model parameters to achieve a more equitable performance across classes.

These insights are critical for understanding the strengths and limitations of various models applied to the `FD_02_apl_train` dataset, guiding future modeling efforts towards achieving not only high accuracy but also balanced precision and recall across classes.


### Decision Tree

In [None]:
from scripts.models import logistic_regression_model,train_and_evaluate_decision_tree,train_and_evaluate_random_forest,train_and_evaluate_knn,train_and_evaluate_gaussian_nb

# Decision tree on the cleaned application data with return_accuracy_only
# switched off, i.e. asking the helper for more than the bare accuracy figure.
# The call is the cell's last expression, so its return value is displayed.
train_and_evaluate_decision_tree(
    processed_data,
    target_column='TARGET',
    drop_columns=[],
    return_accuracy_only=False,
)

### Random Forest

In [None]:
from scripts.models import logistic_regression_model,train_and_evaluate_decision_tree,train_and_evaluate_random_forest,train_and_evaluate_knn,train_and_evaluate_gaussian_nb
# Random forest on the cleaned application data with return_accuracy_only
# switched off, i.e. asking the helper for more than the bare accuracy figure.
# The call is the cell's last expression, so its return value is displayed.
train_and_evaluate_random_forest(
    processed_data,
    target_column='TARGET',
    drop_columns=[],
    return_accuracy_only=False,
)


### KNN and Gaussian Naive Bayes

In [None]:
# KNN (5 neighbours) followed by Gaussian Naive Bayes on the cleaned
# application data, both with return_accuracy_only switched off. Only the
# second call is the cell's last expression, so only its return value is
# displayed by the notebook.
train_and_evaluate_knn(
    processed_data, 'TARGET', [],
    n_neighbors=5,
    return_accuracy_only=False,
)
train_and_evaluate_gaussian_nb(
    processed_data, 'TARGET', [],
    return_accuracy_only=False,
)