# Day 2 
[![Author - DanRamirez](https://img.shields.io/badge/Author-DanRamirez-2ea44f?style=for-the-badge)](https://github.com/Dandata0101)
![Python - Version](https://img.shields.io/badge/PYTHON-3.11-red?style=for-the-badge&logo=python&logoColor=white)
[![Project Repo](https://img.shields.io/badge/Our_Project_Repo-Visit-blue?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Dandata0101/mbs-fraud-detection)

I used LightGBM to identify the features with the greatest impact on the **TARGET** variable in the training and test application files. Before producing any analysis, I converted the CSV files to Parquet files, which reduced the file sizes by more than half.

#  flattening file sizes

In [None]:
# select at least 3 variables for this analysis

import os
import sys
from scripts.csvtopaquet import csv_to_parquet_single_file

# Convert each raw CSV extract under ./01-data into a Parquet file with the
# same stem. Paths are resolved against the directory the notebook runs from.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, '01-data')

csv1_file_path = os.path.join(data_directory, 'FD_02_apl_test.csv')
csv2_file_path = os.path.join(data_directory, 'FD_02_apl_train.csv')
csv3_file_path = os.path.join(data_directory, 'FD_02_previos_appl.csv')

output_file_path1 = os.path.join(data_directory, 'FD_02_apl_test.parquet')
output_file_path2 = os.path.join(data_directory, 'FD_02_apl_train.parquet')
output_file_path3 = os.path.join(data_directory, 'FD_02_previos_appl.parquet')

# (source CSV, destination Parquet) pairs, processed in this order.
conversion_jobs = (
    (csv1_file_path, output_file_path1),
    (csv2_file_path, output_file_path2),
    (csv3_file_path, output_file_path3),
)

for source_csv, target_parquet in conversion_jobs:
    # Same settings for every file: no row sampling and no dropped columns.
    # chunksize is presumably the number of CSV rows handled per batch --
    # TODO confirm against scripts.csvtopaquet.
    csv_to_parquet_single_file(
        csv_file_path=source_csv,
        output_file_path=target_parquet,
        chunksize=100000,
        sample_rows=None,
        drop_columns=None,
    )


# Import Parquet Files and assign DFs

In [None]:
import pandas as pd
import numpy as np
import sys,os

def _load_parquet_frame(parquet_path):
    """Read one Parquet file, zero-fill missing values and sanitise column names.

    Every character in a column name that is not a letter, a digit or an
    underscore is replaced by an underscore.

    Args:
        parquet_path: Path of the Parquet file to read.

    Returns:
        The loaded DataFrame with NaN replaced by 0 and cleaned column names.
    """
    frame = pd.read_parquet(parquet_path)
    frame = frame.fillna(0)
    # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would leave every name
    # untouched. The character class already covers '/', so no second pass is
    # needed.
    frame.columns = frame.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)
    return frame


current_directory = os.getcwd()
parquetFile1 = os.path.join(current_directory, '01-data', 'FD_02_apl_test.parquet')
parquetFile2 = os.path.join(current_directory, '01-data', 'FD_02_apl_train.parquet')
parquetFile3 = os.path.join(current_directory, '01-data', 'FD_02_previos_appl.parquet')

# Test application file
df1 = _load_parquet_frame(parquetFile1)
print('test Data:')
print(df1.dtypes)
print('')

# Train application file
df2 = _load_parquet_frame(parquetFile2)
print('train Data:')
print(df2.dtypes)
print('')

# Previous applications file
df3 = _load_parquet_frame(parquetFile3)
print(df3.dtypes)
print('')

# Load the datasets
test = df1
train = df2

# Only the train frame goes into current_data; concat returns a new frame
# object rather than an alias of ``train``.
current_data = pd.concat([train], axis=0)

print('Prior Data:')
print(df3.dtypes)
previous_data = df3
print('')

# Target Distribution

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# One wide canvas holding two panels: bar chart on the left, pie on the right.
plt.figure(figsize=(30, 12))

# Left panel: horizontal bars, one per TARGET value.
plt.subplot(1, 2, 1)
target_counts = current_data['TARGET'].value_counts()
bars = target_counts.plot(kind='barh', color=['#9e7edf', '#FFD700'])

# Keyword arguments shared by the shadow label and the real label.
label_style = dict(va='center', ha='right', fontsize=12, fontweight='bold')
for row, count in enumerate(target_counts):
    caption = str(count)
    # Translucent gray copy drawn first to act as a shadow ...
    plt.text(count, row, caption, color='gray', alpha=0.8, **label_style)
    # ... then the actual label in black, shifted 1000 data units to the left.
    plt.text(count - 1000, row, caption, color='black', **label_style)

# Right panel: activate it and define the wedge colours for the pie chart.
plt.subplot(1, 2, 2)
colors = ['#9e7edf', '#FFD700']
def autopct_format(values):
    """Build a formatter that labels a pie wedge with its percentage and count.

    Args:
        values: Iterable of the counts the pie chart is drawn from.

    Returns:
        A one-argument callable taking a wedge percentage and returning
        ``'<pct with 2 decimals>%'`` followed, on a new line, by the matching
        absolute count in parentheses.
    """
    def render_label(pct):
        # Recover the absolute count from the percentage and the grand total.
        grand_total = sum(values)
        count = int(round(pct * grand_total / 100.0))
        return '{p:.2f}%\n({v:d})'.format(p=pct, v=count)

    return render_label

# Pie chart of the TARGET counts; each wedge is labelled with percentage and
# absolute count by the formatter built from the same counts.
wedge_label = autopct_format(target_counts)
wedge_text_style = {'fontsize': 12, 'fontweight': 'bold'}
target_counts.plot(
    kind='pie',
    colors=colors,
    autopct=wedge_label,
    startangle=140,
    shadow=True,
    textprops=wedge_text_style,
)

# Blank y-label hides the 'TARGET' label next to the pie.
plt.ylabel('')
plt.title('TARGET Variable Distribution')

# Keep the subplots from overlapping, then render the figure.
plt.tight_layout()
plt.show()


# Filter by Column Types

In [None]:
import pandas as pd


# Build the modelling table: numeric columns plus indicator columns derived
# from the object (string) columns.
dataset = current_data
print('dataset:', dataset['TARGET'].dtypes)

# Partition the columns by dtype; the 'number' selector covers int, float, etc.
current_data_object = dataset.select_dtypes(include=['object'])
Current_data_Numericonly = dataset.select_dtypes(include=['number'])

# Plain Python list holding every column name of the dataset.
columns_data = dataset.columns.values.tolist()

# Integer (0/1) indicator columns for the object columns.
dummy_columns = pd.get_dummies(current_data_object, dtype=int)

# Place numeric and indicator columns side by side, then drop SK_ID_CURR.
merged_data = pd.concat(
    [Current_data_Numericonly, dummy_columns], axis=1
).drop(columns=['SK_ID_CURR'])
def clean_column_names(df):
    """Strip the characters ``{ } [ ] " : ,`` from every column name, in place.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with its ``columns`` replaced by the
        cleaned names.
    """
    unwanted = ('{', '}', '[', ']', '"', ':', ',')
    cleaned = []
    for name in df.columns:
        for symbol in unwanted:
            name = name.replace(symbol, '')
        cleaned.append(name)
    df.columns = cleaned
    return df

# Strip the special characters from the merged column names.
merged_data = clean_column_names(merged_data)

print(merged_data.dtypes)

# First row only: printed transposed (one line per column) and saved to disk
# for manual inspection.
sample = merged_data.head(1)
print(sample.T)

sample.to_csv('testmerge.csv')

print(Current_data_Numericonly.dtypes)

# Number of columns in each dtype group.
print("Object Columns:", len(current_data_object.columns))
print("Numeric Columns:", len(Current_data_Numericonly.columns))

# NOTE: columns_data is a plain list of column names, so comparing it with a
# string and indexing with the result only ever yields its first element;
# no such lookup is performed here.

# Top Factors

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Zero-fill any value that is still missing before modelling.
merged_data = merged_data.fillna(0)

# TARGET is the label; every remaining column is a feature.
y = merged_data['TARGET']
X = merged_data.drop('TARGET', axis=1)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap both partitions as LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Binary objective, evaluated with log-loss.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

# Early stopping with a patience of 10 rounds on the validation set.
early_stopping_callback = lgb.early_stopping(stopping_rounds=10)

# At most 100 boosting rounds.
# NOTE(review): the hold-out set used for early stopping is the same one the
# accuracy below is computed on.
num_round = 100
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[early_stopping_callback],
)

# Predict with the best iteration found, then threshold at 0.5 into 0/1 labels.
y_pred_proba = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_binary = [int(score > 0.5) for score in y_pred_proba]

# Share of hold-out rows whose thresholded prediction matches the label.
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy}")


# Shap Values— Top Features

In [None]:
import shap

import matplotlib.pyplot as plt

# Compute SHAP values for the hold-out features with the trained LightGBM
# booster.
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_test)

# Some shap releases return one array per class (a list) for a binary
# LightGBM model, and summary_plot only draws a bar chart for list input.
# Keep the positive-class array so the dot plot works in both cases.
if isinstance(shap_values, list):
    shap_values = shap_values[1]

# show=False defers rendering so the title and size can be set first.
shap.summary_plot(shap_values, X_test, plot_type="dot", show=False)

plt.title("Impact of Features on Model Predictions with SHAP Values")
plt.gcf().set_size_inches(10, 8)
plt.show()


## SHAP Summary Plot Analysis for Fraud Detection Model

# SHAP Value Chart Summary

## Overview

The SHAP summary plot illustrates the impact of various features on a predictive model's output, indicating how each feature contributes to the model's predictions.

## Key Features by Impact

- **EXT_SOURCE_3**: Shows the most considerable impact on model predictions, generally contributing to higher prediction values when the feature value is high.
- **EXT_SOURCE_2 and EXT_SOURCE_1**: Also significant, these features exhibit a mix of high and low SHAP values.

## Observations on Feature Impact

- Features like `AMT_GOODS_PRICE`, `DAYS_EMPLOYED`, and `DAYS_BIRTH` demonstrate varied impacts on model output, suggesting complex interactions.
- The color of each point (pink/red for higher values, blue for lower values) reflects the feature's value, while its horizontal position shows how strongly that value influences the prediction.

## Summary of Insights

External source features notably influence prediction outcomes, with the model heavily weighing these attributes. The variability in SHAP values for features like `AMT_GOODS_PRICE`, `DAYS_EMPLOYED`, and `DAYS_BIRTH` suggests that their impact on the model's predictions changes across different observations.

## Conclusion

The SHAP summary plot highlights external source features as key drivers in the model's predictive capabilities, with personal client attributes also providing substantial influence. These insights are valuable for understanding the model's behavior, guiding feature engineering efforts, and potentially enhancing model performance.


### Plot Interpretation
- The color of the dots (from blue to red) corresponds to the feature value from low to high.
- The positioning of the dots on the x-axis indicates the SHAP value's impact on the model's output.
- Red dots positioned to the right suggest that higher feature values contribute to an increased likelihood of fraud.
- Blue dots to the left suggest that lower feature values contribute to a decreased likelihood of fraud.
- The center area, where blue and red dots overlap, shows where feature values do not clearly distinguish between fraud and non-fraud cases.


# Misc. Column charting

## Object Column Distribution

In [None]:

# Frame used for the per-TARGET breakdown in the distribution helpers.
file = current_data
#file = file[file["TARGET"]==1.0]

def distributionGraphs(x):
    """Plot and print the distribution of column ``x`` by TARGET and overall.

    Draws a bar chart of the percentage of each value of ``x`` within every
    TARGET group (taken from the module-level ``file``) next to a pie chart
    of the overall value shares (taken from ``current_data``), then prints
    the grouped percentage table.

    Args:
        x: Name of the column to analyse.
    """
    # ANSI escape codes render the heading in bold italics.
    print("\033[1m\033[1;3mDistribution Based on "+str(x)+"\033[0m")
    # No description line is printed: columns_data is a plain list of column
    # names, so a description lookup on it only ever returned its first
    # element.

    # Percentage of each value within its TARGET group, rounded.
    target_group = round(file.groupby('TARGET')[x].value_counts(normalize=True,sort=False)*100)

    # Half the number of (TARGET, value) rows -- presumably one colour per
    # distinct value when there are two TARGET classes.
    cnt = int((target_group.count()/2))

    all_colr = ['#F38181','#FCE38A','#EAFFD0','#95E1D3','#EEEEEE','#00ADB5']
    # Always keep at least one colour so the palette handed to the plots is
    # never empty.
    colors = all_colr[:max(cnt, 1)]
    plt.figure(figsize=(30,6))

    plt.subplot(121)
    plt.title(str(x)+' Distribution grouped by Target')
    ax = target_group.plot(kind='bar',color=colors)

    # Write each bar's height at the middle of the bar.
    for bar in ax.patches:
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_y() + bar.get_height()/2, str(bar.get_height()))

    plt.subplot(122)
    plt.title(str(x)+' distribution in Overall Records')
    (current_data[x].value_counts(normalize=True)*100).plot(kind='pie', autopct = "%1.0f%%", colors=colors)
    plt.ylabel(' ')

    plt.show()

    print("\033[1m\033[1;3mTable View\033[0m")
    print(target_group)
    print("\n\n")

# Slice the column index, not the frame: ``current_data_object[1:5]`` selects
# rows 1-4, and iterating that result still walks over every column name.
# Adjust the slice (e.g. [1:2]) to control which object columns are charted.
for feature in current_data_object.columns[1:5]:
    distributionGraphs(feature)


## Numeric Column Distribution

In [None]:

# =============================================================================
# NUMERICAL ANALYSIS
# =============================================================================
def _split_columns(names, *markers):
    """Partition ``names`` by whether a name contains any of ``markers``.

    Args:
        names: Iterable of column names.
        *markers: Substrings to look for.

    Returns:
        ``(matching, remaining)``: two lists, each in the original order.
    """
    matching, remaining = [], []
    for name in names:
        bucket = matching if any(marker in name for marker in markers) else remaining
        bucket.append(name)
    return matching, remaining


# Start from every numeric column and peel off one thematic group at a time;
# each group is removed from data_numeric_cols before the next one is built.
data_numeric_cols = list(Current_data_Numericonly)

# Columns whose name contains "FLAG_"
data_flag_cols, data_numeric_cols = _split_columns(data_numeric_cols, 'FLAG_')

# Columns whose name contains "REGION_" or "CITY_"
data_area_cols, data_numeric_cols = _split_columns(data_numeric_cols, 'REGION_', 'CITY_')

# Columns whose name contains "DAYS_" or "HOUR_"
data_days_cols, data_numeric_cols = _split_columns(data_numeric_cols, 'DAYS_', 'HOUR_')

# Columns whose name contains "AMT_"
data_amt_cols, data_numeric_cols = _split_columns(data_numeric_cols, 'AMT_')

# Size of each group; at this point the numeric count still includes TARGET.
print('Numeric cols',len(data_numeric_cols))
print('Area cols',len(data_area_cols))
print('Flag cols',len(data_flag_cols))
print('Days cols',len(data_days_cols))
print('Amount cols',len(data_amt_cols))

# Remove the label column; SK_ID_CURR is left in the list.
data_numeric_cols.remove('TARGET')

# Summary statistics (rounded to 2 decimals) for the remaining numeric, amount
# and days columns, one row per column.
# No 'description' column is attached: columns_data is a plain list of column
# names, so a description lookup on it would return the same element for
# every row.
current_data_info = round(current_data[data_numeric_cols+data_amt_cols+data_days_cols].describe().T,2)
current_data_info

# Helper that visualises the distribution of one numeric column.

def numericDistributionGraph(col):
    """Draw three side-by-side views of the numeric column ``col``.

    Panels, left to right: a 10-bin histogram of ``col``, a box plot of
    ``col`` per TARGET value, and a box plot of ``col`` on its own. All data
    comes from the module-level ``current_data``.

    Args:
        col: Name of the numeric column to plot.
    """
    # ANSI escape codes render the heading in bold italics.
    print("\033[1m\033[1;3mDistribution Based on "+str(col)+"\033[0m")
    # No description line is printed: columns_data is a plain list of column
    # names, so a description lookup on it only ever returned its first
    # element.

    plt.figure(figsize=(18,10))

    plt.subplot(1,3,1)
    sns.histplot(current_data[col], bins=10, color='#222831')
    plt.title('Distribution of '+col+' Across current application', fontsize=8)

    plt.subplot(1,3,2)
    sns.boxplot(x='TARGET', y=col, data=current_data)
    plt.title('Distribution of '+col+' with respect to TARGET feature', fontsize=8)

    plt.subplot(1,3,3)
    sns.boxplot(x=current_data[col],color='#F38181')
    plt.title('Distribution of '+col+' quantile Across current application', fontsize=8)

    # Render right away, as the other plotting helpers in this notebook do,
    # so the figure appears directly under its heading.
    plt.show()


# Chart only the third remaining numeric column; widen the slice to plot more.
for col in data_numeric_cols[2:3]:
    numericDistributionGraph(col)


# =============================================================================
#EXERCISE:
#    Use new features and make your analysis. 
# 
# =============================================================================

## Distribution helper using matplotlib's default colours
def distGraphs(x):
    """Plot and print the distribution of column ``x`` by TARGET and overall.

    Draws a bar chart of the percentage of each value of ``x`` within every
    TARGET group (taken from the module-level ``file``) next to a pie chart
    of the overall value shares (taken from ``current_data``), then prints
    the grouped percentage table. No custom colour palette is applied.

    Args:
        x: Name of the column to analyse.
    """
    # ANSI escape codes render the heading in bold italics.
    print("\033[1m\033[1;3mDistribution Based on "+str(x)+"\033[0m")
    # No description line is printed: columns_data is a plain list of column
    # names, so a description lookup on it only ever returned its first
    # element.

    # Percentage of each value within its TARGET group, rounded.
    target_group = round(file.groupby('TARGET')[x].value_counts(normalize=True,sort=False)*100)

    plt.figure(figsize=(30,6))

    plt.subplot(121)
    plt.title(str(x)+' Distribution grouped by Target')
    ax = target_group.plot(kind='bar')

    # Write each bar's height at the middle of the bar.
    for bar in ax.patches:
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_y() + bar.get_height()/2, str(bar.get_height()))

    plt.subplot(122)
    plt.title(str(x)+' distribution in Overall Records')
    (current_data[x].value_counts(normalize=True)*100).plot(kind='pie', autopct = "%1.0f%%")
    plt.ylabel(' ')

    plt.show()

    print("\033[1m\033[1;3mTable View\033[0m")
    print(target_group)
    print("\n\n")

# One figure with three panels: a box plot per TARGET value for each of the
# three EXT_SOURCE columns.
plt.figure(figsize=(18,10))

ext_source_cols = ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')
for panel, source_col in enumerate(ext_source_cols, start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='TARGET', y=source_col, data=current_data)
    plt.title(f'Distribution of {source_col} with respect to TARGET feature')

plt.show()

# Distribution charts and tables for every FLAG_ column.
for flag_col in data_flag_cols:
    distGraphs(flag_col)

