<a name='1'></a>

## 1 - Import necessary libraries and modules

In [28]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tempfile, urllib, zipfile
import tensorflow_data_validation as tfdv
import tensorflow_decision_forests as tfdf

from sklearn.model_selection import train_test_split
from tensorflow.python.lib.io import file_io
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList, DatasetFeatureStatistics
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Set TF's logger to only display errors to avoid internal warnings being shown
tf.get_logger().setLevel('ERROR')
print("succesfully handled imports")
pd.show_versions()


succesfully handled imports

INSTALLED VERSIONS
------------------
commit           : 2e218d10984e9919f0296931d92ea851c6a6faf5
python           : 3.10.14.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 23.1.0
Version          : Darwin Kernel Version 23.1.0: Mon Oct  9 21:27:27 PDT 2023; root:xnu-10002.41.9~6/RELEASE_X86_64
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.5.3
numpy            : 1.26.4
pytz             : 2024.1
dateutil         : 2.9.0.post0
setuptools       : 65.5.0
pip              : 24.1.1
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 5.2.2
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.1.2
IPython          : 7.34.0
pandas_da

<a name='2'></a>

## 2 - Load the credit worthiness dataset

In [29]:
# Read CSV data into a dataframe and recognize the missing data that is encoded with '?' string as NaN
df = pd.read_csv('dataset/Loan_default.csv', header=0, na_values = '?')
# Preview the dataset
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


<a name='1'></a>

## 2 - Split the dataset for the training into PRODUCTION, EVALUATION, SERVING sets.
The split ratio will be as follows:\
PRODUCTION: 70%\
SERVING: 15%\
EVALUATION: 15%\
For the serving dataset we will drop the Default column as it is the label for which we want to predict the value\

In [30]:
def prepare_data_splits_from_dataframe(df):
    '''
    Splits a Pandas Dataframe into training, evaluation and serving sets.

    Parameters:
            df : pandas dataframe to split

    Returns:
            train_df: Training dataframe(70% of the entire dataset)
            eval_df: Evaluation dataframe (15% of the entire dataset) 
            serving_df: Serving dataframe (15% of the entire dataset, label column dropped)
    '''
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['Default'])
    # Then, split temp into evaluation and serving
    # eval_df=temp_df; serving_df = temp_df
    eval_df, serving_df = train_test_split(temp_df, test_size=0.5, random_state=48, stratify=temp_df['Default'])
 
    # Serving data emulates the data that would be submitted for predictions, so it should not have the label column.
    # serving_df = serving_df.drop(['Default'], axis=1)

    return train_df, eval_df, serving_df

Running the split function to seperate into the different datasets


In [31]:
train_df, eval_df, serving_df = prepare_data_splits_from_dataframe(df)
print('Training dataset has {} records\nValidation dataset has {} records\nServing dataset has {} records'.format(len(train_df),len(eval_df),len(serving_df)))

Training dataset has 178742 records
Validation dataset has 38302 records
Serving dataset has 38303 records


<a name='3'></a>
## 3 - Removing unneeded features and generating statistics options
These features we expect to not impact the final outcome of the training hence their removal

In [32]:
# Define features to remove in dataframe
features_to_remove = {'LoanID'}
approved_cols = [col for col in df.columns if (col not in features_to_remove)]
stats_options = tfdv.StatsOptions(feature_allowlist=approved_cols)
print(stats_options.feature_allowlist)

['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'Default']


<a name='4'></a>

## 4 - Generate the training statistics

In [33]:
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train_df, stats_options=stats_options)

# validate section

# get the number of features used to compute statistics
print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples used: {train_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")

Number of features used: 18
Number of examples used: 178742
First feature: Age
Last feature: __index_level_0__


<a name='6'></a>
## 6 - Visualize the training statistics

In [34]:
tfdv.visualize_statistics(lhs_statistics=train_stats,lhs_name='statistics for train-stats')

<a name='7'></a>
## 7 - Infer a Data schema

In [35]:
schema = tfdv.infer_schema(statistics=train_stats)

# Display the data schema
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Age',INT,required,,-
'Income',INT,required,,-
'LoanAmount',INT,required,,-
'CreditScore',INT,required,,-
'MonthsEmployed',INT,required,,-
'NumCreditLines',INT,required,,-
'InterestRate',FLOAT,required,,-
'LoanTerm',INT,required,,-
'DTIRatio',FLOAT,required,,-
'Education',STRING,required,,'Education'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Education',"'Bachelor\'s', 'High School', 'Master\'s', 'PhD'"
'EmploymentType',"'Full-time', 'Part-time', 'Self-employed', 'Unemployed'"
'MaritalStatus',"'Divorced', 'Married', 'Single'"
'HasMortgage',"'No', 'Yes'"
'HasDependents',"'No', 'Yes'"
'LoanPurpose',"'Auto', 'Business', 'Education', 'Home', 'Other'"
'HasCoSigner',"'No', 'Yes'"


<a name='8'></a>
## 8 - Calculate, Visualize and Fix Training and Evaluation Anomalies


In [36]:
## getting the stats for our evaluation dataframe
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df, stats_options=stats_options)
tfdv.visualize_statistics(lhs_statistics=eval_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

# TEST CODE

# get the number of features used to compute statistics
print(f"Number of features: {len(eval_stats.datasets[0].features)}")

# check the number of examples used
print(f"Number of examples: {eval_stats.datasets[0].num_examples}")

# check the column names of the first and last feature
print(f"First feature: {eval_stats.datasets[0].features[0].path.step[0]}")
print(f"Last feature: {eval_stats.datasets[0].features[-1].path.step[0]}")

Number of features: 18
Number of examples: 38302
First feature: Age
Last feature: __index_level_0__


<a name='8'></a>

## 8 - Fix anomalies in training and evaluation frames if there are any


In [37]:
def calculate_and_display_anomalies(type, statistics, schema):
    '''
    Calculate and display anomalies.

            Parameters:
                    statistics : Data statistics in statistics_pb2.DatasetFeatureStatisticsList format
                    schema : Data schema in schema_pb2.Schema format

            Returns:
                    display of calculated anomalies
    '''
    tfdv.visualize_statistics(lhs_statistics=statistics, rhs_statistics=train_stats,
                          lhs_name='SERVE_DATASET', rhs_name='TRAIN_DATASET')
    # HINTS: Pass the statistics and schema parameters into the validation function 
    anomalies = tfdv.validate_statistics(statistics=statistics, schema=schema)
    
    # HINTS: Display input anomalies by using the calculated anomalies
    print('The Dataframe is for: ' + type)
    tfdv.display_anomalies(anomalies=anomalies)
    ### END CODE HERE

# Check evaluation data for errors by validating the evaluation data staticss using the previously inferred schema
calculate_and_display_anomalies('evaluation_anomalies', eval_stats, schema=schema)
calculate_and_display_anomalies('training_anomalies', train_stats, schema=schema)

The Dataframe is for: evaluation_anomalies


The Dataframe is for: training_anomalies


<a name='9'></a>

## 8 - Fix anomalies in serving data frames if there are any


In [38]:
serving_options = tfdv.StatsOptions(schema=schema, 
                            infer_type_from_schema=True, 
                            feature_allowlist=approved_cols)
serving_stats = tfdv.generate_statistics_from_dataframe(serving_df, stats_options=serving_options)
calculate_and_display_anomalies('serving_anomalies', serving_stats, schema=schema)

schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

tfdv.get_feature(schema, 'Default').not_in_environment.append('SERVING')

serving_anomalies_with_env = tfdv.validate_statistics(statistics=serving_stats, schema=schema, environment='SERVING')
print('Detect any missing column anomalies after excluding from environment')
tfdv.display_anomalies(serving_anomalies_with_env)

The Dataframe is for: serving_anomalies


Detect any missing column anomalies after excluding from environment


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Default',Column missing in environment,New column Default found in data but not in the environment SERVING in the schema.


<a name='10'></a>
## 10 - Freeze the schema

In [None]:
OUTPUT_DIR = "output"
file_io.recursive_create_dir(OUTPUT_DIR)

# Use TensorFlow text output format pbtxt to store the schema
schema_file = os.path.join(OUTPUT_DIR, 'credit_worthiness_schema.pbtxt')

# write_schema_text function expect the defined schema and output path as parameters
tfdv.write_schema_text(schema, schema_file)  