# Pre-Processing Data for ML Training

## Introduction

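This notebook depends on `scripts/02 - exploratory_data_analysis.py` having been executed first. It loads the loan data from the SQLite database named in `config/config.yml`, keeps only `Charged Off` and `Fully Paid` loans, one-hot encodes the categorical columns, and saves the result to a new `loans_data_ML` table.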

## Imports and Definitions

In [18]:
import pandas as pd
import sqlite3
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib
import yaml
import os

print("Defining Classes")

class Logger:
    """Timestamped logger that appends messages to a per-run text file.

    The log file is ``outputs/<tag>/<out-dir>/log.txt`` relative to the current
    working directory, where ``tag`` is ``config['base']['tag']`` and
    ``out-dir`` is ``config['logging']['out-dir']``.
    """

    def __init__(self, config):
        """Store the configuration and derive the log file location.

        Args:
            config: Nested mapping providing ``config['logging']['out-dir']``,
                ``config['logging']['verbose']`` and ``config['base']['tag']``.

        Raises:
            KeyError: If ``config`` is a dict and one of those keys is missing.
        """
        self.config = config
        self.log_dir = config['logging']['out-dir']
        self.tag = config['base']['tag']
        self.file_path = os.path.join('outputs', self.tag, self.log_dir, 'log.txt')
        self.verbose = config['logging']['verbose']

    def log(self, message):
        """Append ``message`` to the log file, prefixed with the current time.

        The line is also printed when the logger was configured as verbose.

        Args:
            message: Text to record; it is interpolated into an f-string.
        """
        current_datetime = datetime.datetime.now()
        datetime_string = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
        log_message = f"{datetime_string}: {message}"
        if self.verbose:
            print(log_message)
        # Append mode creates a missing file but not missing parent
        # directories, so make sure the log directory exists first.
        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)
        with open(self.file_path, "a", encoding="utf-8") as f:
            f.write(f'{log_message}\n')

print("Defining Functions")

def loan_status_to_int(status):
    """Encode a loan status label as an integer code.

    Returns:
        0 for 'Charged Off', 1 for 'Fully Paid', and -1 for any other value.
    """
    return 0 if status == 'Charged Off' else (1 if status == 'Fully Paid' else -1)

def bestbandwidth(data):
    """Return ``1.06 * std(data) * n ** (-1/5)`` for a sample of ``n`` values.

    The standard deviation comes from ``np.std`` with its default arguments.
    The expression has the form of the normal-reference rule of thumb for a
    kernel-density bandwidth.

    Args:
        data: Sized collection of numbers accepted by ``np.std``.
    """
    spread = np.std(data)
    sample_size = len(data)
    return 1.06 * spread * sample_size ** (-1 / 5)

def create_directory_if_not_exists(directory):
    """Create ``directory`` (including parents) unless the path already exists.

    Args:
        directory: Path to create.

    Returns:
        A status message saying whether the directory was created or the path
        was already present.
    """
    # Guard clause: nothing to do when the path is already there.
    if os.path.exists(directory):
        return f"Directory already exists: {directory}"

    os.makedirs(directory)
    return f"Created directory: {directory}"

def transform_type(sqlite_type):
    """Translate a SQLite column type name into a pandas dtype string.

    Args:
        sqlite_type: Type name; 'INTEGER', 'REAL' and 'TEXT' are recognised.

    Returns:
        'int', 'float' or 'object' respectively, or None for any other value.
    """
    known_types = (
        ('INTEGER', 'int'),
        ('REAL', 'float'),
        ('TEXT', 'object'),
    )
    for sqlite_name, pandas_dtype in known_types:
        if sqlite_type == sqlite_name:
            return pandas_dtype
    # Unrecognised type names yield None.
    return None

def map_dtype_to_sqlite(col_type):
    """Map a pandas/NumPy dtype name to a SQLite column type.

    Matching is case-insensitive so that pandas nullable extension dtypes
    ('Int64', 'boolean', 'Float64') are handled like their NumPy counterparts.
    Unsigned integers ('uint8', ...) are treated as integers.

    Args:
        col_type: dtype name as a string, e.g. ``str(series.dtype)``.

    Returns:
        'INTEGER' for integer and boolean dtypes, 'REAL' for floating-point
        dtypes, and 'TEXT' for everything else (including 'object').
    """
    kind = col_type.lower()
    # 'interval[...]' also starts with 'int' but does not hold plain integers.
    is_integer = kind.startswith(('int', 'uint')) and not kind.startswith('interval')
    if is_integer or kind in ('bool', 'boolean'):
        return 'INTEGER'
    if kind.startswith('float'):
        return 'REAL'
    # Default case, particularly for 'object' and other unhandled types
    return 'TEXT'

print("Reading Config File")

# Both paths are relative to the directory the notebook is executed from.
config_file_path = '../config/config.yml'

root_path = '..'

print(f"Reading Config File {config_file_path}")
with open(config_file_path, 'r') as f:
    config = yaml.safe_load(f)

print("Defining Variables and Creating Directories")
# NOTE(review): despite the message above, this cell only defines path
# variables; create_directory_if_not_exists is never called here. Confirm the
# directories are expected to exist already.

tag = config['base']['tag']

git_repo = config['base']['git_repo']

fontsize = config['plotting']['fontsize']
figsize_x = config['plotting']['figure_xsize']
figsize_y = config['plotting']['figure_ysize']

# Build every location once, from its components, under <root>/outputs/<tag>.
# (The original called os.path.join on a single pre-formatted string, which is
# a no-op, and assigned each variable twice.)
sqlite_file = os.path.join(root_path, 'outputs', tag, 'data', config['data']['output_sqlite'])
out_dir_figures = os.path.join(root_path, 'outputs', tag, 'figures')
out_dir_stats = os.path.join(root_path, 'outputs', tag, 'stats')
out_dir_log = os.path.join(root_path, 'outputs', tag, 'log')

columns_of_interest = config['base']['columns_of_interest']

print("Done with initial setup")

Defining Classes
Defining Functions
Reading Config File
Reading Config File ../config/config.yml
Defining Variables and Creating Directories
Done with initial setup


## Loading and Cleaning Data and Defining Columns

In [2]:
print("Loading Data")
# Open the database; the try/finally guarantees the connection is released
# even when one of the queries below raises.
conn = sqlite3.connect(sqlite_file)
try:
    # Column metadata, indexed by column name.
    description_fetch_query = """SELECT *
                    FROM descriptions
                    """
    descriptions = pd.read_sql_query(description_fetch_query, conn, index_col='name')

    # pandas dtype for every requested loans_data column, derived from the
    # data_type recorded in the descriptions table.
    column_types = {
        idx: transform_type(row['data_type'])
        for idx, row in descriptions.iterrows()
        if row['location'] == 'loans_data' and idx in columns_of_interest
    }

    # NOTE(review): ORDER BY RANDOM() shuffles the rows differently on every
    # run, so the row order is not reproducible.
    data_fetch_query = f"""SELECT {', '.join(columns_of_interest)} 
                       FROM loans_data
                       ORDER BY RANDOM()"""

    loans_data = pd.read_sql_query(data_fetch_query, conn, index_col='id', dtype=column_types)
finally:
    conn.close()

print("Filtering known bad columns")

# Despite the message, this removes rows: those whose issue_d_unix is 0.
loans_data = loans_data[loans_data['issue_d_unix'] != 0]

print("Creating columns")

# issue_d_unix is interpreted as seconds since the Unix epoch.
loans_data['issue_d'] = pd.to_datetime(loans_data['issue_d_unix'], unit='s')
loans_data['issue_month'] = loans_data['issue_d'].dt.month

# Register both derived columns in the metadata with a single concat.
# data_type uses the names transform_type understands, so issue_month is
# 'INTEGER' rather than the unrecognised 'INT'.
derived_descriptions = pd.DataFrame({
    'name': ['issue_d', 'issue_month'],
    'full_name': ['Issue Date', 'Issue Month'],
    'type': ['Column', 'Column'],
    'location': ['loans_data', 'loans_data'],
    'description': ['Date the loan was issued', 'Month of the year the loan was issued'],
    'data_type': ['TEXT', 'INTEGER']
}).set_index('name')
descriptions = pd.concat([descriptions, derived_descriptions])

print("Limiting dataset to two types of Loan Status only")

loans_data = loans_data[loans_data['loan_status'].isin(['Charged Off', 'Fully Paid'])]

print("Done with Loading and Cleaning Data and Defining Columns")

Loading Data
Filtering known bad columns
Creating columns
Limiting dataset to two types of Loan Status only
Done with Loading and Cleaning Data and Defining Columns


## Pre-Processing Data

### Preparing Numerical and Dummy Columns, and Dropping Unnecessary Ones

In [15]:
print("Declaring Numerical Columns")
# Continuous / count features that are kept as-is.
numerical_columns = [
    'loan_amnt',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'mort_acc',
    'pub_rec_bankruptcies',
]

print("Declaring Dummy Columns")
# Categorical features that get one-hot encoded below.
dummy_columns = [
    'term_months',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'initial_list_status',
    'application_type',
]

print("Declaring Columns to Drop")
drop_columns = ['grade', 'issue_d', 'issue_d_unix']

print("Dropping Non-Interesting Columns")
loans_data_ML = loans_data.drop(columns=drop_columns)
print("Creating Dummy Columns")
# drop_first=True omits the first level of each categorical column.
loans_data_ML = pd.get_dummies(
    loans_data_ML,
    columns=dummy_columns,
    drop_first=True,
)

print("Done Preparing Numerical and Dummy Columns, and Dropping Unecessary Ones")
loans_data_ML

Declaring Numerical Columns
Declaring Dummy Columns
Declaring Columns to Drop
Dropping Non-Interesting Columns
Creating Dummy Columns
Done Preparing Numerical and Dummy Columns, and Dropping Unecessary Ones


Unnamed: 0_level_0,loan_amnt,int_rate,installment,annual_inc,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w,application_type_Joint App
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1862525,18000,20.49,481.820007,41589.0,31.360001,16,0,15350,67.000000,46,...,False,False,False,False,False,False,False,False,False,False
188973,18000,12.29,600.359985,45000.0,28.959999,16,0,33751,67.000000,26,...,False,False,False,False,False,False,False,False,True,False
326533,2000,13.99,68.349998,29000.0,26.700001,10,0,1358,18.600000,29,...,False,False,False,False,False,False,False,False,True,False
262582,8800,7.89,275.320007,50000.0,13.370000,4,0,7692,52.700001,13,...,False,False,False,False,False,False,False,False,True,False
120364,12725,13.33,291.690002,55000.0,23.280001,13,1,9745,54.700001,27,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88166,10000,6.24,305.309998,65000.0,8.220000,20,0,9078,15.400000,44,...,False,False,False,False,False,False,False,False,True,False
1912890,7200,12.12,239.559998,52000.0,17.910000,13,0,8301,75.300003,18,...,False,False,False,False,False,False,False,False,False,False
1886463,6000,8.90,190.520004,145000.0,13.270000,14,0,42656,84.599998,30,...,False,False,False,False,False,False,False,False,False,False
1938390,14000,15.27,487.170013,53700.0,19.280001,4,0,9707,97.099998,24,...,False,False,False,False,False,False,False,False,False,False


### Saving ML-Prepared Data to a New Table

In [19]:
print(f"Connecting to Database at {sqlite_file}")
conn = sqlite3.connect(sqlite_file)
# try/finally releases the connection even if a statement below fails.
try:
    print("Creating Queries")
    drop_loans_data_ML_query = 'DROP TABLE IF EXISTS loans_data_ML'

    # One '"<column>" <SQLITE TYPE>' clause per DataFrame column. Embedded
    # double quotes are doubled so any column name is a valid identifier.
    column_definitions = [
        '"{}" {}'.format(str(col).replace('"', '""'), map_dtype_to_sqlite(str(dtype)))
        for col, dtype in loans_data_ML.dtypes.items()
    ]
    create_loans_data_ML_table_query = (
        'CREATE TABLE loans_data_ML (' + ', '.join(column_definitions) + ')'
    )

    print("Dropping old tables and creating new ones")
    # Drops and creates the tables
    conn.execute(drop_loans_data_ML_query)
    conn.execute(create_loans_data_ML_table_query)

    print("Loading data into tables")
    # 'append' inserts into the table created just above. The previous
    # 'replace' dropped that table again, so its schema was never used.
    # NOTE(review): index=False means the DataFrame index is not written to
    # the table - confirm that dropping it is intended.
    loans_data_ML.to_sql('loans_data_ML', conn, if_exists='append', index=False)
finally:
    conn.close()
print("Done With Constructing the Database File")

Connecting to Database at ../outputs/prototype/data/All_Lending_Club_Loan_2007_2018.sqlite
Creating Queries
Dropping old tables and creating new ones
Loading data into tables
Done With Constructing the Database File
