## Importing the Packages

In [7]:
import numpy as np

In [8]:
np.set_printoptions(suppress = True, linewidth= 100, precision = 2)

## Importing the Dataset

In [9]:
# Read the whole CSV with genfromtxt's default numeric parsing.
#   delimiter=';'   -> fields are separated by semicolons
#   skip_header=1   -> the first row of the file is skipped
#   autostrip=True  -> leading/trailing whitespace is trimmed from every field
raw_data = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
)
raw_data

array([[48010226.  ,         nan,    35000.  , ...,         nan,         nan,     9452.96],
       [57693261.  ,         nan,    30000.  , ...,         nan,         nan,     4679.7 ],
       [59432726.  ,         nan,    15000.  , ...,         nan,         nan,     1969.83],
       ...,
       [50415990.  ,         nan,    10000.  , ...,         nan,         nan,     2185.64],
       [46154151.  ,         nan,         nan, ...,         nan,         nan,     3199.4 ],
       [66055249.  ,         nan,    10000.  , ...,         nan,         nan,      301.9 ]])

## Checking for Incomplete Data

In [10]:
np.isnan(raw_data).sum()

88005

In [11]:
# Placeholder for missing numeric entries: one more than the largest non-NaN value in raw_data.
temporary_fill = 1 + np.nanmax(raw_data)

# Column-wise mean (axis=0) that skips NaN entries.
# A column with no numeric entries at all still yields NaN here.
temporary_mean = np.nanmean(raw_data, axis=0)

  temporary_mean = np.nanmean(raw_data, axis = 0)


In [12]:
temporary_mean

array([54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
            440.92,         nan,         nan,         nan,         nan,         nan,     3143.85])

In [13]:
# temporary_stat collects three per-column summaries as rows of one array:
#   row 0 -> minimum, row 1 -> mean, row 2 -> maximum (NaN entries are ignored).
temporary_stat = np.array([
    np.nanmin(raw_data, axis=0),
    temporary_mean,
    np.nanmax(raw_data, axis=0),
])

  temporary_stat = np.array([np.nanmin(raw_data, axis = 0),
  np.nanmax(raw_data, axis = 0)])


In [14]:
temporary_stat

array([[  373332.  ,         nan,     1000.  ,         nan,     1000.  ,         nan,        6.  ,
              31.42,         nan,         nan,         nan,         nan,         nan,        0.  ],
       [54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
             440.92,         nan,         nan,         nan,         nan,         nan,     3143.85],
       [68616519.  ,         nan,    35000.  ,         nan,    35000.  ,         nan,       28.99,
            1372.97,         nan,         nan,         nan,         nan,         nan,    41913.62]])

## Splitting the Dataset

### Splitting the columns

In [19]:
# column_strings: the positions in temporary_mean that hold NaN, i.e. the indices of the columns whose mean is NaN.
column_strings = np.argwhere(np.isnan(temporary_mean))
column_strings

array([[ 1],
       [ 3],
       [ 5],
       [ 8],
       [ 9],
       [10],
       [11],
       [12]], dtype=int64)

In [20]:
# Indices of the columns whose mean is NaN, as a 1-dimensional array.
# np.flatnonzero always returns a 1-D index array; argwhere(...).squeeze() would
# collapse to a 0-d array if exactly one column matched.
column_strings = np.flatnonzero(np.isnan(temporary_mean))
column_strings

array([ 1,  3,  5,  8,  9, 10, 11, 12], dtype=int64)

In [21]:
# Indices of the columns that have a numeric mean (their mean is not NaN).
# `~mask` is the idiomatic negation of a boolean array, and np.flatnonzero keeps
# the result 1-dimensional even when only a single column qualifies.
column_numeric = np.flatnonzero(~np.isnan(temporary_mean))
column_numeric

array([ 0,  2,  4,  6,  7, 13], dtype=int64)

### Re-importing the Dataset

In [22]:
# Second pass over the CSV: load only the text columns, keeping every field as a string.
#   usecols=column_strings -> restricts the read to those column indices
#   dtype=str              -> fields are stored as text instead of being parsed as numbers
loan_data_strings = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=column_strings,
    dtype=str,
)
loan_data_strings

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

In [23]:
# Third pass over the CSV: load only the numeric columns.
#   usecols=column_numeric        -> restricts the read to those column indices
#   filling_values=temporary_fill -> missing entries receive the placeholder value
loan_data_numeric = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=column_numeric,
    filling_values=temporary_fill,
)
loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

### Name of the Columns

In [25]:
# header_all stores every column name, taken from the first line of the CSV.
# max_rows=1 stops reading after that line, so the header is obtained without
# parsing the data rows and without depending on the row count of raw_data.
header_all = np.genfromtxt('loan-data.csv',
                           delimiter=';',
                           max_rows=1,
                           autostrip=True,
                           dtype=str)
header_all

array(['id', 'issue_d', 'loan_amnt', 'loan_status', 'funded_amnt', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state',
       'total_pymnt'], dtype='<U19')

In [26]:
# Split the column names the same way the data columns were split.
header_strings = header_all[column_strings]
header_numeric = header_all[column_numeric]

In [27]:
header_strings

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [28]:
header_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

### Creating the Checkpoints

In [29]:
#checkpoint function, which saves data and headers into a NumPy .npz file and returns the loaded checkpoint variable.

def checkpoint(file_name, checkpoint_header, checkpoint_data):
    """Save a header/data pair to an .npz archive and return the reloaded archive.

    Parameters
    ----------
    file_name : str
        Target path, with or without the '.npz' extension.
    checkpoint_header : array_like
        Stored in the archive under the key 'header'.
    checkpoint_data : array_like
        Stored in the archive under the key 'data'.

    Returns
    -------
    The object returned by np.load for the written archive; its arrays are
    available as result['header'] and result['data'].
    """
    # np.savez appends '.npz' only when the name does not already end with it,
    # so resolve the final path once and use it for both the save and the load.
    archive_path = file_name if file_name.endswith('.npz') else file_name + '.npz'
    np.savez(archive_path, header=checkpoint_header, data=checkpoint_data)
    checkpoint_variable = np.load(archive_path)
    return checkpoint_variable

In [30]:
checkpoint_test = checkpoint('checkpoint_test', header_strings, loan_data_strings)

In [31]:
checkpoint_test['header']

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [32]:
checkpoint_test['data']

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

In [33]:
np.array_equal(checkpoint_test['data'], loan_data_strings)

True

## Manipulating String Columns

In [34]:
header_strings

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [35]:
header_strings[0] = 'issue_date'

In [36]:
header_strings

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

### Issue Date

In [37]:
# Select every row (:) of the first column (0) of loan_data_strings.
loan_data_strings[:, 0]

array(['May-15', '', 'Sep-15', ..., 'Jun-15', 'Apr-15', 'Dec-15'], dtype='<U69')

In [38]:
np.unique(loan_data_strings[:, 0])

array(['', 'Apr-15', 'Aug-15', 'Dec-15', 'Feb-15', 'Jan-15', 'Jul-15', 'Jun-15', 'Mar-15',
       'May-15', 'Nov-15', 'Oct-15', 'Sep-15'], dtype='<U69')

In [40]:
# np.char.strip removes leading and trailing characters from every element of a string array.
# The second argument is a *set* of characters, not a suffix: any '-', '1' or '5' at either
# end of an element is removed, which turns values such as 'May-15' into 'May'.
# np.char.strip is the documented free function; np.chararray is a legacy class.

loan_data_strings[:, 0] = np.char.strip(loan_data_strings[:, 0], '-15')

In [42]:
np.unique(loan_data_strings[:, 0])

array(['', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'],
      dtype='<U69')

In [43]:
months = np.array(['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',])

In [44]:
# Replace each month abbreviation with its position in `months`, stored as text
# ('' -> '0', 'Jan' -> '1', ..., 'Dec' -> '12').
# Iterating over `months` itself keeps the loop in step with the lookup table, and
# str(...) makes the number-to-text conversion explicit because the column holds strings.
for month_number, month_name in enumerate(months):
    loan_data_strings[:, 0] = np.where(loan_data_strings[:, 0] == month_name,
                                       str(month_number),
                                       loan_data_strings[:, 0])
    

In [45]:
np.unique(loan_data_strings[:, 0])

array(['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U69')

### Loan Status

In [47]:
header_strings

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [48]:
np.unique(loan_data_strings[:, 1])

array(['', 'Charged Off', 'Current', 'Default', 'Fully Paid', 'In Grace Period', 'Issued',
       'Late (16-30 days)', 'Late (31-120 days)'], dtype='<U69')

In [49]:
np.unique(loan_data_strings[:, 1]).size

9

In [50]:
status_bad = np.array(['', 'Charged Off', 'Default', 'Late (31-120 days)'])

In [51]:
# Encode the loan_status column as a flag: 0 = bad, 1 = good.
# np.isin marks the rows whose status appears in status_bad; negating that mask
# lets np.where emit 1 for every status not in status_bad and 0 for the rest.

loan_data_strings[:, 1] = np.where(
    ~np.isin(loan_data_strings[:, 1], status_bad),
    1,
    0,
)

In [52]:
np.unique(loan_data_strings[:, 1])

array(['0', '1'], dtype='<U69')

### Term

In [53]:
header_strings

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [54]:
np.unique(loan_data_strings[:, 2])

array(['', '36 months', '60 months'], dtype='<U69')

In [55]:
# Strip the unit from the term column ('36 months' -> '36').
# The second argument of np.char.strip is a set of characters (space, m, o, n, t, h, s)
# removed from both ends of each element; np.char.strip is the documented free function,
# whereas np.chararray is a legacy class.
loan_data_strings[:, 2] = np.char.strip(loan_data_strings[:, 2], ' months')
loan_data_strings[:, 2]

array(['36', '36', '36', ..., '36', '36', '36'], dtype='<U69')

In [56]:
header_strings[2] = 'term_months'
header_strings

array(['issue_date', 'loan_status', 'term_months', 'grade', 'sub_grade', 'verification_status',
       'url', 'addr_state'], dtype='<U19')

In [57]:
# Rows with an empty term get the worst-case value of '60' months;
# the boolean mask selects exactly those rows in column 2.
loan_data_strings[loan_data_strings[:, 2] == '', 2] = '60'

In [58]:
np.unique(loan_data_strings[:, 2])

array(['36', '60'], dtype='<U69')