## Importing the Packages

In [7]:
import numpy as np

In [8]:
# Print floats in plain (non-scientific) notation with 2 decimals, and allow
# up to 100 characters per output line before wrapping.
np.set_printoptions(
    suppress=True,
    linewidth=100,
    precision=2,
)

## Importing the Dataset

In [9]:
# Load the whole CSV with genfromtxt's default (float) dtype; fields that
# cannot be parsed as numbers come back as NaN.
#   delimiter=';'   -> fields are separated by semicolons
#   skip_header=1   -> ignore the first line of the file
#   autostrip=True  -> strip leading/trailing whitespace from each field
raw_data = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
)
raw_data

array([[48010226.  ,         nan,    35000.  , ...,         nan,         nan,     9452.96],
       [57693261.  ,         nan,    30000.  , ...,         nan,         nan,     4679.7 ],
       [59432726.  ,         nan,    15000.  , ...,         nan,         nan,     1969.83],
       ...,
       [50415990.  ,         nan,    10000.  , ...,         nan,         nan,     2185.64],
       [46154151.  ,         nan,         nan, ...,         nan,         nan,     3199.4 ],
       [66055249.  ,         nan,    10000.  , ...,         nan,         nan,      301.9 ]])

## Checking for Incomplete Data

In [10]:
# Total number of NaN entries across the whole array.
np.sum(np.isnan(raw_data))

88005

In [11]:
# Per-column mean that ignores NaNs (axis=0 aggregates down each column).
# A column that is NaN in every row has no mean and stays NaN.
temporary_mean = np.nanmean(raw_data, axis=0)

# Placeholder used later for missing numeric entries: one more than the
# largest non-NaN value in the whole array, so it cannot match a real value.
temporary_fill = np.nanmax(raw_data) + 1

  temporary_mean = np.nanmean(raw_data, axis = 0)


In [12]:
# Show the per-column means; NaN marks a column with no numeric value at all.
temporary_mean

array([54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
            440.92,         nan,         nan,         nan,         nan,         nan,     3143.85])

In [13]:
# temporary_stat stacks three per-column statistics (NaNs ignored):
#   row 0 -> minimum, row 1 -> mean, row 2 -> maximum
temporary_stat = np.vstack((
    np.nanmin(raw_data, axis=0),
    temporary_mean,
    np.nanmax(raw_data, axis=0),
))

  temporary_stat = np.array([np.nanmin(raw_data, axis = 0),
  np.nanmax(raw_data, axis = 0)])


In [14]:
# Show the min / mean / max table (one row per statistic, one column per CSV column).
temporary_stat

array([[  373332.  ,         nan,     1000.  ,         nan,     1000.  ,         nan,        6.  ,
              31.42,         nan,         nan,         nan,         nan,         nan,        0.  ],
       [54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
             440.92,         nan,         nan,         nan,         nan,         nan,     3143.85],
       [68616519.  ,         nan,    35000.  ,         nan,    35000.  ,         nan,       28.99,
            1372.97,         nan,         nan,         nan,         nan,         nan,    41913.62]])

## Splitting the Dataset

### Splitting the columns

In [19]:
# Positions in temporary_mean whose value is NaN.
# np.argwhere returns one row per match, so the result has shape (n, 1).
column_strings = np.argwhere(
    np.isnan(temporary_mean)
)
column_strings

array([[ 1],
       [ 3],
       [ 5],
       [ 8],
       [ 9],
       [10],
       [11],
       [12]], dtype=int64)

In [20]:
# Flatten the (n, 1) result of argwhere into a 1-D array of column indices.
# Only axis 1 is squeezed: a bare squeeze() would also drop axis 0 when exactly
# one column matches, leaving a 0-d array instead of a 1-element index array.
column_strings = np.argwhere(np.isnan(temporary_mean)).squeeze(axis=1)
column_strings

array([ 1,  3,  5,  8,  9, 10, 11, 12], dtype=int64)

In [21]:
# Indices of the columns whose mean is a number, i.e. columns holding at least
# one numeric value. `~` negates the NaN mask (instead of comparing to False),
# and squeeze(axis=1) keeps the result 1-D even when only one column matches.
column_numeric = np.argwhere(~np.isnan(temporary_mean)).squeeze(axis=1)
column_numeric

array([ 0,  2,  4,  6,  7, 13], dtype=int64)

### Re-importing the Dataset

In [22]:
# Re-read only the columns listed in column_strings, this time as text
# (dtype=str) rather than floats.
loan_data_strings = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=column_strings,
    dtype=str,
)
loan_data_strings

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

In [23]:
# Re-read only the columns listed in column_numeric as floats. Missing entries
# are replaced by temporary_fill (filling_values) instead of being left as NaN.
loan_data_numeric = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_header=1,
    autostrip=True,
    usecols=column_numeric,
    filling_values=temporary_fill,
)
loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

### Name of the Columns

In [None]:
# header_all holds the name of every column in the file.
# skip_footer=raw_data.shape[0] drops as many trailing lines as there are data
# rows, leaving only the first (header) line to be read.
# No usecols here: restricting to column_strings would return only the
# string-column names, not all of them.
header_all = np.genfromtxt(
    'loan-data.csv',
    delimiter=';',
    skip_footer=raw_data.shape[0],
    autostrip=True,
    dtype=str,
)
header_all