## Importing Data with NumPy

In [1]:
import numpy as np

### np.loadtxt() vs np.genfromtxt()

2 ways of importing CSV-files:

1. np.loadtxt()

    - data only needs to be 'loaded' for use: data can be directly imported and used
    - faster, but less flexibility

2. np.genfromtxt()

    - dataset needs to be 'generated': data needs to be put in array while going through text file
    - slower, more flexibility

In [2]:
lending_co_data_numeric_1 = np.loadtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_1

# We can use Notepad++ to determine delimiters

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [3]:
lending_co_data_numeric_2 = np.genfromtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
np.array_equal(lending_co_data_numeric_1, lending_co_data_numeric_2)

## np.array_equal() compares 2 (or more) arrays and tells us if whether they're identical
# True = both are the same

True

In [5]:
lending_co_data_numeric_NAN = np.loadtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN


# np.loadtxt() faster but less flexible:
# ex: np.loadtxt() fails to import incomplete datasets/missing values/NAN by default

ValueError: could not convert string '' to float64 at row 11, column 4.

In [6]:
# we can solve issue in 2 ways:

# 1) by using np.genfromtxt()

lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

# we see 'nan' where it did not find numeric value
# np.genfromtxt() works well with missing values

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [7]:
# 2) keep using np.loadtxt() but ask Python to import everything as symbols instead of as numbers
# concretely this is done by adding argument 'dtype = np.str_'
# means we can check all our data, but cannot execute computations

lending_co_data_numeric_NAN = np.loadtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                         delimiter = ';',
                                         dtype = np.str_)
lending_co_data_numeric_NAN

# If we import all the values as text, then we don't get a type inconcsistency, so we can use np.loadtxt()

array([['2000', '40', '365', '3121', '4241', '13621'],
       ['2000', '40', '365', '3061', '4171', '15041'],
       ['1000', '40', '365', '2160', '3280', '15340'],
       ...,
       ['', '40', '365', '4201', '5001', '16600'],
       ['1000', '40', '365', '2080', '3320', '15600'],
       ['2000', '40', '365', '4601', '4601', '16600']], dtype='<U5')

In [8]:
lending_co_data_numeric_NAN[0,0] + lending_co_data_numeric_NAN[0,1]

# Adding '2000' and '40' results in a concatenated '200040' rather than 2040

'200040'

### Partial Cleaning While Importing

In [9]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [10]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            skip_header = 2) 
lending_co_data_numeric_NAN

# skip_header omits first 2 ROWS from the top of the text file
# many different parameters to tweak how you want to import data

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [11]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            skip_footer = 2) 
lending_co_data_numeric_NAN

# skip_footer omits 2 ROWS from the bottom of the text file

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  3401.,    nan, 16600.],
       [ 2000.,    40.,   365.,    nan,  5440., 16600.],
       [   nan,    40.,   365.,  4201.,  5001., 16600.]])

In [12]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            usecols = (5,0,1)) 
lending_co_data_numeric_NAN

# use_cols tells the function to only take the following COLUMNS based on their indices
# we can specify the order in which we want the columns to appear

array([[13621.,  2000.,    40.],
       [15041.,  2000.,    40.],
       [15340.,  1000.,    40.],
       ...,
       [16600.,    nan,    40.],
       [15600.,  1000.,    40.],
       [16600.,  2000.,    40.]])

In [13]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            usecols = (5,0,1), 
                                            skip_header = 2, 
                                            skip_footer = 2) 
lending_co_data_numeric_NAN

# We can define all these arguments (and many more) together to only import what we want

array([[15340.,  1000.,    40.],
       [15321.,  2000.,    40.],
       [13720.,  2000.,    50.],
       ...,
       [16600.,  2000.,    40.],
       [16600.,  2000.,    40.],
       [16600.,    nan,    40.]])

In [14]:
lending_co_data_5, lending_co_data_0, lending_co_data_1 = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                                                        delimiter = ';',
                                                                        usecols = (5,0,1), 
                                                                        skip_header = 2, 
                                                                        skip_footer = 2, 
                                                                        unpack = True)
print(lending_co_data_5)
print(lending_co_data_0)
print(lending_co_data_1)

# Unpacking allows us to split the output array into smaller 1-D arrays
# order of names and order of outputted columns should be same

[15340. 15321. 13720. ... 16600. 16600. 16600.]
[1000. 2000. 2000. ... 2000. 2000.   nan]
[40. 40. 50. ... 40. 40. 40.]


### String vs Object vs Numbers

In [15]:
# The same dataset is imported differently based on the datatype we define

# way 1: default, non-numerical values rendered in np.genfromtxt as 'nan'

lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',')
print(lending_co_lt)

[[      nan       nan       nan ...       nan       nan       nan]
 [1.000e+00       nan       nan ...       nan       nan 1.660e+04]
 [2.000e+00       nan       nan ...       nan       nan 1.660e+04]
 ...
 [1.041e+03       nan       nan ...       nan       nan 1.660e+04]
 [1.042e+03       nan       nan ...       nan       nan 1.560e+04]
 [1.043e+03       nan       nan ...       nan       nan 1.660e+04]]


In [16]:
# way 2: rendered as integer the 'nan' values are transformed into -1, while scientific notation is rendered as int
# therefore: watch out when doing computations on a preprocessed dataset with missing values!

lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',',
                              dtype = np.int32
                              #dtype = np.float16
                              #dtype = np.str_
                              #dtype = np.object
                              #dtype = (np.int32, np.str_, np.str_, np.str_, np.str_, np.str_, np.int32)
                             )
print(lending_co_lt)

[[   -1    -1    -1 ...    -1    -1    -1]
 [    1    -1    -1 ...    -1    -1 16600]
 [    2    -1    -1 ...    -1    -1 16600]
 ...
 [ 1041    -1    -1 ...    -1    -1 16600]
 [ 1042    -1    -1 ...    -1    -1 15600]
 [ 1043    -1    -1 ...    -1    -1 16600]]


In [17]:
# way 3: rendered as string
# becomes clear why the 'nan' values could not be interpreted as numeric values

lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',',
                              #dtype = np.int32
                              #dtype = np.float16
                              dtype = np.str_
                              #dtype = np.object
                              #dtype = (np.int32, np.str_, np.str_, np.str_, np.str_, np.str_, np.int32)
                             )
print(lending_co_lt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [18]:
# way 4: rendered as object
# similar output to string, but 'b' indicates that we are concerned with objects
# necessary for backward compatibility with older versions of np

lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',',
                              #dtype = np.int32
                              #dtype = np.float16
                              #dtype = np.str_
                              dtype = object
                              #dtype = (np.int32, np.str_, np.str_, np.str_, np.str_, np.str_, np.int32)
                             )
print(lending_co_lt)

[[b'LoanID' b'StringID' b'Product' ... b'Location' b'Region'
  b'TotalPrice']
 [b'1' b'id_1' b'Product B' ... b'Location 2' b'Region 2' b'16600.0']
 [b'2' b'id_2' b'Product B' ... b'Location 3' b'' b'16600.0']
 ...
 [b'1041' b'id_1041' b'Product B' ... b'Location 23' b'Region 4'
  b'16600.0']
 [b'1042' b'id_1042' b'Product C' ... b'Location 52' b'Region 6'
  b'15600.0']
 [b'1043' b'id_1043' b'Product B' ... b'Location 142' b'Region 6'
  b'16600.0']]


In [19]:
# way 5: determine dtype per column

lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',',
                              #dtype = np.int32
                              #dtype = np.float16
                              #dtype = np.str_
                              #dtype = object
                              dtype = (np.int32, np.str_, np.str_, np.str_, np.str_, np.str_, np.int32)
                             )
print(lending_co_lt)

[(  -1, '', '', '', '', '',    -1) (   1, '', '', '', '', '', 16600)
 (   2, '', '', '', '', '', 16600) ... (1041, '', '', '', '', '', 16600)
 (1042, '', '', '', '', '', 15600) (1043, '', '', '', '', '', 16600)]
