In [1]:
import numpy as np

### Checking for Missing Values

In [2]:
lending_com_data = np.loadtxt("Lending-company-Numeric.csv", 
                              delimiter = ',')
#np.loadtxt raises an error when a field is empty,
#so a successful load already tells us this file has no missing values

In [3]:
np.isnan(lending_com_data).sum()
#np.isnan returns a boolean array: True where a value is missing (NaN), False otherwise
#.sum() counts the True entries, because True counts as 1 and False as 0
#a total of 0 means this data set has no missing values

0

In [4]:
#np.loadtxt cannot convert an empty field to a number, so it fails on a file with gaps
#the error is caught and printed so the failure is visible without stopping "Run All"
try:
    np.loadtxt("Lending-company-Numeric-NAN.csv",
               delimiter = ';')
except ValueError as load_error:
    #reaching this branch confirms that the file contains missing values
    print(f"np.loadtxt failed: {load_error}")

In [5]:
lending_com_data_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                              delimiter = ';')
#np.genfromtxt doesn't show errors

In [6]:
np.isnan(lending_com_data_NAN).sum()

260

In [7]:
#one way of filling the missing values is
lending_com_data_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                     delimiter = ';',
                                    filling_values = 0)

In [8]:
np.isnan(lending_com_data_NAN).sum()
#now all the missing values have been replaced with 0
#but 0 can also be a genuine value in the data, so filled entries cannot be told apart from real ones
#we should use a different fill value instead

0

In [9]:
lending_com_data_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                     delimiter = ';')
temporary_fill = np.nanmax(lending_com_data_NAN).round(2) + 1
#here we are creating a temporary fill value
#this value is greater than the biggest value in the data set by 1, so it cannot clash with real data
#a good practice is to round off the numbers when working with floats

In [10]:
temporary_fill

64002.0

In [11]:
lending_com_data_NAN = np.genfromtxt('Lending-company-Numeric-NAN.csv',
                                    delimiter = ';',
                                    filling_values = temporary_fill)

In [12]:
np.isnan(lending_com_data_NAN).sum()

0

### Substituting Missing Values

Up till now we have placed the biggest value + 1 of the array in place of the missing values. Left as it is, this placeholder would badly distort any statistic computed from the data. A more common approach is to substitute each missing value with the mean of its column, because this leaves the column mean unchanged (note that it does slightly reduce the spread of the column).

In [13]:
lending_com_data_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                    delimiter = ';')
lending_com_data_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [14]:
temporary_mean = np.nanmean(lending_com_data_NAN,
                           axis = 0).round(2)

In [15]:
temporary_mean[0]

2250.25

In [16]:
temporary_fill = np.nanmax(lending_com_data_NAN).round(2) + 1

lending_com_data_NAN = np.genfromtxt('Lending-company-Numeric-NAN.csv',
                                    delimiter = ';',
                                    filling_values = temporary_fill)

In [17]:
temporary_fill

64002.0

In [18]:
np.mean(lending_com_data_NAN[:,0]).round(2)

4263.25

In [19]:
temporary_mean[0]

2250.25

In [20]:
lending_com_data_NAN[:,0] = np.where(lending_com_data_NAN[:,0] == temporary_fill,
                                    temporary_mean[0],
                                    lending_com_data_NAN[:,0])

The above code can be broken down into the following steps:

1.) We check the 1st column of the data set; wherever it contains a value equal to temporary_fill (64002.0 in our case), we replace that value with the mean of the 1st column, temporary_mean[0], which we stored above

2.) If the value is not equal to 64002.0, it stays the same

In [21]:
np.mean(lending_com_data_NAN[:,0]).round(2)
#this shows that substituting missing values with the column mean leaves the mean of that column unchanged

2250.25

In [22]:
#to do this for all the columns at once we rely on broadcasting instead of a for loop:
#temporary_mean holds one mean per column, so np.where pairs every placeholder
#(a value equal to temporary_fill) with the mean of the column it sits in
#values that are not placeholders are kept unchanged
lending_com_data_NAN = np.where(lending_com_data_NAN == temporary_fill,
                                temporary_mean,
                                lending_com_data_NAN)

Now all the missing values have been changed from 64002.0 to the respective mean of the columns

In [23]:
#negative values are replaced with 0 across the whole array in one vectorized call
#every non-negative value is kept as it is
lending_com_data_NAN = np.where(lending_com_data_NAN < 0,
                                0,
                                lending_com_data_NAN)

This changes all the negative value to zero

### Reshaping Ndarrays

In [24]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')

In [25]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

<b>RESHAPING:</b>

Is the act of changing the shape of an object in a certain way. Usually it is used when a certain condition needs to be met. It's not always possible to store the output of a function as a part of an existing array (or Series).

Yes, there are certain restrictions on the shape we can give to an array, since we have a fixed amount of data available.

In [26]:
lending_co_data_numeric.shape

(1043, 6)

In [27]:
np.reshape(lending_co_data_numeric, (6, 1043))
#reshape reads the elements of the array in flattened (row by row) order
#then it fills the 1st new row with the first 1043 entries
#then the 2nd row with the next 1043, and so on...
#so it does not turn rows into columns...for that purpose transpose is used

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [28]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [29]:
#np.reshape(lending_co_data_numeric,(3,500))
#this code will show an error
#because the new shape holds only 3*500 = 1500 elements, while the product of the new
#dimensions must be exactly the number of elements in the array, m*n = 1043*6 = 6258

In [30]:
np.reshape(lending_co_data_numeric,(3,2086))

array([[ 2000.,    40.,   365., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  4601.,  4601., 16600.]])

In [31]:
np.reshape(lending_co_data_numeric,(2,3,1043))
#you can change the dimesions of array

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [32]:
#another example to change the dimension is
np.reshape(lending_co_data_numeric,(1,1,2,3,1043))

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

###### Adding dimension is useful when a method or function only takes inputs with a higher number of dimensions than the array we want to plug in

In [33]:
lending_co_data_numeric
#so reshaping doesn't affect the original dataset
#it only returns the same data arranged in a new shape
#if you want to use the reshaped data then you need to store it separately

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [34]:
#another way to reshape data is to use .reshape()
lending_co_data_numeric.reshape(6,1043)

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

### Removing Values

In [35]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')

In [36]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
#without an axis, np.delete flattens the array and removes only the single element at index 0
np.delete(lending_co_data_numeric, 0).shape

(6257,)

In [38]:
lending_co_data_numeric.size
#since we have not stored the deleted array the value is not changed

6258

In [39]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [40]:
#when you want to delete a complete row or column
np.delete(lending_co_data_numeric, 0, axis = 0)
# axis = 0 delete rows
#axis = 1 delete columns

array([[ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [41]:
np.delete(lending_co_data_numeric, 0, axis = 1)

array([[   40.,   365.,  3121.,  4241., 13621.],
       [   40.,   365.,  3061.,  4171., 15041.],
       [   40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  4201.,  5001., 16600.],
       [   40.,   365.,  2080.,  3320., 15600.],
       [   40.,   365.,  4601.,  4601., 16600.]])

In [42]:
np.delete(lending_co_data_numeric, 1, axis = 1)
#you change the col no. and that column is deleted accordingly 

array([[ 2000.,   365.,  3121.,  4241., 13621.],
       [ 2000.,   365.,  3061.,  4171., 15041.],
       [ 1000.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,   365.,  4201.,  5001., 16600.],
       [ 1000.,   365.,  2080.,  3320., 15600.],
       [ 2000.,   365.,  4601.,  4601., 16600.]])

In [43]:
np.delete(lending_co_data_numeric, (0,2,4), axis = 1)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [44]:
#to delete rows and columns together, nest two np.delete calls:
#the inner call drops the rows 0, 2 and -1 (the last one),
#the outer call then drops the columns 0, 2 and 4 from what is left
np.delete(np.delete(lending_co_data_numeric, [0, 2, -1], axis = 0),
          [0, 2, 4],
          axis = 1)

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

### Sorting Data

In [45]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [46]:
np.sort(lending_co_data_numeric)
#here np.sort has sorted the values within each row in ascending order
#as the default axis of sort is -1 (the last axis)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [47]:
np.sort(lending_co_data_numeric).shape
#the dimensions are not changed by the sort function

(1043, 6)

In [48]:
#The proof of the above statement
lending_co_data_numeric.shape

(1043, 6)

In [49]:
np.sort(lending_co_data_numeric, axis = 0)
#now the columns have sorted 
#but since we can't see them properly...

array([[ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.8700e+03, -2.8700e+03,
        -3.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.5500e+03, -2.1000e+03,
         1.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.4500e+03, -2.0000e+03,
         1.1000e+03],
       ...,
       [ 9.0000e+03,  1.2500e+02,  3.6500e+02,  1.6751e+04,  1.8751e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.7650e+04,  2.0001e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.9001e+04,  2.2001e+04,
         6.4001e+04]])

In [50]:
np.set_printoptions(suppress = True)
#THIS SETTING APPLIES ON ALL WORK AFTER THIS
#SO PLEASE BE CAREFUL WHILE USING IT

In [51]:
np.sort(lending_co_data_numeric, axis = 0)
#now it's clearly visible that the numbers in columns have changed
#rather sorted

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [52]:
np.sort(lending_co_data_numeric, axis = None)
#creates a flattened array with all values sorted in accending

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [53]:
#since the np.sort has no funtionality to make the array in decending
#we can use the following trick
-np.sort(-lending_co_data_numeric)
#the - inside np.sort negates the values, so the largest original value gets sorted first
#the - in front of np.sort multiplies the output by -1 to restore the original signs

array([[13621.,  4241.,  3121.,  2000.,   365.,    40.],
       [15041.,  4171.,  3061.,  2000.,   365.,    40.],
       [15340.,  3280.,  2160.,  1000.,   365.,    40.],
       ...,
       [16600.,  5001.,  4201.,  2000.,   365.,    40.],
       [15600.,  3320.,  2080.,  1000.,   365.,    40.],
       [16600.,  4601.,  4601.,  2000.,   365.,    40.]])

In [54]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [55]:
np.sort(lending_co_data_numeric[:, 3])
#np.sort doesn't affect the data
#if we want to work with the sorted data we need to store it

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [56]:
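#but if we want to sort the original data itself we use ndarray.sort(), which works in place
#careful: with axis = 0 every column is sorted on its own, so the values of a row no longer belong together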
lending_co_data_numeric.sort(axis = 0)
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

### Argument Functions

In [57]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [58]:
np.argsort(lending_co_data_numeric)
#np.argsort returns the indices that would sort the data
#(here along each row) instead of the sorted values themselves

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]], dtype=int64)

In [59]:
np.sort(lending_co_data_numeric)
#by compairing this with the above array the same can be proved

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [60]:
np.argsort(lending_co_data_numeric,
          axis = 0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]], dtype=int64)

In [61]:
lending_co_data_numeric[482,5]
#checking for the 1st entery in the last column

-350.0

In [62]:
np.sort(lending_co_data_numeric, axis = 0)
#hence PROVED

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [63]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric
#when you want to store your result in the actual data
#here [] is indexing: passing the output of argsort reorders whole rows by the values in the 1st column

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [64]:
#we can also use ndarray.argsort() in place of np.argsort()
#unlike ndarry.sort() this doesn't change the actual data
lending_co_data_numeric.argsort(axis=0)

array([[   0,   22,    0,  199,  199,  172],
       [ 155,   62,  687,   53,   53,  160],
       [ 156,   38,  688,  169,  169,   53],
       ...,
       [1022, 1042,  355, 1024, 1037, 1023],
       [1031, 1039,  357,  941, 1029, 1024],
       [1042, 1040, 1042, 1027, 1027, 1040]], dtype=int64)

In [65]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### np.argwhere()

In [66]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [67]:
np.argwhere(lending_co_data_numeric)
#np.argwhere() goes over the entire NDarray
# and checks whether the individual element satisfies given condition
#the output are the indices for all the individual elements where the condition is met
#default condition is to check for values different from 0

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]], dtype=int64)

In [68]:
#find the positions of the values that are exactly 0
#(comparing a float array with 0 states the intent directly; == False selects the same elements)
np.argwhere(lending_co_data_numeric == 0)

array([[116,   4],
       [430,   3]], dtype=int64)

In [69]:
lending_co_data_numeric[116]
#So the output is correct

array([ 1000.,    50.,   365., -1450.,     0., 13850.])

In [70]:
np.argwhere(lending_co_data_numeric < 1000)
#and other condtions as well like to check even numbers % 2 == 0, etc

array([[   0,    1],
       [   0,    2],
       [   1,    1],
       ...,
       [1041,    2],
       [1042,    1],
       [1042,    2]], dtype=int64)

In [71]:
np.isnan(lending_co_data_numeric).sum()

0

In [72]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                           delimiter = ';')
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [73]:
np.isnan(lending_co_data_numeric_NAN)
#np.isnan marks each NaN with True in a boolean array of the same shape
#but it does not give us the positions (indices) of the NaN values directly

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [74]:
#So we use this approch...
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [75]:
lending_co_data_numeric_NAN[11]
#hence proved

array([ 2000.,    50.,   365.,    nan,  4190., 10790.])

In [76]:
#visit every position that holds a NaN and overwrite the value there with 0
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
    #array_index is one (row, column) pair; as a tuple it addresses a single element
    lending_co_data_numeric_NAN[tuple(array_index)] = 0

###### The explanation of the above code is as follows:

1.) The for loop iterates over <b>np.argwhere(np.isnan(lending_co_data_numeric_NAN))</b>, which contains the coordinates/indexes/positions of all the NaN values; array_index takes each of these positions in turn

2.) In the loop body, lending_co_data_numeric_NAN[] is indexed with that position and the NaN found there is replaced with a 0

In [77]:
lending_co_data_numeric_NAN[11]
#we have successfully changed all NaN values to 0

array([ 2000.,    50.,   365.,     0.,  4190., 10790.])

In [78]:
#just to cross check
np.isnan(lending_co_data_numeric_NAN).sum()

0

### Shuffling Data

In [79]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')[:8]
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [80]:
np.random.shuffle(lending_co_data_numeric)
#np.random.shuffle() takes an ndarray and shuffles its rows in place
#it returns nothing; the shuffled array overwrites the original one

In [81]:
lending_co_data_numeric

array([[ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.]])

In [82]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric
#you can shuffle the dataset as many time as you want

array([[ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.]])

In [83]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv",
                                    delimiter = ',')

In [84]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [85]:
#usually it is advised that if you are going to use the same function
#or method many times, you import it directly
from numpy.random import shuffle

In [86]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3400.,  4240., 16240.],
       [ 4000.,    50.,   365.,  5300.,  6800., 18900.],
       [ 1000.,    40.,   365.,  2130.,  3510., 14350.],
       ...,
       [ 1000.,    40.,   365.,  2120.,  3440., 15600.],
       [ 4000.,    50.,   365.,  5700.,  7200., 22250.],
       [ 1000.,    40.,   365.,  3500.,  3500., 15600.]])

In [87]:
#another way of doing shuffling is to use random data generator
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

array_RG = gen(pcg())

In [88]:
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric
#this another way of shuffling

array([[ 1000.,    50.,   365., -1900.,  -400., 14400.],
       [ 1000.,    40.,   365.,  3100.,  3700., 12720.],
       [ 1000.,    40.,   365.,  3130.,  4430., 15600.],
       ...,
       [ 1000.,    40.,   365.,  2200.,  4600., 15600.],
       [ 4000.,    50.,   365.,  5300.,  6600., 18950.],
       [ 1000.,    40.,   365.,  3020.,  5020., 15600.]])

In [89]:
#fixing the seed does not stop the shuffling from happening
#it only makes the generator produce the same sequence of random numbers every time it is created
array_RG = gen(pcg(seed = 123))
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  4901.,  5101., 16191.],
       [ 1000.,    40.,   365.,  2240.,  3680., 15600.],
       [ 1000.,    40.,   365.,  2080.,  3960., 15600.],
       ...,
       [ 9000.,   125.,   365., 13001., 16726., 54625.],
       [ 2000.,    50.,   365.,  1550.,  3050., 20250.],
       [ 2000.,    50.,   365.,  3400.,  4850., 19750.]])