In [1]:
import numpy as np
import os

In [82]:
# Folder holding the CSV sources; abspath resolves the relative path against the current working directory
dir_sources = os.path.abspath(os.path.join('data_analyst', 'Resources', 'Data'))
# File names of the three datasets used throughout the notebook
data_source, data_source_nan, data_source_mixed = (
    'Lending-company-Numeric.csv',
    'Lending-company-Numeric-NAN.csv',
    'Lending-Company-Saving.csv',
)
# Empty placeholders; load_sources() rebinds these names to the loaded arrays
data, data_nan, data_mixed = [], [], []

In [89]:
def load_sources():
    """(Re)load the three CSV sources into the notebook-level arrays.

    Rebinds the globals ``data``, ``data_nan`` and ``data_mixed``; returns nothing.
    Calling it again discards any in-memory modification made to those arrays.
    """
    global data, data_nan, data_mixed

    def _source_path(file_name):
        # every source lives in the same folder
        return os.path.join(dir_sources, file_name)

    # comma separated, parsed with loadtxt
    data = np.loadtxt(_source_path(data_source), delimiter=',')
    # semicolon separated, parsed with genfromtxt
    data_nan = np.genfromtxt(_source_path(data_source_nan), delimiter=';')
    # comma separated, every field kept as str
    data_mixed = np.genfromtxt(_source_path(data_source_mixed), delimiter=',', dtype=str)
load_sources()

In [4]:
# Count the NaN entries of each loaded array
data_has_nan = np.sum(np.isnan(data))
data_nan_has_nan = np.sum(np.isnan(data_nan))
print(f'Clean file has {data_has_nan} nan',
      f'Not clean file has {data_nan_has_nan} nan',
      sep='\n')

Clean file has 0 nan
Not clean file has 260 nan


In [5]:
# One way to 'fix' NaN: let genfromtxt fill the missing fields with 0 while loading
nan_source_path = os.path.join(dir_sources, data_source_nan)
data_nan = np.genfromtxt(nan_source_path, filling_values=0, delimiter=';')
# number of NaN left after filling (displayed as the cell result)
np.isnan(data_nan).sum()

0

In [6]:
# Filling with 0 falls apart if the dataset already contains legitimate 0s.
# Instead we can fill with a sentinel value (for instance the highest value in the dataset + 1).
# For that, first load the data without any filling
nan_file = os.path.join(dir_sources, data_source_nan)
data_nan = np.genfromtxt(nan_file, delimiter=';')
# the nan-aware max is needed because the array still contains NaN
data_nan_maxval_plus_one = np.nanmax(data_nan).round(2) + 1
# store the nan-aware mean of each column
temporary_mean = np.nanmean(data_nan, axis=0).round(2)
# reload the file, this time filling the gaps with the sentinel
data_nan = np.genfromtxt(nan_file, delimiter=';', filling_values=data_nan_maxval_plus_one)
separator = '--------------------------------------------------------------------------------------------'
print(f'Number of NaN found after filling {np.isnan(data_nan).sum()}')
print(separator)
# Filling NaN with the sentinel distorts the statistics (mean, std, ...)
print(f'np.nanmean of dataset column 0 -> {temporary_mean[0]}')
print(f'{np.mean(data_nan[:,0]).round(2)} this is the value of distorted mean')
# The fix: swap every sentinel for the mean computed before filling
first_column = data_nan[:, 0]
is_sentinel = first_column == data_nan_maxval_plus_one
data_nan[:, 0] = np.where(is_sentinel, temporary_mean[0], first_column)
print(f'{np.mean(data_nan[:,0]).round(2)} this is the value after replacing the values for the actual mean')
print(separator)
# Now do the same for every column (column 0 no longer holds any sentinel)
for c, column_mean in enumerate(temporary_mean):
    column = data_nan[:, c]
    data_nan[:, c] = np.where(column == data_nan_maxval_plus_one, column_mean, column)
    # check the column mean against the mean stored before filling
    print(f'For column {c} does it match? {np.mean(data_nan[:,c]).round(2) == temporary_mean[c]}')

Number of NaN found after filling 0
--------------------------------------------------------------------------------------------
np.nanmean of dataset column 0 -> 2250.25
4263.25 this is the value of distorted mean
2250.25 this is the value after replacing the values for the actual mean
--------------------------------------------------------------------------------------------
For column 0 does it match? True
For column 1 does it match? True
For column 2 does it match? True
For column 3 does it match? True
For column 4 does it match? True
For column 5 does it match? True


In [7]:
def find_divisors(X):
    """Return every positive divisor of ``X`` in ascending order.

    Every integer from 1 to ``X`` inclusive is tried, so the result is an
    empty list when ``X`` is smaller than 1.
    """
    return [candidate for candidate in range(1, X + 1) if X % candidate == 0]

# re-shaping
# useful in the particular cases where you want to wrap/unwrap the rows
# rows/cols sizes can't be arbitrary: rows * cols must equal the number of elements,
# so the row count is picked among the divisors of the total size
total_size = data_nan.size
# pick an arbitrary divisor by its position in the ascending list of divisors
# (raises IndexError if the total size has fewer divisors than this position)
arbitrary_divisor = 5
divisor = find_divisors(total_size)[arbitrary_divisor]
new_shape_rows = divisor
# integer division keeps the result an exact int (no float round trip)
new_shape_cols = total_size // divisor
# the shape is passed positionally: the `newshape=` keyword is deprecated in recent NumPy
data_reshape = np.reshape(data_nan, (new_shape_rows, new_shape_cols))
data_reshape.shape

(14, 447)

In [8]:
# delete data
# the result of np.delete is not assigned, so data_nan is left as it was
print(data_nan.shape)
every_other_column = slice(None, None, 2)  # same object np.s_[::2] builds
np.delete(data_nan, every_other_column, axis=1).shape

(1043, 6)


(1043, 3)

In [9]:
# sorting rows/columns independently
# turn off scientific notation (AKA: 000e+0X) when printing arrays
np.set_printoptions(suppress=True)
# Notice np.sort treats every column (axis=0) / row (axis=1) individually,
# so values do not stay attached to their original row/column.
# To sort like a database table use np.argsort or np.lexsort (multi-col)
data_sorted_by_col = np.sort(data_nan, axis=0)
data_sorted_by_row = np.sort(data_nan, axis=1)
# To switch from ascending to descending order, negate twice:
# the inner negation flips the sign of every element, the ascending sort then
# runs on the negated values, and the outer negation restores the signs
data_sorted_by_row_desc = np.negative(np.sort(np.negative(data_nan), axis=1))
print(data_sorted_by_col, data_sorted_by_row, data_sorted_by_row_desc, sep='\n')
# if you need to keep the sorted output
# either assign it to a new variable or call the in-place .sort method on the matrix
# data_nan.sort()

[[ 1000.    35.   365. -2870. -2870.  -350.]
 [ 1000.    35.   365. -2550. -2100.  1100.]
 [ 1000.    35.   365. -2450. -1750.  1160.]
 ...
 [ 9000.   125.   365. 16001. 18250. 54625.]
 [ 9000.   165.   365. 16751. 18751. 54625.]
 [ 9000.   165.   365. 17650. 20001. 64001.]]
[[   40.     365.    2000.    3121.    4241.   13621.  ]
 [   40.     365.    2000.    3061.    4171.   15041.  ]
 [   40.     365.    1000.    2160.    3280.   15340.  ]
 ...
 [   40.     365.    2250.25  4201.    5001.   16600.  ]
 [   40.     365.    1000.    2080.    3320.   15600.  ]
 [   40.     365.    2000.    4601.    4601.   16600.  ]]
[[13621.    4241.    3121.    2000.     365.      40.  ]
 [15041.    4171.    3061.    2000.     365.      40.  ]
 [15340.    3280.    2160.    1000.     365.      40.  ]
 ...
 [16600.    5001.    4201.    2250.25   365.      40.  ]
 [15600.    3320.    2080.    1000.     365.      40.  ]
 [16600.    4601.    4601.    2000.     365.      40.  ]]


In [21]:
# Sorting matrices as dependent datasets (AKA tables)
# Very important to remember!
# np.lexsort takes a tuple of keys, e.g. (m[:,0], m[:,1]),
# but the LAST key in the tuple is the primary sort key:
# keys must be listed from least significant to most significant.
# In the sample below, sorting by NAME and then by RATE
# means passing the tuple as (RATE, NAME)
sample = np.array([['Anthony',5,8],
                  ['Ramon', 4,9],
                  ['Ramon', 2,7],
                  ['Julia',8,5],
                  ['Julia',2,3]])
name_col, rate_col = sample[:, 0], sample[:, 1]
# single-key ordering: row indices that sort the table by name
sorted_index_by_name = np.argsort(name_col)
# indexing with an index array already builds a new array
sample_with_index_by_name = sample[sorted_index_by_name]
print(sample_with_index_by_name)
# two-key ordering: name first, rate as tie-breaker (values are compared as strings)
sorted_index_by_name_and_rate = np.lexsort((rate_col, name_col))
sample_with_index_by_name_and_rate = sample[sorted_index_by_name_and_rate]
print(sorted_index_by_name_and_rate)
print(sample_with_index_by_name_and_rate)

[['Anthony' '5' '8']
 ['Julia' '8' '5']
 ['Julia' '2' '3']
 ['Ramon' '4' '9']
 ['Ramon' '2' '7']]
[0 4 3 2 1]
[['Anthony' '5' '8']
 ['Julia' '2' '3']
 ['Julia' '8' '5']
 ['Ramon' '2' '7']
 ['Ramon' '4' '9']]


In [35]:
# np.argwhere()
# Checks elements against a condition
# Returns an array holding ONLY the indices of the elements matching the condition
sample = np.array([['Anthony',5,8],
                  ['Ramon', 4,9],
                  ['Ramon', 2,7],
                  ['Julia',8,5],
                  ['Julia',2,3]])
# The condition is written as a plain comparison instead of being assembled as a
# string and passed to eval(); `arg` is kept only to label the printed output.
threshold = 'Elisa'
arg = f"< '{threshold}'"
# condition on the name column only -> one index per matching row
indices_column_name_matching = np.argwhere(sample[:,0] < threshold)
print('indices_column_name_matching ' + arg)
print(indices_column_name_matching)
# condition on the whole matrix -> one (row, col) pair per matching element
# (every element is a string here, so the numeric columns are compared as text)
indices_any_matching = np.argwhere(sample < threshold)
print('indices_any_matching ' + arg)
print(indices_any_matching)

indices_column_name_matching < 'Elisa'
[[0]]
indices_any_matching < 'Elisa'
[[0 0]
 [0 1]
 [0 2]
 [1 1]
 [1 2]
 [2 1]
 [2 2]
 [3 1]
 [3 2]
 [4 1]
 [4 2]]


In [57]:
# np.argwhere interesting interactions
# reload the sources so data_nan is rebuilt from the file
load_sources()
# we have NaN values in our dataset
count_nan = np.isnan(data_nan).sum()
print(f'{count_nan} values as NaN found')
# we can find all positions: one (row, col) pair per NaN
indices_nan = np.argwhere(np.isnan(data_nan))
print(f'{indices_nan.shape[0]} indices of NaN values')
# now we can use the indices to do something with those elements:
# the row and column index columns address every NaN in one vectorized assignment
# (no Python-level loop over the index pairs)
data_nan[indices_nan[:,0], indices_nan[:,1]] = 0
indices_nan = np.argwhere(np.isnan(data_nan))
print(f'{indices_nan.shape[0]} indices of NaN values after replace')

261 values as NaN found
261 indices of NaN values
0 indices of NaN values after replace


In [73]:
# shuffling
# contrary to most functions, shuffle is an IN PLACE method:
# you don't get a copy back, the array you pass is reordered
data_limited = data[:4,:].copy()
print(data_limited)
# legacy global-state API; it is not seeded here, so the order changes on every run
np.random.shuffle(data_limited)
print('After shuffling')
print(data_limited)
# The recommended API is an explicit Generator instance
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg
# -----------------------------
# IMPORTANT!
# the seed DOES determine the shuffle: a Generator built from the same seed
# applies the same permutation on every run, which makes the result reproducible
# -----------------------------
shuffle_seed = 42
rnd_gen = gen(pcg(shuffle_seed))
# start again from the original row order so the seeded result does not depend
# on the unseeded shuffle above
data_limited = data[:4,:].copy()
rnd_gen.shuffle(data_limited)
print('After shuffling')
print(data_limited)

[[ 2000.    40.   365.  3121.  4241. 13621.]
 [ 2000.    40.   365.  3061.  4171. 15041.]
 [ 1000.    40.   365.  2160.  3280. 15340.]
 [ 2000.    40.   365.  3041.  4241. 15321.]]
After shuffling
[[ 2000.    40.   365.  3121.  4241. 13621.]
 [ 1000.    40.   365.  2160.  3280. 15340.]
 [ 2000.    40.   365.  3061.  4171. 15041.]
 [ 2000.    40.   365.  3041.  4241. 15321.]]
After shuffling
[[ 2000.    40.   365.  3061.  4171. 15041.]
 [ 1000.    40.   365.  2160.  3280. 15340.]
 [ 2000.    40.   365.  3121.  4241. 13621.]
 [ 2000.    40.   365.  3041.  4241. 15321.]]


In [81]:
# casting: astype returns a new array, so assign the result to keep the change
data_limited = data[:4,:].copy()
print(data_limited)
data_limited = data_limited.astype(str)
print(data_limited)
# casting directly from str -> int might not be possible when the text contains '.'
# so we go through a double cast:
# str -> float -> int
# uncomment the next line to see the direct cast raise an error
#data_limited = data_limited.astype(dtype= 'int')
data_as_float = data_limited.astype(float)
data_limited = data_as_float.astype(int)
print(data_limited)

[[ 2000.    40.   365.  3121.  4241. 13621.]
 [ 2000.    40.   365.  3061.  4171. 15041.]
 [ 1000.    40.   365.  2160.  3280. 15340.]
 [ 2000.    40.   365.  3041.  4241. 15321.]]
[['2000.0' '40.0' '365.0' '3121.0' '4241.0' '13621.0']
 ['2000.0' '40.0' '365.0' '3061.0' '4171.0' '15041.0']
 ['1000.0' '40.0' '365.0' '2160.0' '3280.0' '15340.0']
 ['2000.0' '40.0' '365.0' '3041.0' '4241.0' '15321.0']]
[[ 2000    40   365  3121  4241 13621]
 [ 2000    40   365  3061  4171 15041]
 [ 1000    40   365  2160  3280 15340]
 [ 2000    40   365  3041  4241 15321]]


In [135]:
# string manipulation
# take columns 2, 4 and 5, skipping the first row (presumably the header row - confirm against the CSV)
data_str = data_mixed[1:,[2,4,5]].copy()
print(data_str)
# clean up leading text
# the three labels broadcast across the three columns.
# NOTE: strip() treats each label as a SET of characters trimmed from both ends,
# not as a literal prefix; this is fine for the upper-case product codes A-F
# (none of them is a character of 'Product ') and for purely numeric values.
data_clean_1 = np.char.strip(data_str,['Product ','Location ','Region '])
print("Removing leading strings")
print(data_clean_1)
# substitute product (A,B,C) for numbers
# np.where can do the trick but is not very clean (visually)
product_codes = ['A', 'B', 'C', 'D', 'E', 'F']
conditions = [data_clean_1[:,0] == code for code in product_codes]
# the replacements are strings, like the column they are written into, so np.select
# never has to promote numbers to strings
values = ['1', '2', '3', '4', '5', '6']
# codes that match no condition keep their current text (the default argument)
data_clean_1[:,0] = np.select(conditions, values,data_clean_1[:,0])
print("Substituying products codes for numbers")
print(data_clean_1)
non_digit_indeces = np.argwhere(~np.char.isdigit(data_clean_1[:,0]))
print("Non integers found? (Double check we covered all possible product letters)")
print(non_digit_indeces.flatten())
if non_digit_indeces.size > 0:
    # show the offending rows themselves, not just the first rows of the table
    print(data_clean_1[non_digit_indeces.flatten(),:])

[['Product B' 'Location 2' 'Region 2']
 ['Product B' 'Location 3' '']
 ['Product C' 'Location 5' 'Region 5']
 ...
 ['Product B' 'Location 23' 'Region 4']
 ['Product C' 'Location 52' 'Region 6']
 ['Product B' 'Location 142' 'Region 6']]
Removing leading strings
[['B' '2' '2']
 ['B' '3' '']
 ['C' '5' '5']
 ...
 ['B' '23' '4']
 ['C' '52' '6']
 ['B' '142' '6']]
Substituying products codes for numbers
[['2' '2' '2']
 ['2' '3' '']
 ['3' '5' '5']
 ...
 ['2' '23' '4']
 ['3' '52' '6']
 ['2' '142' '6']]
Non integers found? (Double check we covered all possible product letters)
[]


In [150]:
# stacking
sample = np.array([['Anthony',5,8],
                  ['Ramon', 4,9],
                  ['Ramon', 2,7],
                  ['Julia',8,5],
                  ['Julia',2,3]])
# plain np.stack joins along a NEW axis, so the result gains one dimension
# (the result is not kept here)
np.stack([sample, sample], axis=0)
# for 'traditional' stacking use the alternatives below
stack_v = np.vstack([sample, sample])  # rows appended underneath
stack_h = np.hstack([sample, sample])  # columns appended to the right
print('Vertical stack', stack_v, 'Horizontal stack', stack_h, sep='\n')

Vertical stack
[['Anthony' '5' '8']
 ['Ramon' '4' '9']
 ['Ramon' '2' '7']
 ['Julia' '8' '5']
 ['Julia' '2' '3']
 ['Anthony' '5' '8']
 ['Ramon' '4' '9']
 ['Ramon' '2' '7']
 ['Julia' '8' '5']
 ['Julia' '2' '3']]
Horizontal stack
[['Anthony' '5' '8' 'Anthony' '5' '8']
 ['Ramon' '4' '9' 'Ramon' '4' '9']
 ['Ramon' '2' '7' 'Ramon' '2' '7']
 ['Julia' '8' '5' 'Julia' '8' '5']
 ['Julia' '2' '3' 'Julia' '2' '3']]
