## Operations for indexing, splitting, slicing and iterating over a dataset

In [1]:
import numpy as np

#### Indexing

In [3]:
# Read the comma-separated CSV file into a NumPy array
dataset = np.genfromtxt(
    '../Datasets/normal_distribution_splittable.csv',
    delimiter=',',
)

In [4]:
# Average of all values in the second row (row index 1)
second_row = dataset[1, :]
second_row.mean()

96.90038836444445

In [5]:
# Average of all values in the last row (a negative index counts from the end)
last_row = dataset[-1, :]
last_row.mean()

100.18096645222221

In [6]:
# First value of the first row; the mean of a single value is the value
# itself, so both printed lines show the same number
first_val_first_row = dataset[0, 0]
print(np.mean(first_val_first_row), first_val_first_row, sep='\n')

99.14931546
99.14931546


In [7]:
# Pick the last element of the second-to-last row with one [row, column] index;
# taking the mean of that single value simply returns the value
last_val_second_last_row = dataset[-2, -1]
last_val_second_last_row.mean()

101.2226037

#### Slicing

In [8]:
# 2x2 block whose top-left corner is the second row / second column
# (rows 1-2 and columns 1-2; the slice end index 3 is exclusive)
subsection_2x2 = dataset[1:3, 1:3]
subsection_2x2.mean()

95.63393608250001

In [9]:
# From the 5th row (index 4) keep only every second element (step of 2),
# then show the full row, the selection and the mean of the selection
every_other_elem = dataset[4, ::2]
print(dataset[4], every_other_elem, np.mean(every_other_elem), sep='\n')

[101.20862522 103.5730309  100.28690912 105.85269352  93.37126331
 108.57980357 100.79478953  94.20019732  96.10020311]
[101.20862522 100.28690912  93.37126331 100.79478953  96.10020311]
98.35235805800001


In [10]:
# Reversed last row of the dataset (a step of -1 walks the row backwards)
reversed_last_row = dataset[-1][::-1]
print(dataset[-1])
print(reversed_last_row)
# Reversing only changes the order, so the mean matches the last row's mean
print(reversed_last_row.mean())

[ 94.11176915  99.62387832 104.51786419  97.62787811  93.97853495
  98.75108352 106.05042487 100.07721494 106.89005002]
[106.89005002 100.07721494 106.05042487  98.75108352  93.97853495
  97.62787811 104.51786419  99.62387832  94.11176915]
100.18096645222222


#### Splitting

In [11]:
# Split the dataset horizontally (column-wise) into three equally sized subsets
hor_splits = np.hsplit(dataset, 3)

In [12]:
# Split the first third vertically (row-wise) into two equally sized parts
ver_splits = np.vsplit(hor_splits[0], 2)

In [13]:
# Compare the shape of the full dataset with the shape of one of the subsets
print("Dataset", np.shape(dataset))
print("Subset", np.shape(ver_splits[0]))

Dataset (24, 9)
Subset (12, 3)


#### Iterating

In [14]:
# Iterate over the whole dataset using nditer.
# enumerate supplies the running position of each element, so no manually
# incremented counter is needed.
for curr_index, x in enumerate(np.nditer(dataset)):
    print(x, curr_index)

99.14931546 0
104.03852715 1
107.43534677 2
97.85230675 3
98.74986914 4
98.80833412 5
96.81964892 6
98.56783189 7
101.34745901 8
92.02628776 9
97.10439252 10
99.32066924 11
97.24584816 12
92.9267508 13
92.65657752 14
105.7197853 15
101.23162942 16
93.87155456 17
95.66253664 18
95.17750125 19
90.93318132 20
110.18889465 21
98.80084371 22
105.95297652 23
98.37481387 24
106.54654286 25
107.22482426 26
91.37294597 27
100.96781394 28
100.40118279 29
113.42090475 30
105.48508838 31
91.6604946 32
106.1472841 33
95.08715803 34
103.40412146 35
101.20862522 36
103.5730309 37
100.28690912 38
105.85269352 39
93.37126331 40
108.57980357 41
100.79478953 42
94.20019732 43
96.10020311 44
102.80387079 45
98.29687616 46
93.24376389 47
97.24130034 48
89.03452725 49
96.2832753 50
104.60344836 51
101.13442416 52
97.62787811 53
106.71751618 54
102.97585605 55
98.45723272 56
100.72418901 57
106.39798503 58
95.46493436 59
94.35373179 60
106.83273763 61
100.07721494 62
96.02548256 63
102.82360856 64
106.475518

In [15]:
# Iterate over the whole dataset using ndenumerate
for index, value in np.ndenumerate(dataset):
    print(index, value)

(0, 0) 99.14931546
(0, 1) 104.03852715
(0, 2) 107.43534677
(0, 3) 97.85230675
(0, 4) 98.74986914
(0, 5) 98.80833412
(0, 6) 96.81964892
(0, 7) 98.56783189
(0, 8) 101.34745901
(1, 0) 92.02628776
(1, 1) 97.10439252
(1, 2) 99.32066924
(1, 3) 97.24584816
(1, 4) 92.9267508
(1, 5) 92.65657752
(1, 6) 105.7197853
(1, 7) 101.23162942
(1, 8) 93.87155456
(2, 0) 95.66253664
(2, 1) 95.17750125
(2, 2) 90.93318132
(2, 3) 110.18889465
(2, 4) 98.80084371
(2, 5) 105.95297652
(2, 6) 98.37481387
(2, 7) 106.54654286
(2, 8) 107.22482426
(3, 0) 91.37294597
(3, 1) 100.96781394
(3, 2) 100.40118279
(3, 3) 113.42090475
(3, 4) 105.48508838
(3, 5) 91.6604946
(3, 6) 106.1472841
(3, 7) 95.08715803
(3, 8) 103.40412146
(4, 0) 101.20862522
(4, 1) 103.5730309
(4, 2) 100.28690912
(4, 3) 105.85269352
(4, 4) 93.37126331
(4, 5) 108.57980357
(4, 6) 100.79478953
(4, 7) 94.20019732
(4, 8) 96.10020311
(5, 0) 102.80387079
(5, 1) 98.29687616
(5, 2) 93.24376389
(5, 3) 97.24130034
(5, 4) 89.03452725
(5, 5) 96.2832753
(5, 6) 104.6034

#### Filtering

In [17]:
# Keep only the values above 105 by indexing with a boolean mask.
# NOTE: despite its name, this variable is filtered with a threshold of 105, not 5.
vals_greater_five = dataset[np.greater(dataset, 105)]
vals_greater_five

array([107.43534677, 105.7197853 , 110.18889465, 105.95297652,
       106.54654286, 107.22482426, 113.42090475, 105.48508838,
       106.1472841 , 105.85269352, 108.57980357, 106.71751618,
       106.39798503, 106.83273763, 106.47551845, 105.30350449,
       106.03868807, 110.44484313, 106.6471081 , 105.0320535 ,
       107.02874163, 105.07475277, 106.57364584, 107.22482426,
       107.19119932, 108.09423367, 109.40523174, 106.11454989,
       106.57052697, 105.13668343, 105.37011896, 110.44484313,
       105.86078488, 106.89005002, 106.57364584, 107.40064604,
       106.38276709, 106.46476468, 110.43976681, 105.02389857,
       106.05042487, 106.89005002])

In [18]:
# Extract the values that lie strictly between 90 and 95
vals_between_90_95 = np.extract(
    np.logical_and(dataset > 90, dataset < 95),
    dataset,
)
vals_between_90_95

array([92.02628776, 92.9267508 , 92.65657752, 93.87155456, 90.93318132,
       91.37294597, 91.6604946 , 93.37126331, 94.20019732, 93.24376389,
       94.35373179, 92.5748759 , 91.37294597, 92.87730812, 93.87155456,
       92.75048583, 93.97853495, 91.32093303, 92.0108226 , 93.18884302,
       93.83969256, 94.5081787 , 94.59300658, 93.04610867, 91.6779221 ,
       91.37294597, 94.76253572, 94.57421727, 94.11176915, 93.97853495])

In [19]:
# Row and column positions of every value that is less than 1 away from 100
rows, cols = np.where(np.abs(dataset - 100) < 1)
# Pair the positions up as [row, col] with a list comprehension; int() turns
# the NumPy integers into plain Python ints so the list displays as
# [[0, 0], [1, 2], ...]
one_away_indices = [[int(row), int(col)] for row, col in zip(rows, cols)]
one_away_indices

[[0, 0],
 [1, 2],
 [3, 1],
 [3, 2],
 [4, 2],
 [4, 6],
 [6, 3],
 [6, 8],
 [8, 5],
 [9, 8],
 [10, 1],
 [10, 3],
 [10, 5],
 [12, 8],
 [13, 0],
 [13, 4],
 [13, 7],
 [14, 3],
 [14, 5],
 [15, 8],
 [16, 1],
 [16, 6],
 [17, 2],
 [17, 3],
 [18, 7],
 [18, 8],
 [20, 4],
 [21, 0],
 [21, 4],
 [21, 5],
 [22, 2],
 [23, 1],
 [23, 7]]

#### Sorting

In [20]:
# Sort the values inside each row; np.sort returns a sorted copy
row_sorted = np.sort(dataset, axis=1)
row_sorted

array([[ 96.81964892,  97.85230675,  98.56783189,  98.74986914,
         98.80833412,  99.14931546, 101.34745901, 104.03852715,
        107.43534677],
       [ 92.02628776,  92.65657752,  92.9267508 ,  93.87155456,
         97.10439252,  97.24584816,  99.32066924, 101.23162942,
        105.7197853 ],
       [ 90.93318132,  95.17750125,  95.66253664,  98.37481387,
         98.80084371, 105.95297652, 106.54654286, 107.22482426,
        110.18889465],
       [ 91.37294597,  91.6604946 ,  95.08715803, 100.40118279,
        100.96781394, 103.40412146, 105.48508838, 106.1472841 ,
        113.42090475],
       [ 93.37126331,  94.20019732,  96.10020311, 100.28690912,
        100.79478953, 101.20862522, 103.5730309 , 105.85269352,
        108.57980357],
       [ 89.03452725,  93.24376389,  96.2832753 ,  97.24130034,
         97.62787811,  98.29687616, 101.13442416, 102.80387079,
        104.60344836],
       [ 94.35373179,  95.46493436,  98.45723272, 100.07721494,
        100.72418901, 102.9758

In [21]:
# Sort the values inside each column: work on a copy so that the dataset
# itself keeps its original order, then sort the copy along axis 0
col_sorted = dataset.copy()
col_sorted.sort(axis=0)
col_sorted

array([[ 91.37294597,  88.80221141,  90.93318132,  93.18884302,
         85.98839623,  91.6604946 ,  91.32093303,  92.5748759 ,
         91.37294597],
       [ 92.02628776,  91.6779221 ,  93.24376389,  94.59300658,
         89.03452725,  92.65657752,  93.04610867,  94.20019732,
         91.37294597],
       [ 94.11176915,  92.0108226 ,  93.83969256,  96.74630281,
         92.75048583,  95.19184343,  94.35373179,  94.76253572,
         93.87155456],
       [ 95.65982034,  92.87730812,  94.5081787 ,  97.24130034,
         92.9267508 ,  95.46493436,  96.50342927,  95.08715803,
         93.97853495],
       [ 95.66253664,  93.87155456,  97.75887636,  97.24584816,
         93.37126331,  95.62359311,  96.81964892,  95.85284217,
         95.19184343],
       [ 96.02548256,  94.57421727,  98.45723272,  97.62787811,
         93.97853495,  96.2832753 ,  96.89244283,  97.59572169,
         96.10020311],
       [ 96.10020311,  95.17750125,  99.32066924,  97.65393524,
         95.93799169,  96.3462

In [22]:
# argsort returns the positions that would put the first row in ascending
# order; indexing the row with those positions (fancy indexing) yields the
# sorted values while the dataset itself keeps its order
index_sorted = dataset[0].argsort()
dataset[0, index_sorted]

array([ 96.81964892,  97.85230675,  98.56783189,  98.74986914,
        98.80833412,  99.14931546, 101.34745901, 104.03852715,
       107.43534677])

#### Combining

In [26]:
# Split the dataset horizontally into 3 parts, e.g. 12 columns would give 3 blocks of 4 columns
thirds = np.hsplit(dataset, 3)
print(np.shape(dataset))
print(np.shape(thirds[0]))
# Split the first of the 3 blocks vertically into 2 parts, e.g. 10 rows would give 2 blocks of 5 rows each
halfed_first = np.vsplit(thirds[0], 2)
print(np.shape(halfed_first[0]))
# Display the first block of that half
halfed_first[0]

(24, 9)
(24, 3)
(12, 3)


array([[ 99.14931546, 104.03852715, 107.43534677],
       [ 92.02628776,  97.10439252,  99.32066924],
       [ 95.66253664,  95.17750125,  90.93318132],
       [ 91.37294597, 100.96781394, 100.40118279],
       [101.20862522, 103.5730309 , 100.28690912],
       [102.80387079,  98.29687616,  93.24376389],
       [106.71751618, 102.97585605,  98.45723272],
       [ 96.02548256, 102.82360856, 106.47551845],
       [105.30350449,  92.87730812, 103.19258339],
       [110.44484313,  93.87155456, 101.5363647 ],
       [101.3514185 , 100.37372248, 106.6471081 ],
       [ 97.21315663, 107.02874163, 102.17642112]])

In [32]:
# Stack the 2 halves vertically; this should give back the first third, thirds[0].
# NOTE: despite its name, first_col holds the whole first block of columns.
first_col = np.vstack((halfed_first[0], halfed_first[1]))
# Element-wise comparison against the original first third
print(np.equal(thirds[0], first_col))

[[ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]]


In [33]:
# Combine the 3 thirds of the data again, which should be equal to dataset:
# first join the rebuilt first third with the second third ...
first_second_col = np.hstack((first_col, thirds[1]))
# ... then append the last third
full_data = np.hstack((first_second_col, thirds[2]))

#### Reshaping

In [36]:
# Reshape the dataset into a single row holding all values; the result is
# still a 2-D array of shape (1, n), not a Python list
single_list = dataset.reshape(1, -1)
print(dataset.shape, single_list.shape, sep='\n')

(24, 9)
(1, 216)


In [37]:
# Reshape into a matrix with two columns.
# -1 tells NumPy to work out that dimension itself.
two_col_dataset = np.reshape(dataset, (-1, 2))
print(np.shape(two_col_dataset))

(108, 2)
