# Chapter 1 Elegant NumPy

## 1.1 Gene Expression and Data

In [6]:
def rpkm(counts, lengths):
    """Calculate reads per kilobase of transcript per million reads (RPKM).

    RPKM = (1e9 * C) / (N * L)

    where
        C = number of reads mapped to a gene,
        N = total mapped reads in the sample,
        L = gene length in base pairs.

    Parameters
    ----------
    counts : array_like, shape (n_genes, n_samples)
        Count data; rows are genes, columns are samples.
    lengths : array_like, shape (n_genes,)
        Gene lengths in base pairs, in the same order as the rows of
        ``counts``.

    Returns
    -------
    normed : ndarray, shape (n_genes, n_samples)
        RPKM-normalized counts.
    """
    C = np.asarray(counts)
    # Use floats for the lengths so that N * L cannot overflow a narrow
    # integer dtype.
    L = np.asarray(lengths, dtype=float)
    # Total reads per sample: sum down each column.
    N = np.sum(C, axis=0)

    # N is stretched across rows and L across columns by broadcasting.
    normed = 1e9 * C / (N[np.newaxis, :] * L[:, np.newaxis])

    return normed

In [7]:
# Build a small expression table as a list of lists:
# one row per gene, two values per row.
gene0, gene1, gene2 = [100, 200], [50, 0], [350, 100]
expression_data = [gene0, gene1, gene2]

In [8]:
# row index 2 selects gene2; column index 0 selects its first value
expression_data[2][0]

350

In [1]:
import numpy as np
# build a one-dimensional ndarray from a Python list
array1d = np.array([1, 2, 3, 4])
print(array1d)

[1 2 3 4]


In [2]:
# np.array returns an instance of numpy.ndarray
print(type(array1d))

<class 'numpy.ndarray'>


In [3]:
# shape is a tuple with one entry per axis; here a single axis of length 4
print(array1d.shape)

(4,)


In [9]:
# a list of lists converts to a two-dimensional ndarray:
# 3 rows (one per gene list) by 2 columns
array2d = np.array(expression_data)
print(array2d)
print(array2d.shape)
print(type(array2d))

[[100 200]
 [ 50   0]
 [350 100]]
(3, 2)
<class 'numpy.ndarray'>


In [10]:
# ndim is the number of axes (dimensions) of the array
print(array2d.ndim)

2


## 1.2 NumPy N-Dimensional Arrays

In [11]:
import numpy as np

# ndarray holding the values 0 through 999,999
# (the stop value 1e6 is a float, so np.arange produces float64 values,
# not integers)
array = np.arange(1e6)

# convert the ndarray to a plain Python list
list_array = array.tolist()

In [12]:
# time multiplying every element by 5 with a list comprehension over the Python list
%timeit -n10 y = [val * 5 for val in list_array]

35.5 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# time the same multiplication as a single vectorized ndarray operation
%timeit -n10 x = array * 5

758 µs ± 398 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# create an ndarray x with dtype int32
x = np.array([1, 2, 3], np.int32)
print(x)

[1 2 3]


In [15]:
# take a slice of x containing its first two elements
y = x[:2]
print(y)

[1 2]


In [16]:
# assign 6 to the first element of y
y[0] = 6
print(y)

[6 2]


In [17]:
# the first element of x is now 6 as well: the slice y shares x's data
print(x)

[6 2 3]


In [18]:
# np.copy makes an independent copy, so later writes to y do not affect x
y = np.copy(x[:2])

In [19]:
# multiplying by a scalar is applied to every element of the array
x = np.array([1, 2, 3, 4])
print(x * 2)

[2 4 6 8]


In [22]:
# adding two arrays of the same shape works element by element
y = np.array([0, 1, 2, 1])
print(x + y)

[1 3 5 5]


In [23]:
# reshape x into a column: shape (4,) becomes (4, 1)
x = np.array([1, 2, 3, 4])
x = np.reshape(x, (len(x), 1))
print(x)

[[1]
 [2]
 [3]
 [4]]


In [25]:
# reshape y into a row: shape (4,) becomes (1, 4)
y = np.array([0, 1, 2, 1])
y = np.reshape(y, (1, len(y)))
print(y)

[[0 1 2 1]]


In [26]:
# x is a column and y is a row, so their shapes differ
print(x.shape)
print(y.shape)

(4, 1)
(1, 4)


In [27]:
# broadcasting stretches the (4, 1) column and the (1, 4) row to a common
# (4, 4) shape, so the product is the outer product of the two vectors
outer = x * y
print(outer)

[[0 1 2 1]
 [0 2 4 2]
 [0 3 6 3]
 [0 4 8 4]]


In [28]:
# the broadcast result has one row per element of x and one column per element of y
print(outer.shape)

(4, 4)


## 1.3 Gene Expression Dataset

In [29]:
import numpy as np
import pandas as pd

# read the counts table with pandas, using the first column as the row index
filename = 'data/counts.txt'
with open(filename, 'rt') as f:
    data_table = pd.read_csv(f, index_col=0)
    
# preview the first five rows and first five columns
print(data_table.iloc[:5, :5])

       00624286-41dd-476f-a63b-d2a5f484bb45  TCGA-FS-A1Z0  TCGA-D9-A3Z1  \
A1BG                                1272.36        452.96        288.06   
A1CF                                   0.00          0.00          0.00   
A2BP1                                  0.00          0.00          0.00   
A2LD1                                164.38        552.43        201.83   
A2ML1                                 27.00          0.00          0.00   

       02c76d24-f1d2-4029-95b4-8be3bda8fdbe  TCGA-EB-A51B  
A1BG                                 400.11        420.46  
A1CF                                   1.00          0.00  
A2BP1                                  0.00          1.00  
A2LD1                                165.12         95.75  
A2ML1                                  0.00          8.00  


In [30]:
# sample names are the column labels of the counts table
samples = list(data_table.columns)

In [31]:
# load the gene length data
with open('data/genes.csv', 'rt') as f:
    # parse the CSV with pandas, using the first column (GeneSymbol) as the index
    gene_info = pd.read_csv(f, index_col=0)
# preview the first five rows
print(gene_info.iloc[:5, :])

            GeneID  GeneLength
GeneSymbol                    
CPA1          1357        1724
GUCY2D        3000        3623
UBC           7316        2687
C11orf95     65998        5581
ANKMY2       57037        2611


In [33]:
# compare the number of rows (genes) in the two tables
print("data_table gene : ", data_table.shape[0])
print("gene_info gene : ", gene_info.shape[0])

data_table gene :  20500
gene_info gene :  20503


In [34]:
# index labels (gene symbols) present in both the counts table and the gene info table
matched_index = pd.Index.intersection(data_table.index, gene_info.index)

In [35]:
# 2D ndarray of expression counts: one row per gene, one column per sample
# (dtype=int drops the fractional part of values such as 1272.36)
counts = np.asarray(data_table.loc[matched_index], dtype=int)

# gene names as an ndarray, in the same order as the rows of counts
gene_names = np.array(matched_index)

# report the number of genes (rows) and samples (columns)
# NOTE(review): the message says "species", but the columns appear to be
# sample identifiers (see `samples` above) -- confirm the intended wording
print(f'{counts.shape[0]} genes were counted in {counts.shape[1]} species')

20500 genes were counted in 375 species


In [38]:
# 1D ndarray of gene lengths, selected with matched_index so that it lines up
# with the rows of counts
gene_lengths = np.asarray(gene_info.loc[matched_index]['GeneLength'], dtype=int)

In [39]:
# the first dimension (number of genes) must match between the two arrays
print(counts.shape)
print(gene_lengths.shape)

(20500, 375)
(20500,)


## 1.4 Normalization