This is the collection of functions that I think might be useful, which means they could reduce time or release our keyboard :). In other words, try to be **pythonic**.

### 1. Instance type check

* `isinstance()`

In [1]:
# I used to be
print(type('a') == str)
print(type(None) == type(None))
print(type([1, 2, 3]) == list)

True
True
True


In [2]:
# these looks better
print(isinstance('a', str))
print(None is None)
print(isinstance([1, 2, 3], list))

True
True
True


### 2. Iterate multiple iterable objects (list, etc)

* `zip()` for concate objects together

In [3]:
course_name = ['Probability', 'Inference', 'Linear Regression'
              'Stats Computing', 'Stochastic Process']
course_code = [5203, 5204, 5205, 5206, 5207]

In [4]:
# other styles
for i in range(len(course_name)):
    print(course_code[i], ': ', course_name[i])

5203 :  Probability
5204 :  Inference
5205 :  Linear RegressionStats Computing
5206 :  Stochastic Process


In [5]:
# use zip() function
for name, code in zip(course_name, course_code):
    print(code, ': ', name)

5203 :  Probability
5204 :  Inference
5205 :  Linear RegressionStats Computing
5206 :  Stochastic Process


### 3. Get index and element from single iterable object

* `enumerate()` for extracting index from single iterable object

In [6]:
# other styles
for i in range(len(course_name)):
    print(i, course_name[i])

0 Probability
1 Inference
2 Linear RegressionStats Computing
3 Stochastic Process


In [7]:
# use enumerate()
for idx, val in enumerate(course_name):
    print(idx, val)

0 Probability
1 Inference
2 Linear RegressionStats Computing
3 Stochastic Process


### 4. Plus one trick
* `np.log1p()` and `np.expm1()` for `log(x + 1)` and `exp(x) - 1`. These two functions is learned from kaggle kernels where `+1` is a common trick in many data processing tasks

In [8]:
import numpy as np
X = np.hstack((np.zeros(10000000), np.ones(10000000)))
X.shape

(20000000,)

In [9]:
%%time
X_log_1 = np.log(X + 1)

Wall time: 301 ms


In [10]:
%%time
X_log_2 = np.log1p(X)

Wall time: 245 ms


In [11]:
# check all values are equal
np.all(X_log_1 == X_log_2)

True

In [12]:
%%time
X_exp_1 = np.exp(X_log_1) - 1

Wall time: 294 ms


In [13]:
%%time
X_exp_2 = np.expm1(X_log_2)

Wall time: 243 ms


In [14]:
# check all values are equal
np.all(X_exp_1 == X_exp_2)

True

### 5. Clip ndarray
* `np.clip()` can clip the ndaray in a fixed range. This is useful, for instance in training deep neural networkds we need to clip the gradient to avoid exploding gradients.

In [15]:
a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
np.clip(a, 1, 6)

array([1, 1, 2, 3, 4, 5, 6, 6, 6, 6])

In [17]:
# inplace
np.clip(a, 1, 6, out=a)
a

array([1, 1, 2, 3, 4, 5, 6, 6, 6, 6])

### 6. Sampling accoding to a probability distribution
* `np.random.choice()` can sample elements from a list according to a probability distribution. Equivalent to `sample()` in `R`.

In [18]:
p = np.array([.1, .0, .7, .2])
np.random.choice([0, 1, 2, 3], p=p)

2

### 7. Split sentences
* `.split()`, no need to place ' ' inside, also '' will return error
* `.strip()` can only remove spaces at the beginning or end of the sentence

In [19]:
sentence = 'Learning python  is an interesting  process   .'
sentence.split()

['Learning', 'python', 'is', 'an', 'interesting', 'process', '.']

In [20]:
sentence.split(' ') # all extra spaces still generate ''

['Learning',
 'python',
 '',
 'is',
 'an',
 'interesting',
 '',
 'process',
 '',
 '',
 '.']

In [21]:
sentence.strip().split(' ') # all extra spaces in the middle still generate ''

['Learning',
 'python',
 '',
 'is',
 'an',
 'interesting',
 '',
 'process',
 '',
 '',
 '.']

### 8. Create dictionary from two lists

* `zip()` function

In [22]:
index = [1, 2, 3, 4, 5, 6, 7]
day = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
week = dict(zip(index, day))
week

{1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}

### 9. `if-else` in list comprehension

In [23]:
[True if x > 5 else False for x in range(10)]

[False, False, False, False, False, False, True, True, True, True]

### 10. Check if exists a True / all True

- `any()`
- `all()`

In [24]:
any([True if x == 1 else False for x in range(10)])

True

In [25]:
all([True if x < 15 else False for x in range(10)])

True

### 11. Unpack list / tuple / dictionary

- `*` for list and tuple
- `**` for dictionary

In [26]:
[1, *[2, 3, *[4, *[5]]]]

[1, 2, 3, 4, 5]

In [27]:
(1, *(2, 3))

(1, 2, 3)

In [28]:
x = {'x': 1}
y = {'y': 2}
z = {**x, **y}
print(z)

{'x': 1, 'y': 2}


### 12. Create a time series as dataframe

- `pd.date_range()` and `pd.DataFrame()` and `dict()`

In [29]:
import pandas as pd


x = [1, 2, 3, 4, 5]
y = [5, 4, 3, 2, 1]
time = pd.date_range('2019-12-10', freq='1D', periods=len(x))
data = pd.DataFrame(data=dict(x=x, y=y), index=time)
data

Unnamed: 0,x,y
2019-12-10,1,5
2019-12-11,2,4
2019-12-12,3,3
2019-12-13,4,2
2019-12-14,5,1


### 13. Assign new column to dataframe

- `.assign()`

In [30]:
data.assign(z=[1, 2, 3, 4, 5])

Unnamed: 0,x,y,z
2019-12-10,1,5,1
2019-12-11,2,4,2
2019-12-12,3,3,3
2019-12-13,4,2,4
2019-12-14,5,1,5


### To be continued... 