# Introduction to Numpy and Scipy

In [21]:
import numpy as np
import pandas as pd

# We'll demo a bit of Scipy
import scipy.special

import iqplot

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()

In [4]:
np.array([1, 2, 3, 4])
# Convert a Python list into a NumPy array. NumPy has a lot of functions.

array([1, 2, 3, 4])

In [5]:
my_ar = np.array([1, 2, 3, 4])

In [7]:
my_ar.dtype
# Data type of the elements in the array

dtype('int64')

In [9]:
my_ar.shape

(4,)

In [10]:
my_ar.astype(float)
# Convert the element type (here: int -> float)

array([1., 2., 3., 4.])

In [11]:
my_ar.std()

1.118033988749895

In [12]:
np.std(my_ar)

1.118033988749895

In [14]:
# Ten zeros built the manual way, from a Python list comprehension
np.array([0 for i in range(10)])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [16]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [17]:
# Small 2x2 integer array to serve as the template
my_ar = np.array([[1, 2], [3, 4]])
# Zeros with the same shape and dtype as the template
np.zeros_like(my_ar)

array([[0, 0],
       [0, 0]])

Sometimes it is useful to convert a pandas DataFrame (or one of its columns) into a NumPy array: arrays are more efficient for statistics and for some other kinds of processing.

In [22]:
df = pd.read_csv('data/c_elegans_egg_xa.csv', comment='#')

In [23]:
df.head()

Unnamed: 0,food,area (sq. um)
0,high,1683
1,high,2061
2,high,1792
3,high,1852
4,high,2091


In [25]:
# Strip plot of egg area, split by food level
p = iqplot.strip(
    data=df,
    q="area (sq. um)",  # column holding the measured values
    cats="food",  # column holding the category of each measurement
    order=["low", "high"],
    jitter=True,
    y_axis_label="amount of food",
    frame_height=200,
)

bokeh.io.show(p)

In [26]:
# ECDFs of egg area, split by food level
p = iqplot.ecdf(
    data=df,
    q="area (sq. um)",
    cats="food",
    order=["low", "high"],
)

bokeh.io.show(p)

In [27]:
# The ECDF for C. elegans eggs from worms fed a high concentration of food is shifted to the left:
# worms eating more food have smaller eggs.

In [29]:
# Same ECDFs as before, now drawn with confidence intervals
p = iqplot.ecdf(
    data=df,
    q="area (sq. um)",
    cats="food",
    order=["low", "high"],
    conf_int=True,
)

bokeh.io.show(p)

In [30]:
df.head()

Unnamed: 0,food,area (sq. um)
0,high,1683
1,high,2061
2,high,1792
3,high,1852
4,high,2091


In [33]:
# Pull the egg areas for each food level out of the data frame as NumPy arrays.
# Series.to_numpy() is the pandas-recommended replacement for the .values attribute.
area_col = "area (sq. um)"
xa_high = df.loc[df["food"] == "high", area_col].to_numpy()
xa_low = df.loc[df["food"] == "low", area_col].to_numpy()

In [34]:
xa_low

array([1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

In [35]:
xa_high[::-1]

array([1828, 2131, 1851, 2030, 1930, 1660, 1721, 1740, 1752, 1863, 2141,
       1701, 1661, 1712, 1749, 1642, 1882, 1821, 1800, 1692, 1680, 1671,
       1683, 1833, 1800, 1930, 1910, 1821, 1840, 1787, 1683, 1809, 1951,
       1892, 1731, 1751, 1802, 1912, 1781, 2091, 1852, 1792, 2061, 1683])

In [36]:
xa_high[3:5]

array([1852, 2091])

In [37]:
xa_high[10:20]

array([1892, 1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800])

In [39]:
# Fancy indexing
xa_high[[1, 19, 6]]

array([2061, 1800, 1912])

In [40]:
# Indices at which the condition holds; np.where returns them as a tuple of index arrays
np.where(xa_high > 2000)

(array([ 1,  4, 33, 40, 42]),)

In [41]:
# Use the positions found by np.where as a fancy index. The index must be an
# array (or list) of integers; wrapping it in {...} builds a set and raises
# "TypeError: unhashable type: 'numpy.ndarray'".
# The boolean mask xa_high[xa_high > 2000] selects the same elements.
xa_high[np.array([1, 4, 33, 40, 42])]

array([2061, 2091, 2141, 2030, 2131])

In [42]:
my_ar = np.array([1, 2, 3, 4])

In [43]:
my_ar

array([1, 2, 3, 4])

In [46]:
my_ar[2] = 6

my_ar
# Arrays are mutable: the assignment above changed the element in place

array([1, 2, 6, 4])

In [47]:
# Attach a new variable
my_ar2 = my_ar  # plain assignment binds a second name to the same array; nothing is copied

# Set an entry using the new variable
my_ar2[3] = 9

# Does the original change? (yes.)
my_ar

array([1, 2, 6, 9])

In [48]:
# Build a plain Python list and a NumPy array holding the same values
my_list = [1, 2, 3, 4]
my_ar = np.array(my_list)

# Take the interior elements of each with the same slice
interior = slice(1, -1)
my_list_slice = my_list[interior]
my_ar_slice = my_ar[interior]

# Overwrite the first element of each slice
my_list_slice[0] = 9
my_ar_slice[0] = 9

# Compare the originals: the list is untouched (its slice was a copy),
# while the array changed (its slice is a view onto the same data)
for original in (my_list, my_ar):
    print(original)

[1, 2, 3, 4]
[1 9 3 4]


In [50]:
np.array([5, 6, 7, 8]) / np.array([1, 2, 3, 4]) # element-wise division

array([5.        , 3.        , 2.33333333, 2.        ])

In [52]:
-4 * xa_high

array([-6732, -8244, -7168, -7408, -8364, -7124, -7648, -7208, -7004,
       -6924, -7568, -7804, -7236, -6732, -7148, -7360, -7284, -7640,
       -7720, -7200, -7332, -6732, -6684, -6720, -6768, -7200, -7284,
       -7528, -6568, -6996, -6848, -6644, -6804, -8564, -7452, -7008,
       -6960, -6884, -6640, -7720, -8120, -7404, -8524, -7312])

In [53]:
# Reshape the 1-D array into rows of 4. Passing -1 lets NumPy infer the number
# of rows (11 for the 44 values here) instead of hard-coding it.
my_ar = xa_high.reshape((-1, 4))

In [54]:
my_ar

array([[1683, 2061, 1792, 1852],
       [2091, 1781, 1912, 1802],
       [1751, 1731, 1892, 1951],
       [1809, 1683, 1787, 1840],
       [1821, 1910, 1930, 1800],
       [1833, 1683, 1671, 1680],
       [1692, 1800, 1821, 1882],
       [1642, 1749, 1712, 1661],
       [1701, 2141, 1863, 1752],
       [1740, 1721, 1660, 1930],
       [2030, 1851, 2131, 1828]])

In [55]:
my_ar[2,1] # row, column

1731

In [56]:
my_ar[2, :]

array([1751, 1731, 1892, 1951])

In [57]:
xa_high

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828])

In [58]:
xa_low

array([1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

In [59]:
np.concatenate((xa_high,xa_low))

array([1683, 2061, 1792, 1852, 2091, 1781, 1912, 1802, 1751, 1731, 1892,
       1951, 1809, 1683, 1787, 1840, 1821, 1910, 1930, 1800, 1833, 1683,
       1671, 1680, 1692, 1800, 1821, 1882, 1642, 1749, 1712, 1661, 1701,
       2141, 1863, 1752, 1740, 1721, 1660, 1930, 2030, 1851, 2131, 1828,
       1840, 2090, 2169, 1988, 2212, 2339, 1989, 2144, 2290, 1920, 2280,
       1809, 2158, 1800, 2133, 2060, 2160, 2001, 2030, 2088, 1951, 2460,
       2021, 2010, 2139, 2160, 2106, 2171, 2113, 2179, 1890, 2179, 2021,
       1969, 2150, 1900, 2267, 1711, 1901, 2114, 2112, 2361, 2130, 2061,
       2121, 1832, 2210, 2130, 2153, 2009, 2100, 2252, 2143, 2252, 2222,
       2121, 2409])

In [60]:
np.exp(xa_high / 1000)

array([5.38167681, 7.8538197 , 6.00144336, 6.37255189, 8.09300412,
       5.93578924, 6.76660849, 6.06175887, 5.76036016, 5.64629738,
       6.63262067, 7.03571978, 6.10434004, 5.38167681, 5.97151103,
       6.29653826, 6.1780334 , 6.7530888 , 6.88951024, 6.04964746,
       6.2526164 , 5.38167681, 5.31748262, 5.36555597, 5.43033051,
       6.04964746, 6.1780334 , 6.56662499, 5.16549017, 5.74885095,
       5.54003047, 5.26457279, 5.47942408, 8.50794132, 6.44303692,
       5.7661234 , 5.69734342, 5.59011579, 5.25931084, 6.88951024,
       7.61408636, 6.36618252, 8.42328589, 6.22143134])

In [61]:
np.cos(xa_high)

array([ 0.62656192,  0.9933696 ,  0.27501843,  0.03112568,  0.26681725,
       -0.96021239, -0.33430744,  0.29228295, -0.42404251, -0.99984597,
        0.72399324, -0.99748325,  0.84865001,  0.62656192, -0.84393482,
        0.56257847,  0.43231386,  0.99610114,  0.48702972, -0.99122275,
       -0.11903049,  0.62656192,  0.94691648, -0.73027654, -0.24968607,
       -0.99122275,  0.43231386, -0.98275172, -0.49500319, -0.64703425,
       -0.98592179, -0.61963892, -0.17156886,  0.00460656, -0.99936794,
        0.53296056,  0.90375673,  0.82939405,  0.3256673 ,  0.48702972,
        0.86222727, -0.824246  ,  0.5401501 ,  0.91834245])

In [62]:
np.dot(xa_high, xa_high)

146360195

In [63]:
np.pi

3.141592653589793

In [64]:
scipy.special.erf(xa_high / 2000)

array([0.76597747, 0.8549794 , 0.7948931 , 0.80965587, 0.86074212,
       0.79209865, 0.8236209 , 0.79740973, 0.78433732, 0.77904847,
       0.81905337, 0.83227948, 0.79915793, 0.76597747, 0.7936263 ,
       0.80676772, 0.8021292 , 0.82316805, 0.8276577 , 0.79690821,
       0.80506817, 0.76597747, 0.76262579, 0.76514271, 0.76846912,
       0.79690821, 0.8021292 , 0.81673693, 0.7543863 , 0.78381257,
       0.77393853, 0.75980693, 0.77094188, 0.86995276, 0.81227529,
       0.78459935, 0.78143985, 0.77636944, 0.75952376, 0.8276577 ,
       0.84883448, 0.80941641, 0.86814949, 0.80384751])

In [26]:
# Other SciPy subpackages worth exploring:
# scipy.special    (many options)
# scipy.integrate
# scipy.interpolate

# NOTE(review): imports conventionally live in the first cell with the others
import scipy.stats

In [66]:
scipy.stats.ttest_ind(xa_high, xa_low, equal_var=False)

Ttest_indResult(statistic=-9.890647593054966, pvalue=2.0431526287128887e-16)

In [67]:
# Ordering matters for time series!

In [68]:
df = pd.read_csv('data/retina_spikes.csv', comment='#')

df.head()

Unnamed: 0,t (ms),V (uV)
0,703.96,4.79
1,704.0,-0.63
2,704.04,5.83
3,704.08,0.31
4,704.12,-4.58


In [69]:
len(df)

20001

In [72]:
# Line plot of the recorded voltage against time
p = bokeh.plotting.figure(
    frame_height=150,
    frame_width=600,
    toolbar_location='above',
    x_axis_label="t (ms)",
    y_axis_label="V (µV)",
)
# x and y name columns of df, so they must match the CSV headers exactly ("V (uV)", not "V (µV)")
p.line(
    source=df,
    x="t (ms)",
    y="V (uV)",
)

bokeh.io.show(p)

In [73]:
# 400 evenly spaced points on [-15, 15]; with an even number of points x = 0 is
# not sampled, so the division below never evaluates 0 / 0
x = np.linspace(-15, 15, num=400)

# Normalized intensity 4 * (J1(x) / x)^2, where J1 is the Bessel function of the
# first kind of order 1
bessel_ratio = scipy.special.j1(x) / x
norm_I = 4 * bessel_ratio**2

In [74]:
# Plot the normalized intensity computed above against x
p =  bokeh.plotting.figure(
    frame_width = 550,
    frame_height= 250,
    x_axis_label="x",
    y_axis_label='$$I(x)/I_0$$', # LaTeX, written between $$ ... $$ delimiters
)

p.line(
    x=x,
    y=norm_I,
    line_width=2,
)

bokeh.io.show(p)


## Random number generation

In [2]:
np.random.uniform(low=0, high=1, size=10)

array([0.52854999, 0.54193159, 0.22926647, 0.49763769, 0.90976601,
       0.33625287, 0.45972976, 0.83106814, 0.43038472, 0.96084313])

In [4]:
x = np.random.uniform(low=0, high=1, size=10)

p = iqplot.ecdf(x)

bokeh.io.show(p)

In [5]:
x = np.random.uniform(low=0, high=1, size=10000) # With more samples, the ECDF gets closer to that of the uniform distribution

p = iqplot.ecdf(x)

bokeh.io.show(p)

In [10]:
x = np.random.uniform(low=0, high=1, size=10)

# Boolean array: True wherever the draw is below 0.5
heads = x < 0.5
heads

array([False,  True,  True, False,  True, False, False, False,  True,
        True])

In [11]:
rg = np.random.default_rng(3252)

In [13]:
rg.uniform(low=0, high=1, size=10)

array([0.18866535, 0.04418857, 0.02961285, 0.22083971, 0.43341773,
       0.13166813, 0.42112164, 0.43507845, 0.61380912, 0.30627603])

In [15]:
rg = np.random.default_rng(3253) # Changing the seed can change the results drastically (there is no link between seeds)
rg.uniform(low=0, high=1, size=10)
# With a fixed seed, the same numbers can be generated on different computers

array([0.31390226, 0.73012457, 0.05800998, 0.01557021, 0.29825701,
       0.10106784, 0.06329107, 0.58614237, 0.52023168, 0.52779988])

In [16]:
mu = 10
sigma = 1

x = rg.normal(mu, sigma, 100000)

p = iqplot.histogram(
    x,
    density=True,
    rug=False,
    bins=100
)

bokeh.io.show(p)

In [18]:
mu = 10
sigma = 1

x = rg.normal(mu, sigma, 100000)

p = iqplot.ecdf(
    x[:1000], # plot only the first 1000 draws; we do not want too many data points
)

bokeh.io.show(p)

In [32]:
# Overlay the theoretical CDF on the ECDF of the samples
mu = 10
sigma = 1

x = rg.normal(mu, sigma, 100000)

# Evaluate the theoretical Normal CDF over mu +/- 4 sigma, so the plotted range
# follows the parameters instead of being hard-coded
x_theor = np.linspace(mu - 4 * sigma, mu + 4 * sigma, 400)
y_theor = scipy.stats.norm.cdf(x_theor, mu, sigma)

# ECDF of the first 5000 draws
p = iqplot.ecdf(
    x[:5000],
)

# Theoretical curve drawn on the same figure
p.line(
    x=x_theor,
    y=y_theor,
    line_width=2,
    line_color="tomato",
)

bokeh.io.show(p)

In [33]:
scipy.stats.binom.pmf(10, 10, 0.5)

0.0009765625

In [34]:
flip_results = rg.binomial(10, 0.5, size=10_000_000)

In [36]:
# Fraction of the simulated draws equal to 10 (the statistic we want);
# the mean of a boolean array is the fraction of True entries
np.mean(flip_results == 10)

0.0009727

In [38]:
rg.choice(np.arange(52), size=20) # np.arange is the NumPy analogue of Python's range

array([22, 12, 18, 22, 49,  9, 12, 32, 25, 47, 28, 23, 45,  2, 40, 41,  6,
       40, 44,  9])

In [42]:
''.join(rg.choice(list('ATGC'), replace=True, size=70)) 
# Generate a random 70-character sequence over the letters A, T, G and C

'AATTTCAAGTTCATACGTTTCCAAATGCCTATCTGTGTTTCGGCGTCTCGCATTAACGCAGATCAGGACG'

In [44]:
rg.permutation(np.arange(52))

array([34, 41,  7, 50, 11, 51, 28,  2, 35, 16, 21, 13, 45, 15, 25,  5, 20,
        8, 29,  6,  1, 36, 19, 30, 26, 22,  3, 46, 18, 40, 44, 48, 12, 31,
       27, 39, 24, 42, 17, 33, 23,  4, 38, 47, 37, 49, 14, 32,  0, 43,  9,
       10])