In [1]:
import numpy as np
import pandas as pd

### Numpy

Quickstart tutorial: https://numpy.org/doc/stable/user/quickstart.html

NumPy’s main object is the homogeneous multidimensional array. It is a table of elements (usually numbers), all of the same type, indexed by a tuple of positive integers. In NumPy dimensions are called axes. The number of axes is rank.

In [2]:
# create a 2 (row) x 3 (column) array from a list of lists. Type can be specified or inferred.
a=np.asarray([[1,2.5,3],[4,5,6]],dtype='float32')
print(a)
print(a.ndim)
print(a.shape)

print()

# a's transpose is a 3 x 2 array. The value of a is not changed by this.
aT = a.transpose()
print(aT)
print(aT.ndim)
print(aT.shape)

[[1.  2.5 3. ]
 [4.  5.  6. ]]
2
(2, 3)

[[1.  4. ]
 [2.5 5. ]
 [3.  6. ]]
2
(3, 2)


In [3]:
a.dtype

dtype('float32')

In [4]:
# Example of inferred type.  Here it assumes that these are integers... 
np.asarray([1,2]).dtype

dtype('int64')

In [5]:
# ... unless you specify otherwise
np.asarray([1,2],dtype="complex")

array([1.+0.j, 2.+0.j])

In [6]:
# exponential and logarithm (base e)
print(np.exp(1))
print(np.log(np.exp(1)))

2.718281828459045
1.0


In [7]:
# built-in constants
np.pi

3.141592653589793

In [8]:
# generate an m x n array of random numbers, uniform on [0,1]
np.random.rand(4,2)

array([[0.4234983 , 0.37689607],
       [0.46055412, 0.13738208],
       [0.9055978 , 0.61184153],
       [0.40573677, 0.98094182]])

In [9]:
# standard trigonmetric operations
np.sin(np.pi/2)

1.0

In [10]:
# generate a uniformly spaced 1-D array (start,end,number of elements)
# notice that both endpoints are included by default
np.linspace(0,2,9)

array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  ])

In [11]:
# same idea, but take base to that power
# again, both endpoints are included by default
np.logspace(0,2,9,base=10)

array([  1.        ,   1.77827941,   3.16227766,   5.62341325,
        10.        ,  17.7827941 ,  31.6227766 ,  56.23413252,
       100.        ])

In [12]:
# array of ones (input is a tuple of dimensions)
np.ones((2,3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [13]:
# element-wise operations

# add or multiply two matrices elementwise; returns ValueError if sizes do not match
print(np.ones((2,3))+np.ones((2,3)))
print()
print(np.ones((2,3))*np.ones((2,3)))
print()

# casts the constant into an appropriately sized matrix
print(np.ones((2,3))+1)
print()

print(np.ones((2,3))*5)

[[2. 2. 2.]
 [2. 2. 2.]]

[[1. 1. 1.]
 [1. 1. 1.]]

[[2. 2. 2.]
 [2. 2. 2.]]

[[5. 5. 5.]
 [5. 5. 5.]]


In [14]:
# comparison operation is also performed element-wise; can compare to constant
np.random.rand(5,5) > 0.8

array([[False, False, False, False, False],
       [False, False, False, False, False],
       [ True, False, False, False, False],
       [False,  True,  True, False, False],
       [False, False, False,  True,  True]])

In [15]:
# again, sin is performed element-wise
np.sin(np.random.rand(2,2))

array([[0.68578447, 0.04257091],
       [0.65831239, 0.21495348]])

In [16]:
# So is division.  Vector is cast into matrix with appropriate number of rows  
print(np.asarray([0.1,0.2]))
print()
print(np.ones((2,2))/np.asarray([0.1,0.2]))

[0.1 0.2]

[[10.  5.]
 [10.  5.]]


In [17]:
# The new shape should be compatible with the original shape. 
# If an integer, then the result will be a 1-D array of that length. 
# One shape dimension can be -1. In this case, the value is inferred from the length of the array and remaining dimension
print(np.asarray([0.1,0.2]).reshape(-1,1))
print()
print(np.ones((2,2))/np.asarray([0.1,0.2]).reshape(-1,1))

[[0.1]
 [0.2]]

[[10. 10.]
 [ 5.  5.]]


In [18]:
q = np.ones((2,2))/np.asarray([0.1,0.2]).reshape(-1,1)

# min of each column
print(q.min(axis=0))

# min of each row
print(q.min(axis=1))


[5. 5.]
[10.  5.]


In [19]:
tmp=np.ones((3,3))*2
tmp

array([[2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.]])

In [20]:
tmp.cumsum(axis=0)

array([[2., 2., 2.],
       [4., 4., 4.],
       [6., 6., 6.]])

In [21]:
tmp.cumprod(axis=1)

array([[2., 4., 8.],
       [2., 4., 8.],
       [2., 4., 8.]])

In [22]:
# reshape into 1-D vector
tmp.reshape(-1)

array([2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [23]:
# turn a 4 x 4 into a 2 x 2 x 2 x 2
np.random.rand(4,4).reshape((2,2,2,2))

array([[[[0.32970639, 0.65710298],
         [0.95356986, 0.43478444]],

        [[0.77176522, 0.47348459],
         [0.56648799, 0.6750343 ]]],


       [[[0.14021142, 0.09985012],
         [0.89529147, 0.72639876]],

        [[0.06671772, 0.11825251],
         [0.60239102, 0.94662303]]]])

In [24]:
# selection of elements from array
tmp= np.random.rand(4,4)
print(tmp)
print()
print(tmp.reshape(-1)[np.asarray([2,3,4,5])])

[[0.61251778 0.69481483 0.53536078 0.34951077]
 [0.04074578 0.02107799 0.7733813  0.26459013]
 [0.92856296 0.0484834  0.54946275 0.89253744]
 [0.75800575 0.46726955 0.18297051 0.5752995 ]]

[0.53536078 0.34951077 0.04074578 0.02107799]


In [25]:
# slice
tmp[1:4,1:3]

array([[0.02107799, 0.7733813 ],
       [0.0484834 , 0.54946275],
       [0.46726955, 0.18297051]])

In [26]:
# pick out all elements satisfying condition as 1-D array
tmp[tmp>0.5]

array([0.61251778, 0.69481483, 0.53536078, 0.7733813 , 0.92856296,
       0.54946275, 0.89253744, 0.75800575, 0.5752995 ])

In [27]:
# pick out rows where first element is >0.5
tmp[tmp[:,0]>0.5,:]

array([[0.61251778, 0.69481483, 0.53536078, 0.34951077],
       [0.92856296, 0.0484834 , 0.54946275, 0.89253744],
       [0.75800575, 0.46726955, 0.18297051, 0.5752995 ]])

In [28]:
tmp

array([[0.61251778, 0.69481483, 0.53536078, 0.34951077],
       [0.04074578, 0.02107799, 0.7733813 , 0.26459013],
       [0.92856296, 0.0484834 , 0.54946275, 0.89253744],
       [0.75800575, 0.46726955, 0.18297051, 0.5752995 ]])

In [29]:
# Assign values to a subset of array elements.  Note that this happens in place (i.e., the value of tmp is changed)
tmp.reshape(-1)[[1,2,3,4]]=0

In [30]:
tmp

array([[0.61251778, 0.        , 0.        , 0.        ],
       [0.        , 0.02107799, 0.7733813 , 0.26459013],
       [0.92856296, 0.0484834 , 0.54946275, 0.89253744],
       [0.75800575, 0.46726955, 0.18297051, 0.5752995 ]])

In [31]:
# Linear Algebra
a = np.array([[1.0, 2.0], [3.0, 4.0]])

In [32]:
# element-wise
a*a

array([[ 1.,  4.],
       [ 9., 16.]])

In [33]:
# element-wise
a/a

array([[1., 1.],
       [1., 1.]])

In [34]:
# Standard matrix multiplication.  Not element-wise!
np.dot(a,a)

array([[ 7., 10.],
       [15., 22.]])

In [35]:
np.eye(2)

array([[1., 0.],
       [0., 1.]])

In [36]:
print(a)
np.trace(a)

[[1. 2.]
 [3. 4.]]


5.0

### Pandas 

10 Minutes to pandas: http://pandas.pydata.org/pandas-docs/stable/10min.html



In [37]:
# Object Creation:

# series
s = pd.Series([1,3,5,np.nan,6,8])

# dataframe
dates = pd.date_range('20240101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [38]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [39]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.805956,-0.305918,0.983338,-2.441024
2024-01-02,-1.151119,0.958722,0.149249,0.621109
2024-01-03,0.06895,0.192157,-0.133623,-0.262616
2024-01-04,0.116208,1.537014,0.836505,0.558891
2024-01-05,-1.966089,-2.21461,-0.237079,0.165809
2024-01-06,-0.569517,-0.039374,0.557603,-1.189776


In [40]:
df2 = pd.DataFrame({ 'A' : 1.,
   ....:                      'B' : pd.Timestamp('20240102'),
   ....:                      'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
   ....:                      'D' : np.array([3] * 4,dtype='int32'),
   ....:                      'E' : pd.Categorical(["test","train","test","train"]),
   ....:                      'F' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2024-01-02,1.0,3,test,foo
1,1.0,2024-01-02,1.0,3,train,foo
2,1.0,2024-01-02,1.0,3,test,foo
3,1.0,2024-01-02,1.0,3,train,foo


In [41]:
#Viewing Data
df.head(2)

Unnamed: 0,A,B,C,D
2024-01-01,0.805956,-0.305918,0.983338,-2.441024
2024-01-02,-1.151119,0.958722,0.149249,0.621109


In [42]:
df.index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [43]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [44]:
# Sorting
print(df)
print()

# these operations do not change the value of df

# sort rows by index
print(df.sort_index(axis=0, ascending=False)) 
print()

# sort columns by column header
print(df.sort_index(axis=1, ascending=False))

                   A         B         C         D
2024-01-01  0.805956 -0.305918  0.983338 -2.441024
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-03  0.068950  0.192157 -0.133623 -0.262616
2024-01-04  0.116208  1.537014  0.836505  0.558891
2024-01-05 -1.966089 -2.214610 -0.237079  0.165809
2024-01-06 -0.569517 -0.039374  0.557603 -1.189776

                   A         B         C         D
2024-01-06 -0.569517 -0.039374  0.557603 -1.189776
2024-01-05 -1.966089 -2.214610 -0.237079  0.165809
2024-01-04  0.116208  1.537014  0.836505  0.558891
2024-01-03  0.068950  0.192157 -0.133623 -0.262616
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-01  0.805956 -0.305918  0.983338 -2.441024

                   D         C         B         A
2024-01-01 -2.441024  0.983338 -0.305918  0.805956
2024-01-02  0.621109  0.149249  0.958722 -1.151119
2024-01-03 -0.262616 -0.133623  0.192157  0.068950
2024-01-04  0.558891  0.836505  1.537014  0.116208
2024-01-05  0.165809 -0.23707

In [45]:
print(df)
print()

# sort rows using values in a particular column
print(df.sort_values(by='B')) # axis defaults to 0
print()

# sort columns using values corresponding to a particular row index
print(df.sort_values(axis=1,by='2024-01-03'))

                   A         B         C         D
2024-01-01  0.805956 -0.305918  0.983338 -2.441024
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-03  0.068950  0.192157 -0.133623 -0.262616
2024-01-04  0.116208  1.537014  0.836505  0.558891
2024-01-05 -1.966089 -2.214610 -0.237079  0.165809
2024-01-06 -0.569517 -0.039374  0.557603 -1.189776

                   A         B         C         D
2024-01-05 -1.966089 -2.214610 -0.237079  0.165809
2024-01-01  0.805956 -0.305918  0.983338 -2.441024
2024-01-06 -0.569517 -0.039374  0.557603 -1.189776
2024-01-03  0.068950  0.192157 -0.133623 -0.262616
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-04  0.116208  1.537014  0.836505  0.558891

                   D         C         A         B
2024-01-01 -2.441024  0.983338  0.805956 -0.305918
2024-01-02  0.621109  0.149249 -1.151119  0.958722
2024-01-03 -0.262616 -0.133623  0.068950  0.192157
2024-01-04  0.558891  0.836505  0.116208  1.537014
2024-01-05  0.165809 -0.23707

In [46]:
# Descriptive statistics of each column
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.449269,0.021332,0.359332,-0.424601
std,0.997166,1.289604,0.509752,1.19055
min,-1.966089,-2.21461,-0.237079,-2.441024
25%,-1.005718,-0.239282,-0.062905,-0.957986
50%,-0.250284,0.076392,0.353426,-0.048404
75%,0.104393,0.767081,0.766779,0.460621
max,0.805956,1.537014,0.983338,0.621109


In [47]:
# Selection

# select a particular column (with row index) 
df['A']

2024-01-01    0.805956
2024-01-02   -1.151119
2024-01-03    0.068950
2024-01-04    0.116208
2024-01-05   -1.966089
2024-01-06   -0.569517
Freq: D, Name: A, dtype: float64

In [48]:
# select rows by slicing
print(df[1:3])
print()

# equivalent but more flexible; select rows by integer positions
print(df.iloc[1:3,:])
print()

# subsets of rows and columns, can slice or list
print(df.iloc[1:3,[1,3]])

                   A         B         C         D
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-03  0.068950  0.192157 -0.133623 -0.262616

                   A         B         C         D
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-03  0.068950  0.192157 -0.133623 -0.262616

                   B         D
2024-01-02  0.958722  0.621109
2024-01-03  0.192157 -0.262616


In [49]:
# select rows or columns by value
df.loc[dates[0]] # equivalent to df.loc['2013-01-01']

A    0.805956
B   -0.305918
C    0.983338
D   -2.441024
Name: 2024-01-01 00:00:00, dtype: float64

In [50]:
# can select row and column values
df.loc[:,['A','C']]

Unnamed: 0,A,C
2024-01-01,0.805956,0.983338
2024-01-02,-1.151119,0.149249
2024-01-03,0.06895,-0.133623
2024-01-04,0.116208,0.836505
2024-01-05,-1.966089,-0.237079
2024-01-06,-0.569517,0.557603


In [51]:
# note that the end value is included for loc...
df.loc['20240102':'20240104',['A','B']]

Unnamed: 0,A,B
2024-01-02,-1.151119,0.958722
2024-01-03,0.06895,0.192157
2024-01-04,0.116208,1.537014


In [52]:
# ... but not for iloc
df.iloc[1:4,0:2]

Unnamed: 0,A,B
2024-01-02,-1.151119,0.958722
2024-01-03,0.06895,0.192157
2024-01-04,0.116208,1.537014


In [53]:
# Boolean Indexing

# select all rows where a condition is met
df[df.A > 0]

Unnamed: 0,A,B,C,D
2024-01-01,0.805956,-0.305918,0.983338,-2.441024
2024-01-03,0.06895,0.192157,-0.133623,-0.262616
2024-01-04,0.116208,1.537014,0.836505,0.558891


In [54]:
# Filling in missing values
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[1]:dates[2],'E'] = 1
df1.loc[[dates[0],dates[3]],'E'] = 2

In [55]:
df1

Unnamed: 0,A,B,C,D,E
2024-01-01,0.805956,-0.305918,0.983338,-2.441024,2.0
2024-01-02,-1.151119,0.958722,0.149249,0.621109,1.0
2024-01-03,0.06895,0.192157,-0.133623,-0.262616,1.0
2024-01-04,0.116208,1.537014,0.836505,0.558891,2.0


In [56]:
# Dropping rows with any missing values
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[1]:dates[2],'E'] = 1
print(df1)
print()
print(df1.dropna(how='any'))

                   A         B         C         D    E
2024-01-01  0.805956 -0.305918  0.983338 -2.441024  NaN
2024-01-02 -1.151119  0.958722  0.149249  0.621109  1.0
2024-01-03  0.068950  0.192157 -0.133623 -0.262616  1.0
2024-01-04  0.116208  1.537014  0.836505  0.558891  NaN

                   A         B         C         D    E
2024-01-02 -1.151119  0.958722  0.149249  0.621109  1.0
2024-01-03  0.068950  0.192157 -0.133623 -0.262616  1.0


In [57]:
# Fill in all missing values with a given value
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[1]:dates[2],'E'] = 1
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2024-01-01,0.805956,-0.305918,0.983338,-2.441024,5.0
2024-01-02,-1.151119,0.958722,0.149249,0.621109,1.0
2024-01-03,0.06895,0.192157,-0.133623,-0.262616,1.0
2024-01-04,0.116208,1.537014,0.836505,0.558891,5.0


In [58]:
# element-wise: is each value missing?
print(df1)
print()
pd.isnull(df1)

                   A         B         C         D    E
2024-01-01  0.805956 -0.305918  0.983338 -2.441024  NaN
2024-01-02 -1.151119  0.958722  0.149249  0.621109  1.0
2024-01-03  0.068950  0.192157 -0.133623 -0.262616  1.0
2024-01-04  0.116208  1.537014  0.836505  0.558891  NaN



Unnamed: 0,A,B,C,D,E
2024-01-01,False,False,False,False,True
2024-01-02,False,False,False,False,False
2024-01-03,False,False,False,False,False
2024-01-04,False,False,False,False,True


In [59]:
# Apply an operation to each column (e.g., sum across all rows for that column)
print(df)
print()
df.apply(sum,axis=0)

                   A         B         C         D
2024-01-01  0.805956 -0.305918  0.983338 -2.441024
2024-01-02 -1.151119  0.958722  0.149249  0.621109
2024-01-03  0.068950  0.192157 -0.133623 -0.262616
2024-01-04  0.116208  1.537014  0.836505  0.558891
2024-01-05 -1.966089 -2.214610 -0.237079  0.165809
2024-01-06 -0.569517 -0.039374  0.557603 -1.189776



A   -2.695612
B    0.127991
C    2.155993
D   -2.547607
dtype: float64

In [60]:
# Apply an operation to each row (e.g., sum across all columns for that row)
df.apply(sum,axis=1)

2024-01-01   -0.957648
2024-01-02    0.577961
2024-01-03   -0.135132
2024-01-04    3.048617
2024-01-05   -4.251969
2024-01-06   -1.241064
Freq: D, dtype: float64

In [61]:
# Apply an operation element-wise
df.applymap(lambda x: x*1000)

Unnamed: 0,A,B,C,D
2024-01-01,805.955785,-305.917703,983.337807,-2441.024092
2024-01-02,-1151.118735,958.721709,149.249072,621.109095
2024-01-03,68.949763,192.157261,-133.623347,-262.61611
2024-01-04,116.207622,1537.013968,836.504737,558.891008
2024-01-05,-1966.088861,-2214.610293,-237.079096,165.809037
2024-01-06,-569.517083,-39.373977,557.603331,-1189.776297


In [62]:
df3 = df
df3['E'] = 'hello'
df3*1000

Unnamed: 0,A,B,C,D,E
2024-01-01,805.955785,-305.917703,983.337807,-2441.024092,hellohellohellohellohellohellohellohellohelloh...
2024-01-02,-1151.118735,958.721709,149.249072,621.109095,hellohellohellohellohellohellohellohellohelloh...
2024-01-03,68.949763,192.157261,-133.623347,-262.61611,hellohellohellohellohellohellohellohellohelloh...
2024-01-04,116.207622,1537.013968,836.504737,558.891008,hellohellohellohellohellohellohellohellohelloh...
2024-01-05,-1966.088861,-2214.610293,-237.079096,165.809037,hellohellohellohellohellohellohellohellohelloh...
2024-01-06,-569.517083,-39.373977,557.603331,-1189.776297,hellohellohellohellohellohellohellohellohelloh...


In [63]:
#Concat, Join, Append.

# concatenate rows together
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
print()

pieces = [df[:2], df[5:7], df[8:]]
print(pd.concat(pieces))
print()

# concatenate columns together
pieces = [df.iloc[:,0:2], df.iloc[:,3]]
print(pd.concat(pieces,axis=1))
print()


          0         1         2         3
0  1.297479  0.552868  0.331574  0.303177
1  1.474777 -0.992602 -1.962705  0.421097
2  1.464510  2.063716  1.176080  0.426864
3  1.180471  0.342904  1.103002 -0.092893
4 -0.008583 -0.476841  0.486512  1.253410
5  0.333194 -0.800232 -0.042639 -0.892017
6 -1.029743 -0.763131 -0.805096 -1.353255
7  0.457657 -0.832801 -0.805734  0.523561
8 -0.077121 -0.308866  0.290847  1.348396
9 -1.358964 -1.492602  0.319448 -0.203169

          0         1         2         3
0  1.297479  0.552868  0.331574  0.303177
1  1.474777 -0.992602 -1.962705  0.421097
5  0.333194 -0.800232 -0.042639 -0.892017
6 -1.029743 -0.763131 -0.805096 -1.353255
8 -0.077121 -0.308866  0.290847  1.348396
9 -1.358964 -1.492602  0.319448 -0.203169

          0         1         3
0  1.297479  0.552868  0.303177
1  1.474777 -0.992602  0.421097
2  1.464510  2.063716  0.426864
3  1.180471  0.342904 -0.092893
4 -0.008583 -0.476841  1.253410
5  0.333194 -0.800232 -0.892017
6 -1.029743 -0.763

In [64]:
x = pd.DataFrame({'key': ['B', 'A'], 'xval': [2, 1]})
y = pd.DataFrame({'key': ['A', 'B','C'], 'yval': [4, 5,6]})
print(x)
print()
print(y)

  key  xval
0   B     2
1   A     1

  key  yval
0   A     4
1   B     5
2   C     6


In [65]:
print(y.merge(x,left_on="key",right_on="key",how="left"))
print()
print(x.merge(y,left_on="key",right_on="key",how="left"))
print()
print(x.merge(y,left_on="key",right_on="key",how="right"))

  key  yval  xval
0   A     4   1.0
1   B     5   2.0
2   C     6   NaN

  key  xval  yval
0   B     2     5
1   A     1     4

  key  xval  yval
0   A   1.0     4
1   B   2.0     5
2   C   NaN     6


In [66]:
# Append
print(y.append(x, ignore_index=True)) # deprecated
print()
print(pd.concat([y,x],ignore_index=True))

  key  yval  xval
0   A   4.0   NaN
1   B   5.0   NaN
2   C   6.0   NaN
3   B   NaN   2.0
4   A   NaN   1.0

  key  yval  xval
0   A   4.0   NaN
1   B   5.0   NaN
2   C   6.0   NaN
3   B   NaN   2.0
4   A   NaN   1.0


  print(y.append(x, ignore_index=True)) # deprecated


In [67]:
# Grouping
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : np.random.randn(8),'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.799766,0.428196
1,bar,one,-0.336669,-1.102145
2,foo,two,0.983664,-0.486665
3,bar,three,0.367893,-0.738516
4,foo,two,-0.4706,-0.008783
5,bar,two,2.066251,-0.43596
6,foo,one,1.670782,-1.044065
7,foo,three,-1.382524,-0.163221


In [68]:
df.groupby("A").apply(lambda x: x.loc[:,"D"].sum())

A
bar   -2.276621
foo   -1.274538
dtype: float64

In [69]:
tmp=df.groupby(['A','B']).sum()
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.336669,-1.102145
bar,three,0.367893,-0.738516
bar,two,2.066251,-0.43596
foo,one,2.470548,-0.615869
foo,three,-1.382524,-0.163221
foo,two,0.513063,-0.495448


In [70]:
# Stack and unstack
print(tmp)
print()

stacked=tmp.stack(level=-1) # -1, i.e., the last column, is the default level
print(stacked)
print()

unstacked = stacked.unstack(level=-1) # -1, i.e., the last column, is the default level
print(unstacked)

                  C         D
A   B                        
bar one   -0.336669 -1.102145
    three  0.367893 -0.738516
    two    2.066251 -0.435960
foo one    2.470548 -0.615869
    three -1.382524 -0.163221
    two    0.513063 -0.495448

A    B       
bar  one    C   -0.336669
            D   -1.102145
     three  C    0.367893
            D   -0.738516
     two    C    2.066251
            D   -0.435960
foo  one    C    2.470548
            D   -0.615869
     three  C   -1.382524
            D   -0.163221
     two    C    0.513063
            D   -0.495448
dtype: float64

                  C         D
A   B                        
bar one   -0.336669 -1.102145
    three  0.367893 -0.738516
    two    2.066251 -0.435960
foo one    2.470548 -0.615869
    three -1.382524 -0.163221
    two    0.513063 -0.495448


# Sklearn

1.Preprocessing.

2.Supervised Learning.

3.Model selection.


In [71]:
# Check your version and make sure >0.18
import sklearn
sklearn.__version__

'1.3.0'

### 1. Preprocessing.

#### Scale,  Normalization, Binarization, and so on.

In [72]:
from sklearn import preprocessing

In [73]:
import numpy as np
X = np.array([[ 1., -1.,  2.],
               [ 2.,  0.,  0.],
             [ 0.,  1., -1.]])
X

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [74]:
# make each column have mean = 0 and std dev = 1
X_scaled = preprocessing.scale(X)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [75]:
X_scaled.mean(axis=0)

array([0., 0., 0.])

In [76]:
X_scaled.std(axis=0)

array([1., 1., 1.])

In [77]:
# Equivalently, we could use:
(X-X.mean(axis=0))/X.std(axis=0)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

#### Normalize 

Normalization is the process of scaling individual samples to have unit norm. This process can be useful if you plan to use a quadratic form such as the dot-product or any other kernel to quantify the similarity of any pair of samples.

In [78]:
print(preprocessing.normalize(X))
print()

# alternatively, we could manually compute:
print(X/np.sqrt((X*X).sum(axis=1)).reshape(-1,1))

[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]

[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]


Sklearn has a ton of methods implemented, many of which we will see later in the course!

### Supervised Learning 

(Regression/Classification)

Linear Models (Ordinary Least Squares, Logistic Regression, Lasso and Ridge...)

Kernel regression

SVM

Gaussian Processes

Decision Trees and Random Forests (next class)

Naive Bayes

Supervised Neural Network models (incl. Deep Learning)

### Unsupervised Learning

Clustering.

Dimension Reduction.

Representation in Neural Networks such as RBM

### Tips: How to use packages from sklearn.

Step one: What is the problem we want to solve and what is the model we want to fit. 

Step two: What are the hyper-parameters related to model structure.

Step three: What are the inputs dataframe and what are the parameters we want to tune.

Step four: What are hyper-parameters for training process. (learning rate, iteration max...)

Step five: What are the outputs and tuned parameters.