In [102]:
import numpy as np

# Numpy arrays

## Creating ndarrays

In [103]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr = np.array(data2)
arr

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [104]:
arr.shape

(2, 4)

In [105]:
arr.ndim

2

In [106]:
arr.dtype

dtype('int32')

In [107]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [108]:
np.empty((2,3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [109]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [110]:
arr1 = np.array([1.2, -5.3, 7.2])                             #.astype()

In [111]:
arr1 = arr1.astype('int64')

In [112]:
arr1

array([ 1, -5,  7], dtype=int64)

## Indexing and slicing

In [113]:
arr = np.arange(10)

In [114]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [115]:
arr[5:8] = 10

In [116]:
arr

array([ 0,  1,  2,  3,  4, 10, 10, 10,  8,  9])

#### Mutations in slice are reflected automatically in the original array

In [117]:
arr_slice = arr[5:8]

In [118]:
arr_slice[:] = 11

In [119]:
arr_slice

array([11, 11, 11])

In [120]:
arr

array([ 0,  1,  2,  3,  4, 11, 11, 11,  8,  9])

### Boolean indexing

In [121]:
names = np.array(['bob','mark','steve','bob','mark','bob'])

In [122]:
data = np.random.randn(6, 4)
data

array([[ 0.90159072,  0.50249434,  0.90085595, -0.68372786],
       [-0.12289023, -0.93576943, -0.26788808,  0.53035547],
       [-0.69166075, -0.39675353, -0.6871727 , -0.84520564],
       [-0.67124613, -0.0126646 , -1.11731035,  0.2344157 ],
       [ 1.65980218,  0.74204416, -0.19183555, -0.88762896],
       [-0.74715829,  1.6924546 ,  0.05080775, -0.63699565]])

In [123]:
data[names == 'bob']

array([[ 0.90159072,  0.50249434,  0.90085595, -0.68372786],
       [-0.67124613, -0.0126646 , -1.11731035,  0.2344157 ],
       [-0.74715829,  1.6924546 ,  0.05080775, -0.63699565]])

In [124]:
#inversion
data[~(names == 'steve')]

array([[ 0.90159072,  0.50249434,  0.90085595, -0.68372786],
       [-0.12289023, -0.93576943, -0.26788808,  0.53035547],
       [-0.67124613, -0.0126646 , -1.11731035,  0.2344157 ],
       [ 1.65980218,  0.74204416, -0.19183555, -0.88762896],
       [-0.74715829,  1.6924546 ,  0.05080775, -0.63699565]])

In [125]:
mask = (names == 'mark')|(names == 'bob')
data[mask]

array([[ 0.90159072,  0.50249434,  0.90085595, -0.68372786],
       [-0.12289023, -0.93576943, -0.26788808,  0.53035547],
       [-0.67124613, -0.0126646 , -1.11731035,  0.2344157 ],
       [ 1.65980218,  0.74204416, -0.19183555, -0.88762896],
       [-0.74715829,  1.6924546 ,  0.05080775, -0.63699565]])

### Fancy indexing

In [126]:
arr = np.arange(20).reshape(4,5)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [127]:
#couples indexes and forms tuples
arr[[0,2,1],[3,1,0]]

array([ 3, 11,  5])

In [128]:
#two-dimensional array with chosen rows and columns
arr[[0,1,2]][:,[2,1]]

array([[ 2,  1],
       [ 7,  6],
       [12, 11]])

In [129]:
 ?np.transpose ################################################

Object `np.transpose ################################################` not found.


## Universal functions

In [130]:
arr1 = np.random.randn(10)
arr2 = np.random.randn(10)

In [131]:
arr1

array([ 0.19091548,  2.10025514,  0.12015895,  0.61720311,  0.30017032,
       -0.35224985, -1.1425182 , -0.34934272, -0.20889423,  0.58662319])

In [132]:
arr2

array([ 0.83898341,  0.93110208,  0.28558733,  0.88514116, -0.75439794,
        1.25286816,  0.51292982, -0.29809284,  0.48851815, -0.07557171])

In [133]:
np.maximum(arr1,arr2)                                                 #.maximum()

array([ 0.83898341,  2.10025514,  0.28558733,  0.88514116,  0.30017032,
        1.25286816,  0.51292982, -0.29809284,  0.48851815,  0.58662319])

In [134]:
arr3 = np.random.randn(7)*6
arr3

array([ 6.78977632,  9.1189009 , 13.11345244, -8.37897801, -8.66468283,
       -3.02679518,  0.96022242])

In [135]:
remainder, whole_part = np.modf(arr3)                                  #.modf()

In [136]:
remainder

array([ 0.78977632,  0.1189009 ,  0.11345244, -0.37897801, -0.66468283,
       -0.02679518,  0.96022242])

In [137]:
whole_part

array([ 6.,  9., 13., -8., -8., -3.,  0.])

In [138]:
#operating ufuncs in-place, need to provide 'out' argument
np.sqrt(arr3, arr3)                                                   

  np.sqrt(arr3, arr3)


array([2.60571992, 3.01975179, 3.62125012,        nan,        nan,
              nan, 0.97990939])

#### Unary and binary universal functions: Python for Data Analysis, page 107-108

In [139]:
arr = np.random.randn(8)*4

In [140]:
remainder, whole = np.modf(arr)

In [141]:
cond = remainder >0.3

In [142]:
np.where(cond, 1, -1)                                         #.where()

array([ 1, -1, -1, -1,  1,  1, -1, -1])

#### Array set operations: Python for Data Analysis, page 115

#### Commonly used numpy.linalg functions: Python for Data Analysis, page 117

## Pseudorandom Number Generation 

In [143]:
np.random.seed(1)
#this is global random seed and every np.random function will use it

In [144]:
#to avoid it we can use random number generator, isolated from others 
rng = np.random.RandomState(1234)                        #.RandomState()

In [145]:
np.random.randn(10)

array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763,
       -2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038])

In [146]:
rng.randn(10)

array([ 0.47143516, -1.19097569,  1.43270697, -0.3126519 , -0.72058873,
        0.88716294,  0.85958841, -0.6365235 ,  0.01569637, -2.24268495])

#### np.random functions: Python for Data Analysis, page 119

# Pandas

In [147]:
import pandas as pd

## Series

In [148]:
obj1 = pd.Series([4, -2, 9, 10.5], index=['a','b','c','d'])
obj1

a     4.0
b    -2.0
c     9.0
d    10.5
dtype: float64

In [149]:
obj1.values

array([ 4. , -2. ,  9. , 10.5])

In [150]:
obj1.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [151]:
#another way is to create a dict and convert it in Series

In [152]:
dict_data = {'ty': 10, 'cod': 9, 'cob': 8}
obj2 = pd.Series(dict_data)
obj2

ty     10
cod     9
cob     8
dtype: int64

In [153]:
names = ['ty', 'cob', 'alex']
obj3 = pd.Series(dict_data, index=names)
obj3

ty      10.0
cob      8.0
alex     NaN
dtype: float64

In [154]:
pd.isnull(obj3)                          # pd.isnull()

ty      False
cob     False
alex     True
dtype: bool

In [155]:
pd.notnull(obj3)                         # pd.notnull()

ty       True
cob      True
alex    False
dtype: bool

In [156]:
obj3.name = 'scores'                     # название серии и колонки индексов
obj3.index.name = 'name'
obj3

name
ty      10.0
cob      8.0
alex     NaN
Name: scores, dtype: float64

In [157]:
obj4 = pd.Series(np.arange(5)+2)
obj4

0    2
1    3
2    4
3    5
4    6
dtype: int32

In [158]:
obj4.index = ['z','x','c','v','b']
obj4

z    2
x    3
c    4
v    5
b    6
dtype: int32

## Dataframe

In [159]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                    index=['one', 'two', 'three', 'four', 'five', 'six'])

In [160]:
frame.debt = np.arange(6)

In [161]:
frame['eastern'] = frame.state == 'Ohio'
frame

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0,True
two,2001,Ohio,1.7,1,True
three,2002,Ohio,3.6,2,True
four,2001,Nevada,2.4,3,False
five,2002,Nevada,2.9,4,False
six,2003,Nevada,3.2,5,False


In [162]:
frame[frame.eastern]

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0,True
two,2001,Ohio,1.7,1,True
three,2002,Ohio,3.6,2,True


In [163]:
# del removes column
del frame['eastern']

In [164]:
frame.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

### Nested dict of dicts

In [165]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [166]:
frame2 = pd.DataFrame(pop)
frame2

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [167]:
frame2.T                                         # .T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


## Index objects

In [168]:
labels = pd.Index(np.arange(3))

In [169]:
obj5 = pd.Series([1.5, -2.5, 0], index=labels)
obj5

0    1.5
1   -2.5
2    0.0
dtype: float64

#### Some Index methods and properties: Python for Data Analysis, page 136

### Reindexing

In [170]:
obj6 = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj6

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [171]:
obj6 = obj6.reindex(['a', 'b', 'c', 'd', 'e'])
obj6

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [172]:
obj7 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [173]:
# .reindex() function has argument 'method'
# method 'ffill' forward-fills the values                                  !!!
obj7.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [174]:
frame3 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame3

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [175]:
frame4 = frame3.reindex(['a', 'b', 'c', 'd'])
frame4

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [176]:
states = ['Texas', 'Utah', 'California']
frame4 = frame3.reindex(columns=states)
frame4

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [177]:
# An alternative is loc attribute, but it reindexes only existing labels
frame4.loc[['a','c','d'], states]

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


#### Reindex function arguments: Python for Data Analysis, page 138

In [178]:
# dataframe.drop() - drops columns or rows, might do it in-place(inplace = True)

## Data alignment

### fill value

In [179]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df1.loc[2, 'c'] = np.nan

In [180]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,,11.0


In [181]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [182]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,,24.0,
3,,,,,


In [183]:
df1.add(df2, fill_value=0)                 # df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,12.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


## Function application and mapping

In [184]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [185]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)                                                    #df.apply(func, axis = ...)

Unnamed: 0,b,d,e
min,-0.384054,-2.060141,-1.099891
max,1.462108,1.133769,1.144724


In [186]:
format = lambda x: '%.2f' % x
frame.applymap(format)                                            #df.applymap(func) (for Series only .map())

Unnamed: 0,b,d,e
Utah,1.46,-2.06,-0.32
Ohio,-0.38,1.13,-1.1
Texas,-0.17,-0.88,0.04
Oregon,0.58,-1.1,1.14


## Sorting and ranking

In [187]:
frame_sort = pd.DataFrame(np.arange(8).reshape((2, 4)),
                          index=['three', 'one'],
                          columns=['d', 'a', 'b', 'c'])
frame_sort

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [188]:
frame_sort.sort_index()                                           #df.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [189]:
frame_sort.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [190]:
frame_val = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}).sort_index(axis=1)

In [191]:
frame_val.sort_values(by = ['a','b'])                             #df.sort_values(by = ...)

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


## Summarizing and Computing Descriptive Statistics

In [192]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [193]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [194]:
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [195]:
df.sum(axis = 1, skipna=False)                          #skipna = ...

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [196]:
df.idxmax()                       #df.idxmax(): returns the index value where the minimum 
                                  #or maximum values are attained

one    b
two    d
dtype: object

In [197]:
df.cumsum(skipna=False)

Unnamed: 0,one,two
a,1.4,
b,8.5,
c,,
d,,


In [198]:
df.describe()                                       # df.describe(): produces multiple summary statistics in one shot

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


#### Descriptive and summary statistics: Python for Data Analysis, page 160

In [199]:
import pandas_datareader.data as web

In [200]:
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

RemoteDataError: Unable to read URL: https://finance.yahoo.com/quote/AAPL/history?period1=1470272400&period2=1628038799&interval=1d&frequency=1d&filter=history
Response Text:
b'<!DOCTYPE html>\n  <html lang="en-us"><head>\n  <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n      <meta charset="utf-8">\n      <title>Yahoo</title>\n      <meta name="viewport" content="width=device-width,initial-scale=1,minimal-ui">\n      <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n      <style>\n  html {\n      height: 100%;\n  }\n  body {\n      background: #fafafc url(https://s.yimg.com/nn/img/sad-panda-201402200631.png) 50% 50%;\n      background-size: cover;\n      height: 100%;\n      text-align: center;\n      font: 300 18px "helvetica neue", helvetica, verdana, tahoma, arial, sans-serif;\n  }\n  table {\n      height: 100%;\n      width: 100%;\n      table-layout: fixed;\n      border-collapse: collapse;\n      border-spacing: 0;\n      border: none;\n  }\n  h1 {\n      font-size: 42px;\n      font-weight: 400;\n      color: #400090;\n  }\n  p {\n      color: #1A1A1A;\n  }\n  #message-1 {\n      font-weight: bold;\n      margin: 0;\n  }\n  #message-2 {\n      display: inline-block;\n      *display: inline;\n      zoom: 1;\n      max-width: 17em;\n      _width: 17em;\n  }\n      </style>\n  <script>\n    document.write(\'<img src="//geo.yahoo.com/b?s=1197757129&t=\'+new Date().getTime()+\'&src=aws&err_url=\'+encodeURIComponent(document.URL)+\'&err=%<pssc>&test=\'+encodeURIComponent(\'%<{Bucket}cqh[:200]>\')+\'" width="0px" height="0px"/>\');var beacon = new Image();beacon.src="//bcn.fp.yahoo.com/p?s=1197757129&t="+new Date().getTime()+"&src=aws&err_url="+encodeURIComponent(document.URL)+"&err=%<pssc>&test="+encodeURIComponent(\'%<{Bucket}cqh[:200]>\');\n  </script>\n  </head>\n  <body>\n  <!-- status code : 404 -->\n  <!-- Not Found on Server -->\n  <table>\n  <tbody><tr>\n      <td>\n      <img src="https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_205x58_frontpage.png" alt="Yahoo Logo">\n      <h1 style="margin-top:20px;">Will be right back...</h1>\n      <p id="message-1">Thank you for your patience.</p>\n      <p id="message-2">Our engineers are working quickly to resolve the issue.</p>\n      </td>\n  </tr>\n  </tbody></table>\n  </body></html>'

In [None]:
returns = price.pct_change()
returns.tail()

In [None]:
returns.corr()                                                 # df.corr()

In [None]:
returns.cov()                                                   # df.cov()

In [None]:
returns.corrwith(returns.MSFT)                                 # df.corrwith(series)

## Unique Values, Value Counts, and Membership

In [203]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [204]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [205]:
pd.Index(unique_vals).get_indexer(to_match)                     # pd.Index().get_indexer()

array([0, 2, 1, 1, 0, 2], dtype=int64)

#### Unique, value counts, and set membership methods: Python for Data Analysis, page 164

In [201]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})

In [202]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [206]:
data.value_counts()                                # shit

Qu1  Qu2  Qu3
4    3    4      1
     1    2      1
3    3    5      1
     2    4      1
1    2    1      1
dtype: int64

In [207]:
result = data.apply(pd.value_counts).fillna(0)      #pd.value_counts()

In [208]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
