In [None]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
# Suppress every warning category for the whole session. Later cells trigger
# NumPy RuntimeWarnings on purpose (log(0), 0/0, 1/0) and only the printed
# results are of interest there.
warnings.filterwarnings(action='ignore')

# Timestamp taken when the notebook starts running.
# NOTE(review): presumably read later to report elapsed time - confirm.
start = datetime.now()

# 1. Numpy

## 1.1 Numpy Arrays

Numpy arrays essentially come in two flavors: vectors and matrices. Vectors are strictly 1-d arrays and matrices are 2-d (but you should note a matrix can still have only one row or one column).

In [None]:
# Plain nested list that seeds the first 2-D array.
my_matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# (label, array) pairs, evaluated top to bottom so the global random generator
# is consumed in exactly the order the results are displayed.
creation_demos = [
    ('Array\n', np.array(my_matrix)),
    ('\nZeros\n', np.zeros(5)),
    ('\nOnes\n', np.ones((3, 3))),
    ('\nArange\n', np.arange(0, 11, 2)),
    ('\nLinspace\n', np.linspace(0, 10, 10)),
    ('\nEye\n', np.eye(4)),
    ('\nRandom Rand\n', np.random.rand(2)),
    ('\nRandom Rand\n', np.random.rand(3, 3)),
    ('\nRandom Rand Normal\n', np.random.randn(3, 3)),
    ('\nRandom Rand Int\n', np.random.randint(1, 100, (2, 2))),
    ('\nReshape\n', np.arange(9).reshape(3, 3)),
]
for heading, result in creation_demos:
    print(heading, result)

# max
# Ten random integers drawn from [0, 50): show the largest value and its position.
arr_rand = np.random.randint(0, 50, 10)
for heading, result in (('\nMax\n', arr_rand.max()),
                        ('\nArgmax\n', arr_rand.argmax())):
    print(heading, result)

Array
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

Zeros
 [0. 0. 0. 0. 0.]

Ones
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

Arange
 [ 0  2  4  6  8 10]

Linspace
 [ 0.          1.11111111  2.22222222  3.33333333  4.44444444  5.55555556
  6.66666667  7.77777778  8.88888889 10.        ]

Eye
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

Random Rand
 [0.45901623 0.58482059]

Random Rand
 [[0.79430113 0.91239353 0.51920967]
 [0.53723593 0.96435342 0.97739791]
 [0.8136293  0.92137765 0.23479322]]

Random Rand Normal
 [[ 0.15589513 -0.29845611 -0.20512574]
 [-1.06864667 -0.68600105 -0.44838099]
 [-0.19529009 -0.44529382  0.22278329]]

Random Rand Int
 [[35 88]
 [93 74]]

Reshape
 [[0 1 2]
 [3 4 5]
 [6 7 8]]

Max
 42

Argmax
 5


## 1.2 Numpy Indexing & Selection

In [None]:
### 1D array
# indexing: the whole vector, a single element, then a half-open slice
arr = np.arange(11)
for shown in (arr, arr[8], arr[1:5]):
    print(shown)

# altering values: assigning a scalar to a slice broadcasts it to every slot
first_five = slice(0, 5)
print(arr[first_five])
arr[first_five] = 100
print(arr[first_five])

# copy
# Run the same experiment twice: once on a plain slice (a view that shares
# memory with `arr`) and once on an explicit copy (independent storage).
experiments = (
    (False, 'Original array changed! Use .copy to avoid this problem '),
    (True, 'Original array remains!'),
)
for take_copy, verdict in experiments:
    arr = np.arange(11)
    print(f"Original Array: {arr}")
    window = arr[:6]
    slice_of_arr = window.copy() if take_copy else window
    print(f"Slice of Array: {slice_of_arr}")
    slice_of_arr[:] = 99
    print(f"Altered Array: {slice_of_arr}")
    print(f"Original Array: {arr}")
    print(verdict)

[ 0  1  2  3  4  5  6  7  8  9 10]
8
[1 2 3 4]
[0 1 2 3 4]
[100 100 100 100 100]
Original Array: [ 0  1  2  3  4  5  6  7  8  9 10]
Slice of Array: [0 1 2 3 4 5]
Altered Array: [99 99 99 99 99 99]
Original Array: [99 99 99 99 99 99  6  7  8  9 10]
Original array changed! Use .copy to avoid this problem 
Original Array: [ 0  1  2  3  4  5  6  7  8  9 10]
Slice of Array: [0 1 2 3 4 5]
Altered Array: [99 99 99 99 99 99]
Original Array: [ 0  1  2  3  4  5  6  7  8  9 10]
Original array remains!


In [None]:
# 2D array
# Basic indexing on a 3x3 matrix: whole rows, single elements and sub-blocks.
arr_2d = np.array(([5,10,15],[20,25,30],[35,40,45]))
print("Original Array:")
print(arr_2d)
print("2nd Row:")
print(arr_2d[1])
print("2nd Row First Elements:")
print(arr_2d[1,0])
# Rows 0-1 and columns from position 1 onwards (slice bounds are 0-based).
print("End at 2nd row, start at 1st column:")
print(arr_2d[:2,1:])
print("Last row all columns:")
print(arr_2d[2,:])

# fancy indexing
# Fill row i with the value i so that selected rows are easy to recognise.
arr_2d = np.zeros((10,10))
# arr_2d[i] indexes axis 0, so the loop bound must be the ROW count (shape[0]).
# The previous shape[1] bound only worked because the array is square.
arr_length = arr_2d.shape[0]
for i in range(arr_length):
    arr_2d[i] = i
print(arr_2d)
print("")
# A list of row positions selects those rows, in the order given.
print(arr_2d[[2,4,6,8]])
print("")
print(arr_2d[[6,4,2,7]])

# selection
# A comparison yields a boolean mask; indexing with it keeps the True entries.
arr = np.arange(1,11)
print(arr)
print(arr > 4)
print(arr[arr > 4])

Original Array:
[[ 5 10 15]
 [20 25 30]
 [35 40 45]]
2nd Row:
[20 25 30]
2nd Row First Elements:
20
End at 2nd row, start at 1st column:
[[10 15]
 [25 30]]
Last row all columns:
[35 40 45]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
 [6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
 [7. 7. 7. 7. 7. 7. 7. 7. 7. 7.]
 [8. 8. 8. 8. 8. 8. 8. 8. 8. 8.]
 [9. 9. 9. 9. 9. 9. 9. 9. 9. 9.]]

[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
 [8. 8. 8. 8. 8. 8. 8. 8. 8. 8.]]

[[6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [7. 7. 7. 7. 7. 7. 7. 7. 7. 7.]]
[ 1  2  3  4  5  6  7  8  9 10]
[False False False False  True  True  True  True  True  True]
[ 5  6  7  8  9 10]


In [None]:
# numpy operation
arr = np.arange(5)
print(arr)

# Element-wise arithmetic and universal functions, shown one result per line.
# np.log(0) evaluates to -inf rather than raising.
elementwise_results = (
    arr * arr,
    arr ** 3,
    np.sqrt(arr),
    np.exp(arr),
    np.sin(arr),
    np.log(arr),
)
for result in elementwise_results:
    print(result)

# Reductions over the whole array.
for reduce_fn in (arr.max, arr.sum, arr.std):
    print(reduce_fn())

# Division by an array containing 0: 0/0 gives nan and 1/0 gives inf.
for numerator in (arr, 1):
    print(numerator / arr)
print("0 divide by zero, nan")
print("1 divide by zero, inf")

[0 1 2 3 4]
[ 0  1  4  9 16]
[ 0  1  8 27 64]
[0.         1.         1.41421356 1.73205081 2.        ]
[ 1.          2.71828183  7.3890561  20.08553692 54.59815003]
[ 0.          0.84147098  0.90929743  0.14112001 -0.7568025 ]
[      -inf 0.         0.69314718 1.09861229 1.38629436]
4
10
1.4142135623730951
[nan  1.  1.  1.  1.]
[       inf 1.         0.5        0.33333333 0.25      ]
0 divide by zero, nan
1 divide by zero, inf


# 2. Pandas

## 2.1 Series
A Series is very similar to a NumPy array, but it holds only a single column of data.

What differentiates the NumPy array from a Series, is that a Series can have axis labels, meaning it can be indexed by a label, instead of just a number location. It also doesn't need to hold numeric data, it can hold any arbitrary Python Object.

In [None]:
labels = ['a', 'b', 'c']
my_list = [10, 20, 30]
d = dict(zip(labels, my_list))

# can be done for list and numpy array
# Values come from the list, the index from the explicit labels.
labelled_series = pd.Series(my_list, index=labels)
print(labelled_series)

# can be done for dictionaries
# Keys become the index, values become the data.
print('\n')
series_from_dict = pd.Series(d)
print(series_from_dict)

# series for string
# Non-numeric data is allowed; a default integer index is generated.
print('\n')
string_series = pd.Series(labels)
print(string_series)

# operation
# Addition aligns on index labels; a label present on one side only yields NaN.
ser1 = pd.Series({'USA': 1, 'Germany': 2, 'USSR': 3, 'Japan': 4})
ser2 = pd.Series({'USA': 1, 'Germany': 2, 'Italy': 5, 'Japan': 4})
print('\n')
print(ser1.add(ser2))

a    10
b    20
c    30
dtype: int64


a    10
b    20
c    30
dtype: int64


0    a
1    b
2    c
dtype: object


Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64


## 2.2 Dataframe

In [None]:
### creating
row_labels, column_labels = list('ABC'), list('WXY')
df = pd.DataFrame(np.random.randn(3, 3), index=row_labels, columns=column_labels)
print(df, '\n')

# A single column label returns a Series ...
w_column = df['W']
print(type(w_column))
print(w_column, '\n')

# ... while a list of labels returns a DataFrame.
wy_columns = df[['W', 'Y']]
print(type(wy_columns), '\n')
print(wy_columns, '\n')

# new columns
df['new'] = df['W'].add(df['Y'])
print(df, '\n')

# inplace=True mutates df itself; without it drop() only returns a new frame
df.drop(columns='new', inplace=True)
print(df, '\n')

# No inplace here, so df keeps row 'C'; only the printed result lacks it.
print(df.drop(index='C'), '\n')

# selecting row by label
print(df.loc['A'], '\n')

# selecting row by position: position 2 is the third row, all columns
print(df.iloc[2], '\n')

# row and columns: a scalar lookup, then a sub-frame
print(df.loc['B', 'Y'])
print(df.loc[['A', 'B'], ['W', 'Y']])

# Boolean selection: a whole-frame mask, then row filters built from column 'W'.
is_positive = df > 0
print(is_positive)
print(df[is_positive])
w_is_positive = df['W'] > 0
print(df[w_is_positive])
print(df.loc[w_is_positive, ['Y', 'X']])
print(df[w_is_positive & (df['Y'] < 0)])

# Overwrite a single cell by row and column label.
df.at['A', 'W'] = 20
print(df)

          W         X         Y
A -0.745107 -0.869958  2.237636
B -0.657234 -1.420785  1.467694
C -0.526107  0.942142  1.854466 

<class 'pandas.core.series.Series'>
A   -0.745107
B   -0.657234
C   -0.526107
Name: W, dtype: float64 

<class 'pandas.core.frame.DataFrame'> 

          W         Y
A -0.745107  2.237636
B -0.657234  1.467694
C -0.526107  1.854466 

          W         X         Y       new
A -0.745107 -0.869958  2.237636  1.492530
B -0.657234 -1.420785  1.467694  0.810460
C -0.526107  0.942142  1.854466  1.328359 

          W         X         Y
A -0.745107 -0.869958  2.237636
B -0.657234 -1.420785  1.467694
C -0.526107  0.942142  1.854466 

          W         X         Y
A -0.745107 -0.869958  2.237636
B -0.657234 -1.420785  1.467694 

W   -0.745107
X   -0.869958
Y    2.237636
Name: A, dtype: float64 

W   -0.526107
X    0.942142
Y    1.854466
Name: C, dtype: float64 

1.4676936213959868
          W         Y
A -0.745107  2.237636
B -0.657234  1.467694
       W      X  

In [None]:
# index
# reset_index() returns a new frame with the old index as a column; df is unchanged.
print(df.reset_index())
df['States'] = ['CA', 'NY', 'WY']
print(df)
# Promote the new column to be the index (inplace, so df itself changes).
df.set_index('States', inplace=True)
print(df)

# multi index
# Index Levels
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
index_pairs = list(zip(outside, inside))
print(index_pairs)
hier_index = pd.MultiIndex.from_tuples(index_pairs)
print(hier_index)

df = pd.DataFrame(np.random.randn(6, 2), index=hier_index, columns=list('AB'))
print(df)
# Selecting an outer label drops that level; a second .loc selects the inner one.
group_one = df.loc['G1']
print(group_one)
print(group_one.loc[1])
# Name the two levels so they can be addressed by name below.
df.index.names = ['Group', 'Num']
print(df)
print(df.loc['G1'])
# Cross-section: every row whose 'Num' level equals 1, across all groups.
print(df.xs(1, level='Num'))

  index          W         X         Y States
0     A  20.000000 -0.869958  2.237636     CA
1     B  -0.657234 -1.420785  1.467694     NY
2     C  -0.526107  0.942142  1.854466     WY
           W         X         Y States
A  20.000000 -0.869958  2.237636     CA
B  -0.657234 -1.420785  1.467694     NY
C  -0.526107  0.942142  1.854466     WY
                W         X         Y
States                               
CA      20.000000 -0.869958  2.237636
NY      -0.657234 -1.420785  1.467694
WY      -0.526107  0.942142  1.854466
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )
             A         B
G1 1  0.178439 -0.202989
   2  1.696370  0.904362
   3  0.440715  0.650314
G2 1  0.484900  1.253204
   2 -0.644334 -0.429243
   3  0.306680  1.843164
          A         B
1  0.178439 -0.202989
2  1.696370  0.904362
3  0.4

## 2.3 Missing Data

In [None]:
missing = np.nan
df = pd.DataFrame({
    'A': [1, 2, missing],
    'B': [5, missing, missing],
    'C': [1, 2, 3],
})
print(df)
# Drop every row, then every column, that contains at least one missing value.
print(df.dropna())
print(df.dropna(axis='columns'))

# thresh=2 keeps only the rows that have at least 2 non-missing values
print(df.dropna(thresh=2))

# replace value
print(df.fillna('FILL VALUE'))

     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
2  NaN  NaN  3
     A    B  C
0  1.0  5.0  1
   C
0  1
1  2
2  3
     A    B  C
0  1.0  5.0  1
1  2.0  NaN  2
            A           B  C
0         1.0         5.0  1
1         2.0  FILL VALUE  2
2  FILL VALUE  FILL VALUE  3


## 2.4 Groupby

In [None]:
# Create dataframe
df = pd.DataFrame({'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]})
print(df)
print(df.groupby('Company').last())
print(df.groupby('Company').mean())
print(df.groupby('Company').describe())
print(df.groupby('Company').describe().transpose())
print(df.groupby('Company').describe().transpose()['GOOG'])

  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120
2    MSFT      Amy    340
3    MSFT  Vanessa    124
4      FB     Carl    243
5      FB    Sarah    350
          Person  Sales
Company                
FB         Sarah    350
GOOG     Charlie    120
MSFT     Vanessa    124
         Sales
Company       
FB       296.5
GOOG     160.0
MSFT     232.0
        Sales                                                        
        count   mean         std    min     25%    50%     75%    max
Company                                                              
FB        2.0  296.5   75.660426  243.0  269.75  296.5  323.25  350.0
GOOG      2.0  160.0   56.568542  120.0  140.00  160.0  180.00  200.0
MSFT      2.0  232.0  152.735065  124.0  178.00  232.0  286.00  340.0
Company              FB        GOOG        MSFT
Sales count    2.000000    2.000000    2.000000
      mean   296.500000  160.000000  232.000000
      std     75.660426   56.568542  152.735065
      min  

## 2.5 DataFrame Combination

In [None]:
# concating
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 

df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

print(pd.concat([df1,df2,df3]), '\n')
print(pd.concat([df1,df2,df3],axis=1))

      A    B    C    D
0    A0   B0   C0   D0
1    A1   B1   C1   D1
2    A2   B2   C2   D2
3    A3   B3   C3   D3
4    A4   B4   C4   D4
5    A5   B5   C5   D5
6    A6   B6   C6   D6
7    A7   B7   C7   D7
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11 

      A    B    C    D    A    B    C    D    A    B    C    D
0    A0   B0   C0   D0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
1    A1   B1   C1   D1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
2    A2   B2   C2   D2  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
3    A3   B3   C3   D3  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
4   NaN  NaN  NaN  NaN   A4   B4   C4   D4  NaN  NaN  NaN  NaN
5   NaN  NaN  NaN  NaN   A5   B5   C5   D5  NaN  NaN  NaN  NaN
6   NaN  NaN  NaN  NaN   A6   B6   C6   D6  NaN  NaN  NaN  NaN
7   NaN  NaN  NaN  NaN   A7   B7   C7   D7  NaN  NaN  NaN  NaN
8   NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   A8   B8   C8   D8
9   NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN   A9   B9   C9   D9
10  Na

In [None]:
### merging

# merge based on key
# Both frames carry the same four key values, so the inner merge keeps all rows.
shared_keys = ['K0', 'K1', 'K2', 'K3']
left = pd.DataFrame({'key': shared_keys,
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': shared_keys,
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left.merge(right, on='key'), '\n')

# merge based on 2 keys
# Rows match only when BOTH key1 and key2 agree.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
join_keys = ['key1', 'key2']

# Default strategy (inner): only key pairs present on both sides.
print(left.merge(right, on=join_keys), '\n')

# 'outer' keeps every key pair from either side,
# 'right' keeps every row of the right frame,
# 'left' keeps every row of the left frame.
for strategy in ('outer', 'right', 'left'):
    print(left.merge(right, how=strategy, on=join_keys), '\n')

  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3 

  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2 

  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3 

  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3 

  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN 



In [None]:
### joining
# join() aligns the two frames on their row index instead of on a key column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)

# 'left' (the default strategy) keeps the caller's index; 'outer' keeps the
# union of both indexes.
for strategy in ('left', 'outer'):
    print(left.join(right, how=strategy), '\n')

     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2 

      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3 



## 2.6 DataFrame Operations

In [None]:
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [444, 555, 666, 444],
    'col3': ['abc', 'def', 'ghi', 'xyz'],
})

# Frame-level overview: first rows, column labels, row index.
for overview in (df.head(), df.columns, df.index):
    print(overview)

# Distinct values of one column: the values, how many there are, and how
# often each one occurs.
col2 = df['col2']
for summary in (col2.unique(), col2.nunique(), col2.value_counts()):
    print(summary)

# Number of distinct values in every column.
print(df.nunique())

   col1  col2 col3
0     1   444  abc
1     2   555  def
2     3   666  ghi
3     4   444  xyz
Index(['col1', 'col2', 'col3'], dtype='object')
RangeIndex(start=0, stop=4, step=1)
[444 555 666]
3
444    2
555    1
666    1
Name: col2, dtype: int64
col1    4
col2    3
col3    4
dtype: int64


In [None]:
### function
def times2(x):
    """Return ``x * 2``, using whatever ``*`` means for the type of ``x``."""
    doubled = x * 2
    return doubled

# apply() accepts any callable: first the named function, then an inline lambda.
for doubler in (times2, lambda value: value * 2):
    print(df['col1'].apply(doubler))

# permanently delete columns
del df['col1']
print(df)

# sorting returns a new, reordered frame; df itself is left untouched
print(df.sort_values('col2'))

# check null values: a same-shaped boolean frame, True where a value is missing
print(df.isna())

# pivot table
# Long-format records: A and B will label the rows, C the columns, D the cells.
df = pd.DataFrame({
    'A': ['foo'] * 3 + ['bar'] * 3,
    'B': ['one', 'one', 'two', 'two', 'one', 'one'],
    'C': ['x', 'y'] * 3,
    'D': [1, 3, 2, 5, 4, 1],
})
print(df)
# (A, B) / C combinations that have no record show up as NaN.
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))

0    2
1    4
2    6
3    8
Name: col1, dtype: int64
0    2
1    4
2    6
3    8
Name: col1, dtype: int64
   col2 col3
0   444  abc
1   555  def
2   666  ghi
3   444  xyz
   col2 col3
0   444  abc
3   444  xyz
1   555  def
2   666  ghi
    col2   col3
0  False  False
1  False  False
2  False  False
3  False  False
     A    B  C  D
0  foo  one  x  1
1  foo  one  y  3
2  foo  two  x  2
3  bar  two  y  5
4  bar  one  x  4
5  bar  one  y  1
C          x    y
A   B            
bar one  4.0  1.0
    two  NaN  5.0
foo one  1.0  3.0
    two  2.0  NaN
