In [16]:
import sys
print(sys.version)
assert sys.version_info >= (3, 6)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


## Data Structures in Numpy 

* Numpy is a package that provides
    - An N-dimensional array object (tensor)
    - Indexing and slicing
    - Vectorized operations
    - Broadcasting
    - Linear Algebra functions
    - Random number generation functions
    - Descriptive statistics functions
    
Numpy reference: https://docs.scipy.org/doc/numpy-1.13.0/reference/index.html

### Numpy N-dimensional Arrays (tensors)

Numpy basics: https://docs.scipy.org/doc/numpy-1.13.0/user/basics.creation.html

* A tensor is defined by 3 attributes
    - ndim: Number of axes (dimensionality of tensor) (often called the rank of the tensor)
    - shape: tuple of integers giving the number of entries (the size) along each axis
    - dtype: the type of data contained in the tensor
    
Note: "Dimensionality" is ambiguous. It can denote either the number of entries along a specific axis or the number of axes in a tensor (i.e. the number of directions required to describe it).

In [17]:
def pr_attr(v):
    """Print the class, number of axes, shape and data type of an array `v`.

    `v` must expose `ndim`, `shape` and `dtype` attributes (e.g. a numpy
    ndarray). Nothing is returned; the summary is written to stdout, one
    attribute per line.
    """
    fields = (
        ("Class", type(v)),
        ("Number of axes", v.ndim),
        ("Shape", v.shape),
        ("Data type", v.dtype),
    )
    # Each entry ends with ", " before the line break, matching the report layout.
    print(", \n".join(f"{label}: {value}" for label, value in fields))

#### Creating N-dimensional Arrays

In [18]:
# Scalars (0 dimensional tensors)
np.ndim(5),np.shape(5)

(0, ())

In [19]:
# Vectors (1 dimensional tensor)

## Create vector from a list or tuple
v = np.array([1,2,3]) # a tuple such as (1, 2, 3) works the same way
print(v)

# One axis with three entries -> shape (3,)
pr_attr(v)


[1 2 3]
Class: <class 'numpy.ndarray'>, 
Number of axes: 1, 
Shape: (3,), 
Data type: int64


In [20]:
w = np.array((1,2,3,4));w

array([1, 2, 3, 4])

In [21]:
## Built-in constructors: vectors pre-filled with a constant
z, o = np.zeros(10), np.ones(10)
# One value per line, exactly as two separate print calls would show them
print(z, o, sep="\n")

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [22]:
# Sequences: arange leaves out the stop value, linspace includes it

r = np.arange(start=2, stop=10, step=2)     # 2, 4, 6, 8
x = np.linspace(start=0, stop=10, num=20)   # 20 evenly spaced points from 0 to 10
(r, x)

(array([2, 4, 6, 8]),
 array([ 0.        ,  0.52631579,  1.05263158,  1.57894737,  2.10526316,
         2.63157895,  3.15789474,  3.68421053,  4.21052632,  4.73684211,
         5.26315789,  5.78947368,  6.31578947,  6.84210526,  7.36842105,
         7.89473684,  8.42105263,  8.94736842,  9.47368421, 10.        ]))

In [23]:
# Difference between vector and 2 dimensional array with 1- column
v=np.array([1,2,3])
print("v\n",v)
pr_attr(v)
w = v.reshape(3,1) # 3 rows, 1 column
print("w\n",w)
pr_attr(w)

v
 [1 2 3]
Class: <class 'numpy.ndarray'>, 
Number of axes: 1, 
Shape: (3,), 
Data type: int64
w
 [[1]
 [2]
 [3]]
Class: <class 'numpy.ndarray'>, 
Number of axes: 2, 
Shape: (3, 1), 
Data type: int64


In [24]:
# Matrices (2 dimensional tensor)

v = np.arange(10)
M1 = np.reshape(v, (2, 5))  # lay the 10 entries of the 1-d array out as 2 rows x 5 columns
print(M1)
M2 = np.array([[1, 2, 3],
               [4, 5, 6]])  # nested lists: one inner list per row
M2

[[0 1 2 3 4]
 [5 6 7 8 9]]


array([[1, 2, 3],
       [4, 5, 6]])

In [25]:
pr_attr(M2)


Class: <class 'numpy.ndarray'>, 
Number of axes: 2, 
Shape: (2, 3), 
Data type: int64


In [26]:
# Identity Matrix: ones on the main diagonal, zeros elsewhere
I = np.eye(4);I

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [27]:
# 3 dimensional tensors

# Three 2x4 matrices stacked along a new leading axis -> shape (3, 2, 4)
T3 = np.array([[[1,2,3,4],   # Stacked 2-d tensors (3 leading [[[)
               [5,6,7,8]],
              [[4,3,2,1],
               [8,7,6,5]],
              [[10,11,12,13],
               [14,15,16,17]]])
print("T3\n",T3)
pr_attr(T3)



T3
 [[[ 1  2  3  4]
  [ 5  6  7  8]]

 [[ 4  3  2  1]
  [ 8  7  6  5]]

 [[10 11 12 13]
  [14 15 16 17]]]
Class: <class 'numpy.ndarray'>, 
Number of axes: 3, 
Shape: (3, 2, 4), 
Data type: int64


#### Indexing

In [28]:
# Bracket Indexing

print(f'v \n{v} \n M1 \n{M1} \n T3 \n{T3}')

print(v[2]) # vector 1 axes
print(M1[1,3]) # Matrix 2 axes (row, col)
print(T3[2,1,2]) # 3-dimensional 3 axes

v 
[0 1 2 3 4 5 6 7 8 9] 
 M1 
[[0 1 2 3 4]
 [5 6 7 8 9]] 
 T3 
[[[ 1  2  3  4]
  [ 5  6  7  8]]

 [[ 4  3  2  1]
  [ 8  7  6  5]]

 [[10 11 12 13]
  [14 15 16 17]]]
2
8
16


In [29]:
M1[1][3]

8

In [30]:
# : is the slice operator (start:stop:step); the stop index is exclusive

# v[2:6] -> indices 2,3,4,5 ; v[2:8:2] -> indices 2,4,6
print(v, v[2:6],v[2:8:2]) 

[0 1 2 3 4 5 6 7 8 9] [2 3 4 5] [2 4 6]


In [31]:
v[2:9:2]

array([2, 4, 6, 8])

In [34]:
print(v>2)

[False False False  True  True  True  True  True  True  True]


In [32]:
print(M1)
M1[:,2:4] # for all rows

[[0 1 2 3 4]
 [5 6 7 8 9]]


array([[2, 3],
       [7, 8]])

In [33]:
# Logical Indexing
print("v\n",v)
print(v[v>2],v[v>3])

v
 [0 1 2 3 4 5 6 7 8 9]
[3 4 5 6 7 8 9] [4 5 6 7 8 9]


In [35]:
# Reshape as a function
v = np.arange(16)
M3 = np.reshape(v,(4,4)) 
print(M3)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]


#### ndarray operations

* Vectorized operations: operators applied to whole arrays (elementwise)
    - no loops

In [37]:
# Reductions: collapse one row, or the whole array, into a single number
a = np.reshape(np.arange(10), (2, 5))
print("a\n", a)
# (sum of first row, product of second row, sum of every entry)
(a[0, :].sum(), a[1, :].prod(), a.sum())

a
 [[0 1 2 3 4]
 [5 6 7 8 9]]


(10, 15120, 45)

In [38]:
(np.sqrt(a),np.exp(a))

(array([[0.        , 1.        , 1.41421356, 1.73205081, 2.        ],
        [2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ]]),
 array([[1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
         5.45981500e+01],
        [1.48413159e+02, 4.03428793e+02, 1.09663316e+03, 2.98095799e+03,
         8.10308393e+03]]))

In [39]:
M = np.exp(a)
print(M)
np.round(M,decimals=2)

[[1.00000000e+00 2.71828183e+00 7.38905610e+00 2.00855369e+01
  5.45981500e+01]
 [1.48413159e+02 4.03428793e+02 1.09663316e+03 2.98095799e+03
  8.10308393e+03]]


array([[1.00000e+00, 2.72000e+00, 7.39000e+00, 2.00900e+01, 5.46000e+01],
       [1.48410e+02, 4.03430e+02, 1.09663e+03, 2.98096e+03, 8.10308e+03]])

In [41]:
arr = np.array([[0.14, 0.18], [0.20, 0.27]])
np.around(arr, decimals=1)

array([[0.1, 0.2],
       [0.2, 0.3]])

#### Random Methods

https://docs.scipy.org/doc/numpy-1.14.0/reference/routines.random.html


#### Vector of random floats in half open interval [0, 1)

In [42]:
# Vector of random floats in half open interval [0, 1)
v = np.random.random(5) # size = 5
v2 = np.random.sample(5) # also 5 floats from [0, 1)
print("v\n",v)
print("v2\n",v2)

v
 [0.02687579 0.5965505  0.36985707 0.25336577 0.47360723]
v2
 [0.12167852 0.70625028 0.5562863  0.01114335 0.70510703]


#### Reproducible random number generation

In [50]:
# Random Seed: seeding the global generator makes the following draws repeatable
np.random.seed(1234) # some non-negative number

v1 = np.random.random(3)
# Re-seeding with the same value restarts the stream, so v2 repeats v1 exactly
np.random.seed(1234)
v2 = np.random.random(3)
print("v1\n",v1)
print("v2\n",v2)


v1
 [0.19151945 0.62210877 0.43772774]
v2
 [0.19151945 0.62210877 0.43772774]


#### Vector of random integers 

In [52]:
v = np.random.randint(10,20,5) # from,to (exclusive),size
v

array([16, 18, 10, 15, 10])

#### Random sampling

In [53]:
# 20 rolls of a fair 6-sided die (no p given, so every face is equally likely)
rolls = np.random.choice(np.arange(1,7),size = 20)
print(rolls)
# 20 rolls of a loaded die: p lists one probability per face, so only 1 and 4 can occur
rolls = np.random.choice(np.arange(1,7),size = 20,p=(1/2,0,0,1/2,0,0))
print(rolls)

[2 3 1 4 5 6 3 3 4 4 1 2 4 1 4 3 4 5 2 4]
[1 4 1 4 4 1 4 1 4 4 1 4 1 4 1 1 1 4 4 4]


#### Permute a sequence

In [54]:
seq_permuted = np.random.permutation([1,2,3,4]);seq_permuted

array([3, 1, 2, 4])

In [55]:
# Create an array of the given shape and fill with random samples 
r1 = np.random.rand(10) # in half open interval[0, 1) (i.e. Uniform distribution)
r2 = np.random.randn(2,5) # From Normal 
print(r1)
print(r2)
print(r1.shape,r2.shape)

[0.43015136 0.76453077 0.59973081 0.08094696 0.70454447 0.16401332
 0.03234935 0.32815036 0.47386    0.06808472]
[[-0.90772784 -0.27911369  0.24412027  0.36605638 -0.27100124]
 [-0.07120714  0.08483683  0.12041429 -1.26864685  0.52483734]]
(10,) (2, 5)


#### Broadcasting

https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html

* How numpy treats arrays of different shapes
* In general, numpy broadcasts the smaller array across the larger array so that their shapes become compatible
* For an operation on two arrays, their shapes are compared element-wise (starting with the trailing dimensions)
    - Two dimensions are compatible when
        - they are equal
        - one of them is 1

In [56]:
a = np.arange(10)
c = 3.0
print(a)
print(a*c)

[0 1 2 3 4 5 6 7 8 9]
[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27.]


In [57]:
b = np.arange(5)
print(b)
# a has shape (10,) and b has shape (5,): the trailing dimensions differ and
# neither is 1, so broadcasting fails. The error is the point of this cell, so
# catch it and show its message; that way the notebook still runs top to bottom.
try:
    print((a + b))
except ValueError as err:
    print(f"ValueError: {err}")

[0 1 2 3 4]


ValueError: operands could not be broadcast together with shapes (10,) (5,) 

In [58]:
b = b.reshape(5,1)
print(a + b)

[[ 0  1  2  3  4  5  6  7  8  9]
 [ 1  2  3  4  5  6  7  8  9 10]
 [ 2  3  4  5  6  7  8  9 10 11]
 [ 3  4  5  6  7  8  9 10 11 12]
 [ 4  5  6  7  8  9 10 11 12 13]]


In [59]:
a = a.reshape(-1,1)  # -1 lets numpy infer the row count: (10,) -> (10, 1)
print(a)
print(a.shape,b.shape)
# (10, 1) against (5, 1): the trailing dimensions match, but the leading ones
# are 10 and 5 and neither is 1, so broadcasting fails. Catch the expected
# error and print its message so a full re-run does not stop here.
try:
    print(a + b)
except ValueError as err:
    print(f"ValueError: {err}")

[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
(10, 1) (5, 1)


ValueError: operands could not be broadcast together with shapes (10,1) (5,1) 

In [60]:
a=a.reshape((10,))
print(a + b)

[[ 0  1  2  3  4  5  6  7  8  9]
 [ 1  2  3  4  5  6  7  8  9 10]
 [ 2  3  4  5  6  7  8  9 10 11]
 [ 3  4  5  6  7  8  9 10 11 12]
 [ 4  5  6  7  8  9 10 11 12 13]]


### Compare floating point numbers

In [61]:
x = .0000000000001
print(x == 0)
np.isclose(x,0)

False


True

### Data Structures in Pandas

* Pandas is a package that provides:
    - Data Frames: data structures designed for data analysis
    - Data Input/Output functions
    
Quick look at Pandas: https://pandas.pydata.org/pandas-docs/stable/10min.html


#### Pandas Series

* A NumPy array (vector) and an index that labels each element in the vector.

In [62]:
vals = pd.Series([2,3,5,7,9,11])
print(type(vals))
vals

<class 'pandas.core.series.Series'>


0     2
1     3
2     5
3     7
4     9
5    11
dtype: int64

#### From series to array

In [63]:
vals.values

array([ 2,  3,  5,  7,  9, 11])

In [64]:
list(vals.index)

[0, 1, 2, 3, 4, 5]

In [65]:
type(vals.index)

pandas.core.indexes.range.RangeIndex

In [66]:
# Indexing
cnts = pd.Series((1,2,3,4),index=('a','b','c','d'))
print(cnts)
cnts['a'],cnts[1]

a    1
b    2
c    3
d    4
dtype: int64


(1, 2)

In [67]:
cnts[['b','c']]

b    2
c    3
dtype: int64

In [68]:
cnts['b':'d']

b    2
c    3
d    4
dtype: int64

### Data Frames
 
* The main data structure for data analysis
* Columns are variables, rows are observations
* Number of rows in each column must be the same
* Columns can be different data types
* Extensive set of methods
* Typically created by reading in a dataset
* Some packages have built-in datasets
* Three types of indexing
    - []  
    - .loc  
    - .iloc

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

In [69]:
# Build a small frame from a dict: each key becomes a column, each list its rows
d = {
    'col1': list(range(1, 6)),
    'col2': list("abcde"),
    "col3": [True, True, False, False, False],
}
df = pd.DataFrame(data=d)
print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,True
2,3,c,False
3,4,d,False
4,5,e,False


In [70]:
df.head(2)

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,True


In [71]:
print(type(df.col1))
df.col1

<class 'pandas.core.series.Series'>


0    1
1    2
2    3
3    4
4    5
Name: col1, dtype: int64

In [72]:
print(type(df.col1),type(df.col1.values))
df.col1.values

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>


array([1, 2, 3, 4, 5])

#### Indexing

Pandas indexing: https://pandas.pydata.org/pandas-docs/stable/indexing.html

In [73]:
df.tail()

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,True
2,3,c,False
3,4,d,False
4,5,e,False


In [74]:
# Indexing
print("1st row, 2nd column = ", df.iloc[0,1])
print("2nd row, 2nd column = ", df.loc[1,"col2"])
df[["col1","col2"]]

1st row, 2nd column =  a
2nd row, 2nd column =  b


Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [75]:
df.iloc[0]

col1       1
col2       a
col3    True
Name: 0, dtype: object

In [76]:
# Indexing
print(df.iloc[1:3,:])
df.loc[:,"col1":"col3"]

   col1 col2   col3
1     2    b   True
2     3    c  False


Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,True
2,3,c,False
3,4,d,False
4,5,e,False


In [77]:
# Filtering (subsetting) by logical indexing
df.col1[(df.col1 >2) & (df.col1 <5)]

2    3
3    4
Name: col1, dtype: int64

In [78]:
df.loc[df.col1>2,"col1":"col2"]

Unnamed: 0,col1,col2
2,3,c
3,4,d
4,5,e


In [79]:
# Filtering (subsetting) by Pandas query method
df.query('col1 > 2 & col1 < 5')

Unnamed: 0,col1,col2,col3
2,3,c,False
3,4,d,False


In [80]:
# Add a column

df['col4'] = [1]*len(df)
print(df.head())
df.loc[:,["col2","col4"]]

   col1 col2   col3  col4
0     1    a   True     1
1     2    b   True     1
2     3    c  False     1
3     4    d  False     1
4     5    e  False     1


Unnamed: 0,col2,col4
0,a,1
1,b,1
2,c,1
3,d,1
4,e,1


#### Column Names

In [81]:
# Three integer columns; the column labels are then pulled out as a plain list
df = pd.DataFrame({
    "a": (1, 2, 3, 4),
    "b": (5, 6, 7, 8),
    "c": (9, 10, 11, 12),
})
col_names = list(df.columns)
col_names

['a', 'b', 'c']

In [82]:
# Re-arrange Columns: move the last column name to the front
# NOTE: col_names and df are rebound from their own previous values, so
# re-running this cell rotates the columns by one more position each time.
col_names = [col_names[-1]] + col_names[0:-1]
df = df.loc[:,col_names]
df.head()

Unnamed: 0,c,a,b
0,9,1,5
1,10,2,6
2,11,3,7
3,12,4,8


#### Read dataset from file

In [83]:
df = pd.read_csv("daphnia.csv")
df.head()

Unnamed: 0,Growth.rate,Water,Detergent,Daphnia
0,2.919086,Tyne,BrandA,Clone1
1,2.492904,Tyne,BrandA,Clone1
2,3.021804,Tyne,BrandA,Clone1
3,2.350874,Tyne,BrandA,Clone2
4,3.148174,Tyne,BrandA,Clone2


In [84]:
df2 = pd.read_table("Arthritis.txt")
df2.tail()

Unnamed: 0,"ID ""Treatment"" ""Sex"" ""Age"" ""Improved"""
79,"80 32 ""Placebo"" ""Female"" 66 ""None"""
80,"81 42 ""Placebo"" ""Female"" 66 ""None"""
81,"82 15 ""Placebo"" ""Female"" 66 ""Some"""
82,"83 71 ""Placebo"" ""Female"" 68 ""Some"""
83,"84 1 ""Placebo"" ""Female"" 74 ""Marked"""


In [85]:
df3 = pd.read_csv("Arthritis.txt",sep=" ")
df3.head()

Unnamed: 0,ID,Treatment,Sex,Age,Improved
1,57,Treated,Male,27,Some
2,46,Treated,Male,29,
3,77,Treated,Male,30,
4,17,Treated,Male,32,Marked
5,36,Treated,Male,46,Marked


In [86]:
df.Water.head()

0    Tyne
1    Tyne
2    Tyne
3    Tyne
4    Tyne
Name: Water, dtype: object

In [87]:
# Rename the column by rebinding df to the frame that rename() returns, rather
# than mutating the existing object in place; the displayed result is the same.
df = df.rename(columns={'Growth.rate': 'Growth'})
df.head()

Unnamed: 0,Growth,Water,Detergent,Daphnia
0,2.919086,Tyne,BrandA,Clone1
1,2.492904,Tyne,BrandA,Clone1
2,3.021804,Tyne,BrandA,Clone1
3,2.350874,Tyne,BrandA,Clone2
4,3.148174,Tyne,BrandA,Clone2


#### Read URL

In [88]:
# pandas is already imported as pd at the top of the notebook, so it is not
# re-imported here.
url="https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
# read_csv is given the URL directly (Pandas version 0.19.2 or later); this
# cell needs network access.
df = pd.read_csv(url)
df.head()

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


#### Attribute references are pointers

In [89]:
# Reload the daphnia data so the original 'Growth.rate' column name is back
df = pd.read_csv("daphnia.csv")

# A single column selected with .loc is a Series ...
v = df.loc[:,'Growth.rate']
print("Type of v: ",type(v))

# ... and its .values attribute is the underlying numpy array
v2 = v.values
print("Type of v2: ",type(v2))

Type of v:  <class 'pandas.core.series.Series'>
Type of v2:  <class 'numpy.ndarray'>


In [90]:
# v was selected from df with .loc. In the run recorded here, writing to v also
# changed row 2 of df, and pandas emitted a SettingWithCopy warning.
# NOTE(review): whether v shares memory with df depends on the pandas version
# and its copy-on-write setting; confirm before relying on this behaviour.
print(v[2])
v[2] = 0
print(v[2])
df.head()

3.0218042510000003
0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Growth.rate,Water,Detergent,Daphnia
0,2.919086,Tyne,BrandA,Clone1
1,2.492904,Tyne,BrandA,Clone1
2,0.0,Tyne,BrandA,Clone1
3,2.350874,Tyne,BrandA,Clone2
4,3.148174,Tyne,BrandA,Clone2


### Datasets



In [91]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [92]:
X = np.asarray(iris)
print("Iris is a ",type(iris),"   X is a ",type(X))
[iris.iloc[3,2], X[3,2]]

Iris is a  <class 'pandas.core.frame.DataFrame'>    X is a  <class 'numpy.ndarray'>


[1.5, 1.5]

In [93]:
iris['sepal_length'].mean()

5.843333333333335

In [94]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [None]:
# Rows 140 onward, restricted to the first four (measurement) columns.
# Sliced from iris directly rather than from X itself, so re-running the cell
# gives the same rows instead of slicing an already-sliced (then empty) array.
X = np.asarray(iris)[140:,0:4]
X

In [95]:
sns.get_dataset_names()

['anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'iris',
 'mpg',
 'planets',
 'tips',
 'titanic']