# Introduction to Data Analysis with Python


<img src="https://www.python.org/static/img/python-logo.png" alt="yogen" style="width: 200px; float: right;"/>
<br>
<br>
<br>


## `pandas`

### Getting started with pandas

In [None]:
import pandas as pd
import numpy as np

### `pandas` data structures

### Series

The base pandas abstraction. You can think of it as the love child of a numpy array and a dictionary.

In [None]:
s = pd.Series([4, 7, -5, 3])
s

0    4
1    7
2   -5
3    3
dtype: int64

If we provide an index, pandas will use it. If not, it will automatically create one.

In [None]:
print(s.index)
print(s.values)

RangeIndex(start=0, stop=4, step=1)
[ 4  7 -5  3]


In [None]:
list('ifneurh')

['i', 'f', 'n', 'e', 'u', 'r', 'h']

In [None]:
s2 = pd.Series([1, 2, 4.5, 7, 2, 23, 15], index=['i', 'f', 'n', 'e', 'u', 'r', 'h'])
s2

i     1.0
f     2.0
n     4.5
e     7.0
u     2.0
r    23.0
h    15.0
dtype: float64

In [None]:
s2['r']

23.0

In [None]:
s2 > 3

i    False
f    False
n     True
e     True
u    False
r     True
h     True
dtype: bool

In [None]:
s2[s2>3]

n     4.5
e     7.0
r    23.0
h    15.0
dtype: float64

In [None]:
evens = s2 % 2 == 0

In [None]:
s2[evens]

f    2.0
u    2.0
dtype: float64

In [None]:
s2 * 2

i     2.0
f     4.0
n     9.0
e    14.0
u     4.0
r    46.0
h    30.0
dtype: float64

In [None]:
np.exp(s2)

i    2.718282e+00
f    7.389056e+00
n    9.001713e+01
e    1.096633e+03
u    7.389056e+00
r    9.744803e+09
h    3.269017e+06
dtype: float64

In [None]:
'f' in s2

True

In [None]:
clase = pd.Series([34, 22, 45, 72], index=['Toni', 'Fulanito', 'Menganito', 'Victor'])

In [None]:
clase[clase==22].index

Index(['Fulanito'], dtype='object')

We can create Series from dictionaries:

In [None]:
sdata = {'B' : 3e6, 'M': 6e6, 'P': 1.2e5, 'V': 7e5}

s3 = pd.Series(sdata)
s3

B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
dtype: float64

In [None]:
increase = {'M': 4e5, 'B' : 2e5, 'Z': -2e4}

s4 = pd.Series(increase)

And here is where the magic happens: numpy arrays only identify their contents by position. In contrast, pandas knows their "name" and will align them based on their indexes:

In [None]:
s3.values

array([3000000., 6000000.,  120000.,  700000.])

In [None]:
s4.values

array([400000., 200000., -20000.])

In [None]:
s3.values + s4.values

ValueError: operands could not be broadcast together with shapes (4,) (3,) 

In [None]:
s3 + s4

B    3200000.0
M    6400000.0
P          NaN
V          NaN
Z          NaN
dtype: float64

In [None]:
s3.name = 'population_2000'
s3.index.name = 'province'

In [None]:
s3

province
B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
Name: population_2000, dtype: float64

### DataFrame

This is the object you'll work with most of the time. It represents a table of _m_ observations x _n_ variables. Each variable, or column, is a Series.

In [None]:
dfdata = {
    'province' : ['M', 'M', 'M', 'B', 'B'],
    'population': [1.5e6, 2e6, 3e6, 5e5, 1.5e6],
    'year' : [1900, 1950, 2000, 1900, 2000]   
}

df = pd.DataFrame(dfdata)
df

Unnamed: 0,province,population,year
0,M,1500000.0,1900
1,M,2000000.0,1950
2,M,3000000.0,2000
3,B,500000.0,1900
4,B,1500000.0,2000


In [None]:
df2 = pd.DataFrame(dfdata, columns=['province','population', 'year', 'debt'])
df2

Unnamed: 0,province,population,year,debt
0,M,1500000.0,1900,
1,M,2000000.0,1950,
2,M,3000000.0,2000,
3,B,500000.0,1900,
4,B,1500000.0,2000,


In [None]:
df2.index

RangeIndex(start=0, stop=5, step=1)

In [None]:
df2.columns

Index(['province', 'population', 'year', 'debt'], dtype='object')

In [None]:
df2[['population','province']]

Unnamed: 0,population,province
0,1500000.0,M
1,2000000.0,M
2,3000000.0,M
3,500000.0,B
4,1500000.0,B


In [None]:
df2.population

0    1500000.0
1    2000000.0
2    3000000.0
3     500000.0
4    1500000.0
Name: population, dtype: float64

In [None]:
df2['2nd_language']=list('EEFFG')

In [None]:
df2['2nd_language'] = np.nan

In [None]:
# Assign the labels as a flat list; wrapping the list in another list would
# make pandas build a one-level MultiIndex instead of a plain Index.
df2.index = list('EEFFG')

In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language
E,M,1500000.0,1900,,
E,M,2000000.0,1950,,
F,M,3000000.0,2000,,
F,B,500000.0,1900,,
G,B,1500000.0,2000,,


In [None]:
df2['2nd_language']

E   NaN
E   NaN
F   NaN
F   NaN
G   NaN
Name: 2nd_language, dtype: float64

In [None]:
df2.2nd_language

SyntaxError: invalid syntax (<ipython-input-112-3f6d9ec47dae>, line 1)

In [None]:
# df2['abs']

In [None]:
df2.index = list('abcde')

In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,,
b,M,2000000.0,1950,,
c,M,3000000.0,2000,,
d,B,500000.0,1900,,
e,B,1500000.0,2000,,


In [None]:
df2.loc['a']

province              M
population      1.5e+06
year               1900
debt                NaN
2nd_language        NaN
Name: a, dtype: object

In [None]:
df2['debt'] = 10
df2

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,10,
b,M,2000000.0,1950,10,
c,M,3000000.0,2000,10,
d,B,500000.0,1900,10,
e,B,1500000.0,2000,10,


In [None]:
df2['debt'] = [1,0,2,.5,.7]
df2

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,1.0,
b,M,2000000.0,1950,0.0,
c,M,3000000.0,2000,2.0,
d,B,500000.0,1900,0.5,
e,B,1500000.0,2000,0.7,


In [None]:
df2['capital'] = df2['province'] == 'M'
df2

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2.T

Unnamed: 0,a,b,c,d,e
province,M,M,M,B,B
population,1.5e+06,2e+06,3e+06,500000,1.5e+06
year,1900,1950,2000,1900,2000
debt,1,0,2,0.5,0.7
2nd_language,,,,,
capital,True,True,True,False,False


In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2.describe()

Unnamed: 0,population,year,debt,2nd_language
count,5.0,5.0,5.0,0.0
mean,1700000.0,1950.0,0.84,
std,908295.1,50.0,0.74364,
min,500000.0,1900.0,0.0,
25%,1500000.0,1900.0,0.5,
50%,1500000.0,1950.0,0.7,
75%,2000000.0,2000.0,1.0,
max,3000000.0,2000.0,2.0,


In [None]:
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
population,5.0,1700000.0,908295.106229,500000.0,1500000.0,1500000.0,2000000.0,3000000.0
year,5.0,1950.0,50.0,1900.0,1900.0,1950.0,2000.0,2000.0
debt,5.0,0.84,0.74364,0.0,0.5,0.7,1.0,2.0
2nd_language,0.0,,,,,,,


### Index objects

Indexes are immutable.

In [None]:
df2.index[1] = 'x'

TypeError: Index does not support mutable operations

In [None]:
df2.index[1]

'b'

In [None]:
df2.iloc[2:]

Unnamed: 0,province,population,year,debt,2nd_language,capital
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


### Dropping entries from an axis

In [None]:
s5 = pd.Series(np.arange(5), list('jduvk'))
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [None]:
s6 = s5.drop(['d','k'])
s6

j    0
u    2
v    3
dtype: int64

In [None]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [None]:
s5.drop(['d','k'],inplace=False)

j    0
u    2
v    3
dtype: int64

In [None]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

By default, `drop()` doesn't modify the original Series: it returns a new Series without the dropped entries. We can change that with the argument `inplace`.

In [None]:
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [None]:
s6['u'] = 7
s5

j    0
d    1
u    2
v    3
k    4
dtype: int64

In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2.drop('c')

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2.drop('c', axis=0)

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df2.drop('2nd_language', axis=1)

Unnamed: 0,province,population,year,debt,capital
a,M,1500000.0,1900,1.0,True
b,M,2000000.0,1950,0.0,True
c,M,3000000.0,2000,2.0,True
d,B,500000.0,1900,0.5,False
e,B,1500000.0,2000,0.7,False


In [None]:
df3 = df2.drop('2nd_language', axis=1)

In [None]:
df3

Unnamed: 0,province,population,year,debt,capital
a,M,1500000.0,1900,1.0,True
b,M,2000000.0,1950,0.0,True
c,M,3000000.0,2000,2.0,True
d,B,500000.0,1900,0.5,False
e,B,1500000.0,2000,0.7,False


In [None]:
df4 = df3

In [None]:
df4.drop(['a','b'],inplace=True)

In [None]:
df4

Unnamed: 0,province,population,year,debt,capital
c,M,3000000.0,2000,2.0,True
d,B,500000.0,1900,0.5,False
e,B,1500000.0,2000,0.7,False


In [None]:
df3

Unnamed: 0,province,population,year,debt,capital
c,M,3000000.0,2000,2.0,True
d,B,500000.0,1900,0.5,False
e,B,1500000.0,2000,0.7,False


In [None]:
df3 = df2.copy()
df3

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


In [None]:
df3.drop('capital', axis=1, inplace=True)
df3

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,1.0,
b,M,2000000.0,1950,0.0,
c,M,3000000.0,2000,2.0,
d,B,500000.0,1900,0.5,
e,B,1500000.0,2000,0.7,


In [None]:
df2

Unnamed: 0,province,population,year,debt,2nd_language,capital
a,M,1500000.0,1900,1.0,,True
b,M,2000000.0,1950,0.0,,True
c,M,3000000.0,2000,2.0,,True
d,B,500000.0,1900,0.5,,False
e,B,1500000.0,2000,0.7,,False


### Indexing, selection, and filtering

The key here is that we can build boolean Series that we can use to index the original Series or DataFrame. Those booleans can be combined with bitwise boolean operators (&, |, ~) to get filters that are as complex as we need. 

In [None]:
s3

province
B    3000000.0
M    6000000.0
P     120000.0
V     700000.0
Name: population_2000, dtype: float64

In [None]:
s3[['V', 'M']]

province
V     700000.0
M    6000000.0
Name: population_2000, dtype: float64

In [None]:
s3[2:]

province
P    120000.0
V    700000.0
Name: population_2000, dtype: float64

In [None]:
s3['P':'V']

province
P    120000.0
V    700000.0
Name: population_2000, dtype: float64

In [None]:
s3 > 1e06

province
B     True
M     True
P    False
V    False
Name: population_2000, dtype: bool

In [None]:
s3[s3>1e06]

province
B    3000000.0
M    6000000.0
Name: population_2000, dtype: float64

In [None]:
df3

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,1.0,
b,M,2000000.0,1950,0.0,
c,M,3000000.0,2000,2.0,
d,B,500000.0,1900,0.5,
e,B,1500000.0,2000,0.7,


In [None]:
df3[df3['year'] > 1950]

Unnamed: 0,province,population,year,debt,2nd_language
c,M,3000000.0,2000,2.0,
e,B,1500000.0,2000,0.7,


In [None]:
df3[(df3['year'] > 1900) & (df3['debt'] > 1)]

Unnamed: 0,province,population,year,debt,2nd_language
c,M,3000000.0,2000,2.0,


In [None]:
recent = df3['year'] > 1900
indebted = df3['debt'] > 1

df3[recent & indebted]

Unnamed: 0,province,population,year,debt,2nd_language
c,M,3000000.0,2000,2.0,


In [None]:
df3[df3['year'] > 1900][df3['debt'] > 1]

  """Entry point for launching an IPython kernel.


Unnamed: 0,province,population,year,debt,2nd_language
c,M,3000000.0,2000,2.0,


### Function application and mapping

Function application and mapping allow us to modify the elements of a DataFrame (whole columns or rows with `apply`, individual elements with `applymap`) without writing for loops. This way we are not constrained to the functions already implemented by pandas or numpy.

In [None]:
df3

Unnamed: 0,province,population,year,debt,2nd_language
a,M,1500000.0,1900,1.0,
b,M,2000000.0,1950,0.0,
c,M,3000000.0,2000,2.0,
d,B,500000.0,1900,0.5,
e,B,1500000.0,2000,0.7,


In [None]:
np.sqrt(df3['population'])

a    1224.744871
b    1414.213562
c    1732.050808
d     707.106781
e    1224.744871
Name: population, dtype: float64

In [None]:
df4 = pd.DataFrame(np.random.randn(4,3) * 17 + 15, columns=list('bde'), index=list('BMPZ'))
df4

Unnamed: 0,b,d,e
B,16.163299,18.014108,11.478479
M,32.397748,-1.460602,-7.034481
P,-5.880499,15.338894,-3.795441
Z,21.736302,22.656725,-12.350687


In [None]:
np.abs(df4)

Unnamed: 0,b,d,e
B,16.163299,18.014108,11.478479
M,32.397748,1.460602,7.034481
P,5.880499,15.338894,3.795441
Z,21.736302,22.656725,12.350687


This is a typical use case for lambdas (anonymous functions)

In [None]:
df4.apply(lambda series: series.max() - series.min())

b    38.278247
d    24.117328
e    23.829166
dtype: float64

In [None]:
df4.applymap(lambda element: element % 10 )

Unnamed: 0,b,d,e
B,6.163299,8.014108,1.478479
M,2.397748,8.539398,2.965519
P,4.119501,5.338894,6.204559
Z,1.736302,2.656725,7.649313


In [None]:
df4.apply(lambda series: series.max() - series.min(), axis=1)

B     6.535629
M    39.432229
P    21.219393
Z    35.007412
dtype: float64

In [None]:
def f(series):
    """Summarize *series* as a two-entry Series holding its extremes.

    The result is labelled 'max' and 'min', in that order, so that
    ``DataFrame.apply(f)`` produces one 'max' row and one 'min' row.
    """
    highest = series.max()
    lowest = series.min()
    return pd.Series([highest, lowest], index=['max', 'min'])

df4.apply(f)

Unnamed: 0,b,d,e
max,32.397748,22.656725,11.478479
min,-5.880499,-1.460602,-12.350687


In [None]:
for item in df4.items():
    print(item)

('b', B    16.163299
M    32.397748
P    -5.880499
Z    21.736302
Name: b, dtype: float64)
('d', B    18.014108
M    -1.460602
P    15.338894
Z    22.656725
Name: d, dtype: float64)
('e', B    11.478479
M    -7.034481
P    -3.795441
Z   -12.350687
Name: e, dtype: float64)


In [None]:
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# DataFrame.items() yields the same (column label, Series) pairs.
for item in df4.items():
    print(item)

('b', B    16.163299
M    32.397748
P    -5.880499
Z    21.736302
Name: b, dtype: float64)
('d', B    18.014108
M    -1.460602
P    15.338894
Z    22.656725
Name: d, dtype: float64)
('e', B    11.478479
M    -7.034481
P    -3.795441
Z   -12.350687
Name: e, dtype: float64)


In [None]:
map(f, [1,2])

<map at 0x7f22108da358>

In [None]:
def format_2digits(number):
    """Return *number* rendered as a string with exactly two decimal places."""
    template = '%.2f'
    return template % number

In [None]:
df4.applymap(format_2digits)

Unnamed: 0,b,d,e
B,16.16,18.01,11.48
M,32.4,-1.46,-7.03
P,-5.88,15.34,-3.8
Z,21.74,22.66,-12.35


### Sorting and ranking

In [None]:
df4.sort_index(ascending=False)

Unnamed: 0,b,d,e
Z,21.736302,22.656725,-12.350687
P,-5.880499,15.338894,-3.795441
M,32.397748,-1.460602,-7.034481
B,16.163299,18.014108,11.478479


In [None]:
df4.sort_index(ascending=False, axis=1)

Unnamed: 0,e,d,b
B,11.478479,18.014108,16.163299
M,-7.034481,-1.460602,32.397748
P,-3.795441,15.338894,-5.880499
Z,-12.350687,22.656725,21.736302


In [None]:
df4.sort_values(by='e')

Unnamed: 0,b,d,e
Z,21.736302,22.656725,-12.350687
M,32.397748,-1.460602,-7.034481
P,-5.880499,15.338894,-3.795441
B,16.163299,18.014108,11.478479


In [None]:
df4.sort_values(by=['e','b'])

Unnamed: 0,b,d,e
Z,21.736302,22.656725,-12.350687
M,32.397748,-1.460602,-7.034481
P,-5.880499,15.338894,-3.795441
B,16.163299,18.014108,11.478479


In [None]:
s1 = pd.Series([2,3,8,4,3,2,1], index=list('abcdefg'))
s1

a    2
b    3
c    8
d    4
e    3
f    2
g    1
dtype: int64

In [None]:
s1.sort_values()

g    1
a    2
f    2
b    3
e    3
d    4
c    8
dtype: int64

rank() returns the positions of the elements of the Series in its sorted version. If there are ties, it will take averages.

In [None]:
s1.rank()

a    2.5
b    4.5
c    7.0
d    6.0
e    4.5
f    2.5
g    1.0
dtype: float64

In [None]:
pd.Series([1,1,1]).rank()

0    2.0
1    2.0
2    2.0
dtype: float64

In [None]:
s2 = pd.Series([30,10,20], index=list('abc'))
s2

a    30
b    10
c    20
dtype: int64

In [None]:
s2.rank()

a    3.0
b    1.0
c    2.0
dtype: float64

In [None]:
help(s2.rank)

Help on method rank in module pandas.core.generic:

rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) method of pandas.core.series.Series instance
    Compute numerical data ranks (1 through n) along axis. Equal values are
    assigned a rank that is the average of the ranks of those values
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        index to direct ranking
    method : {'average', 'min', 'max', 'first', 'dense'}
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    numeric_only : boolean, default None
        Include only float, int, boolean data. Valid only for DataFrame or
        Panel objects
    na_option : {'keep', 'top', 'bottom'}
        * keep: leave NA values where they are
    

# Exercise 1

Write a function that takes a Series and returns its top 10% of records (in this case, the top earners). Test it with this Series:

```python
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
```

# Exercise 2


In [None]:
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])

In [None]:
def top_earners(serie):
    """Return the highest 10% of the values in *serie*, sorted ascending.

    The number of entries kept is ``round(len(serie) / 10)``. When that
    rounds to zero, an empty Series is returned.
    """
    number_to_extract = round(len(serie) / 10)
    # Sort the argument itself rather than the global ``salaries``, and use
    # tail() because a ``[-0:]`` slice would return the whole Series when
    # nothing should be extracted.
    return serie.sort_values().tail(number_to_extract)

top_earners(salaries)

2    120000
0    150000
dtype: int64

In [None]:
def top_earners(serie, percentile=0.9):
    """Return the entries of *serie* whose percentile rank exceeds *percentile*.

    Ranks are computed with ``Series.rank(pct=True)``. The result keeps the
    original index order of *serie*.
    """
    pct_rank = serie.rank(pct=True)
    return serie[pct_rank > percentile]

print(top_earners(salaries))
print(top_earners(salaries, .8))

0    150000
2    120000
dtype: int64
0    150000
1     90000
2    120000
dtype: int64


## Summarizing and computing descriptive statistics

In [None]:
x = pd.Series([1.2, np.nan, 4, np.nan, 9], index=list('abcde'))
y = pd.Series([5, 3, 7, np.nan, 14], index=list('abcde'))

df = pd.DataFrame([x, y], index=['x','y']).T
df

Unnamed: 0,x,y
a,1.2,5.0
b,,3.0
c,4.0,7.0
d,,
e,9.0,14.0


In [None]:
df.sum()

x    14.2
y    29.0
dtype: float64

As with many methods, we can use them in the direction perpendicular to their default.

In [None]:
df.sum(axis=1)

a     6.2
b     3.0
c    11.0
d     NaN
e    23.0
dtype: float64

In [None]:
pd.__version__

'0.20.3'

In [None]:
df.sum(axis=1, skipna=False)

a     6.2
b     NaN
c    11.0
d     NaN
e    23.0
dtype: float64

In [None]:
df.mean()

x    4.733333
y    7.250000
dtype: float64

In [None]:
df.mean(axis=1)

a     3.1
b     3.0
c     5.5
d     NaN
e    11.5
dtype: float64

In [None]:
df.cumsum()


Unnamed: 0,x,y
a,1.2,5.0
b,,8.0
c,5.2,15.0
d,,
e,14.2,29.0


In [None]:
df.std()

x    3.951371
y    4.787136
dtype: float64

In [None]:
df.describe()

Unnamed: 0,x,y
count,3.0,4.0
mean,4.733333,7.25
std,3.951371,4.787136
min,1.2,3.0
25%,2.6,4.5
50%,4.0,6.0
75%,6.5,8.75
max,9.0,14.0


In [None]:
df['x'].sum()

14.199999999999999

In [None]:
df['x'].describe()

count    3.000000
mean     4.733333
std      3.951371
min      1.200000
25%      2.600000
50%      4.000000
75%      6.500000
max      9.000000
Name: x, dtype: float64

### Unique values, value counts, and membership

In [None]:
s7 = pd.Series(list('gtcaaagcttcga'))
s7

0     g
1     t
2     c
3     a
4     a
5     a
6     g
7     c
8     t
9     t
10    c
11    g
12    a
dtype: object

In [None]:
s7.unique()

array(['g', 't', 'c', 'a'], dtype=object)

In [None]:
s7.value_counts()

a    4
t    3
c    3
g    3
dtype: int64

In [None]:
puric_bases = ['a','g']
s7.isin(puric_bases)

0      True
1     False
2     False
3      True
4      True
5      True
6      True
7     False
8     False
9     False
10    False
11     True
12     True
dtype: bool

In [None]:
s7[s7.isin(puric_bases)]

0     g
3     a
4     a
5     a
6     g
11    g
12    a
dtype: object

## Handling missing data

In [None]:
string_data = pd.Series(['Ma', 'Lu', 'Ca', 'Va', np.nan])
string_data

0     Ma
1     Lu
2     Ca
3     Va
4    NaN
dtype: object

In [None]:
string_data[string_data!=np.nan]

0     Ma
1     Lu
2     Ca
3     Va
4    NaN
dtype: object

This is weird... but it has some really good reasons. You can find explanations [here](https://stackoverflow.com/questions/10034149/why-is-nan-not-equal-to-nan) and [here](https://stackoverflow.com/questions/1565164/what-is-the-rationale-for-all-comparisons-returning-false-for-ieee754-nan-values)

In [None]:
np.nan == np.nan

False

In [None]:
string_data[~string_data.isnull()]

0    Ma
1    Lu
2    Ca
3    Va
dtype: object

### Filtering out missing data

In [None]:
string_data[string_data.notnull()]

0    Ma
1    Lu
2    Ca
3    Va
dtype: object

In [None]:
df5 = pd.DataFrame([[1,2,3], 
                    [np.nan, 8, 7], 
                    [4, np.nan, 90], 
                    [67,42,53]], 
                   columns=list('abc'))
df5

Unnamed: 0,a,b,c
0,1.0,2.0,3
1,,8.0,7
2,4.0,,90
3,67.0,42.0,53


In [None]:
df5[df5['a'].notnull()]

Unnamed: 0,a,b,c
0,1.0,2.0,3
2,4.0,,90
3,67.0,42.0,53


In [None]:
df5.notnull()

Unnamed: 0,a,b,c
0,True,True,True
1,False,True,True
2,True,False,True
3,True,True,True


`any()` and `all()` are methods of boolean Series. They reduce the Series to a single boolean value by repeatedly applying the operators "or" and "and", respectively.

In [None]:
df5.notnull().any()

a    True
b    True
c    True
dtype: bool

In [None]:
df5.notnull().all()

a    False
b    False
c     True
dtype: bool

In [None]:
df5.isnull().any()

a     True
b     True
c    False
dtype: bool

In [None]:
df5.dropna()

Unnamed: 0,a,b,c
0,1.0,2.0,3
3,67.0,42.0,53


In [None]:
df5

Unnamed: 0,a,b,c
0,1.0,2.0,3
1,,8.0,7
2,4.0,,90
3,67.0,42.0,53


In [None]:
df5.dropna(axis=1)

Unnamed: 0,c
0,3
1,7
2,90
3,53


In [None]:
array = np.random.randn(8,3) * 20 + 100

df6 = pd.DataFrame(array, columns=list('xyz'), index=list('abcdefgh'))
df6.iloc[2:5, 1] = np.nan
df6.iloc[1:3, 2] = np.nan
df6

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,
c,73.358187,,
d,104.300649,,58.799242
e,82.351267,,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


The `thresh` argument specifies the minimum number of non-null values required to keep a row (or a column, with `axis=1`).

In [None]:
df6.dropna(thresh=2)

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,
d,104.300649,,58.799242
e,82.351267,,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6.dropna(thresh=2, axis=1)

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,
c,73.358187,,
d,104.300649,,58.799242
e,82.351267,,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6.dropna(thresh=6, axis=1)

Unnamed: 0,x,z
a,76.452797,119.351134
b,90.190712,
c,73.358187,
d,104.300649,58.799242
e,82.351267,90.104265
f,108.385869,81.396693
g,117.942247,58.095202
h,102.463076,108.007647


### Filling in missing data

In [None]:
df6.fillna(0)

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,0.0
c,73.358187,0.0,0.0
d,104.300649,0.0,58.799242
e,82.351267,0.0,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6.fillna({'x' : 100, 'y' : 50, 'z' : 20})

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,20.0
c,73.358187,50.0,20.0
d,104.300649,50.0,58.799242
e,82.351267,50.0,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,
c,73.358187,,
d,104.300649,,58.799242
e,82.351267,,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
# Forward-fill each gap with the last valid value above it in the same column.
# ffill() is the equivalent of the deprecated fillna(method='ffill').
df6.ffill()

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,119.351134
c,73.358187,69.871955,119.351134
d,104.300649,69.871955,58.799242
e,82.351267,69.871955,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6.fillna(df6.median())

Unnamed: 0,x,y,z
a,76.452797,114.632266,119.351134
b,90.190712,69.871955,85.750479
c,73.358187,95.028381,85.750479
d,104.300649,95.028381,58.799242
e,82.351267,95.028381,90.104265
f,108.385869,95.028381,81.396693
g,117.942247,112.941936,58.095202
h,102.463076,92.616793,108.007647


In [None]:
df6.median()

x    96.326894
y    95.028381
z    85.750479
dtype: float64

# Additional References

[Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)

[What is SciPy?](https://www.scipy.org/)

[How can SciPy be fast if it is written in an interpreted language like Python?](https://www.scipy.org/scipylib/faq.html#how-can-scipy-be-fast-if-it-is-written-in-an-interpreted-language-like-python)

[What is the difference between NumPy and SciPy?](https://www.scipy.org/scipylib/faq.html#what-is-the-difference-between-numpy-and-scipy)

[Linear Algebra for AI](https://github.com/fastai/fastai/blob/master/tutorials/linalg_pytorch.ipynb)