# Review, chapter 5 of 'Python for Data Analysis'

In [54]:
# imports
# Render matplotlib figures inline in the notebook, using the SVG format
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs
# ('all' shows the value of every bare expression in a cell, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
# Load it only when it is not already loaded, so re-running this cell is harmless
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload imported modules before each cell is executed
%autoreload 2

In [55]:
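# Install these packages if they are not already present.
# Once installed, a package stays available in this environment (it does not need reinstalling per session).
# !pip install hdbscan
# !pip install folium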

In [56]:
# Let's see which Python we are using and where it is.
# Notice it's in the data301 environment,
# the same environment we were in when we started Jupyter Lab.
!python -V
!which python

Python 3.9.7
/home/keith/anaconda3/envs/data301/bin/python


## Pandas Series

A one-dimensional labeled array: each value is paired with an index label.

In [57]:
# Build a series from label -> value pairs; the labels become the index
# (insertion order of the dict is preserved)
ds = pd.Series({'d': 4, 'b': 7, 'a': -5, 'c': 3})
ds

d    4
b    7
a   -5
c    3
dtype: int64

In [58]:
# a named function that can be applied to every value in a series
def sum1(x):
    """Return ``x`` incremented by one."""
    incremented = x + 1
    return incremented

# the same idea as an anonymous function: add 2 to the value
f = lambda x: x + 2

# Series.map calls the function once per element and returns a new series
ds.map(sum1)
ds.map(f)

d    5
b    8
a   -4
c    4
dtype: int64

d    6
b    9
a   -3
c    5
dtype: int64

In [59]:
# changing index
ds.index  # the current labels
# drop=True discards the old labels instead of moving them into a column.
# Pass a real boolean: the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have behaved the same way).
ds.reset_index(drop=True)

Index(['d', 'b', 'a', 'c'], dtype='object')

0    4
1    7
2   -5
3    3
dtype: int64

In [60]:
# Got any NaNs? (missing data)
# Both calls return a boolean series that is True where a value is missing
pd.isnull(ds)
ds.isnull()

d    False
b    False
a    False
c    False
dtype: bool

d    False
b    False
a    False
c    False
dtype: bool

In [None]:
# Type `ds.` in a cell and press Tab to get autocomplete suggestions.
# The incomplete expression is kept as a comment: left as code, a bare `ds.`
# is a SyntaxError that stops Restart & Run All at this cell.
# ds.

### <mark>Boolean selection</mark>

Selecting the elements of a Series that satisfy a boolean condition

In [61]:
ds > 0  # boolean mask: True where the value is positive
ds.loc[ds > 0]  # boolean selection: keep only the elements whose mask entry is True

d     True
b     True
a    False
c     True
dtype: bool

d    4
b    7
c    3
dtype: int64

## Pandas Dataframe
A two-dimensional labeled table: rows share an index, and each column is a Series.

In [62]:
# create a dataframe
# 4 rows x 3 columns of standard-normal draws. A seeded generator makes the
# values identical on every Restart & Run All, so outputs further down stay
# consistent with this frame.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((4, 3)),
    columns=list('bde'),
    index=['U', 'Oh', 'T', 'Or'],
)
df

Unnamed: 0,b,d,e
U,-1.304911,-1.031287,-1.022411
Oh,-0.335794,-1.282856,0.374975
T,0.300866,-0.74848,-0.016884
Or,-0.273379,0.559436,0.248281


### Applying a lambda or a function to various parts of a dataframe

In [76]:
# element-wise: keep positive values, replace everything else with 0
f = lambda x: x if x > 0.0 else 0
df['b'].apply(f)  # df['b'] is a series, so the lambda receives one value at a time

# row/column-wise: here the lambda receives a whole series and reduces it
f = lambda x: x.min() - x.max()
df.apply(f, axis='columns')  # one result per row
df.apply(f, axis='index')  # one result per column

U     0.000000
Oh    0.000000
T     0.300866
Or    0.000000
Name: b, dtype: float64

U    -0.282501
Oh   -1.657831
T    -1.049346
Or   -0.832815
dtype: float64

b   -1.605777
d   -1.842291
e   -1.397386
dtype: float64

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [46]:
df.loc[df['b'] > 0]  # rows that satisfy a single condition
# rows that satisfy several conditions: combine masks with & (not && or `and`)
# and wrap each comparison in parentheses
df.loc[(df['b'] > 0) & (df['e'] > 0)]

Unnamed: 0,b,d,e
Oh,0.34865,-0.795806,-1.007826
T,0.391178,1.018067,1.507522
Or,0.483166,0.455755,-0.511769


Unnamed: 0,b,d,e
T,0.391178,1.018067,1.507522


### Sorting
Sort a DataFrame by its index labels or by its values.

In [50]:
df.sort_index(axis=0)  # order the rows by their index labels
df.sort_index(axis=1, ascending=False)  # order the columns by name, descending

Unnamed: 0,b,d,e
Oh,0.34865,-0.795806,-1.007826
Or,0.483166,0.455755,-0.511769
T,0.391178,1.018067,1.507522
U,-0.985685,1.496711,0.239664


Unnamed: 0,e,d,b
U,0.239664,1.496711,-0.985685
Oh,-1.007826,-0.795806,0.34865
T,1.507522,1.018067,0.391178
Or,-0.511769,0.455755,0.483166


In [77]:
df.sort_values('d')  # order the rows by the values in column d (ascending)

Unnamed: 0,b,d,e
Oh,-0.335794,-1.282856,0.374975
U,-1.304911,-1.031287,-1.022411
T,0.300866,-0.74848,-0.016884
Or,-0.273379,0.559436,0.248281


### Descriptive statistics 

Lots of these; see Table 5-8 (~p. 160) in the McKinney book.<br>
Some really useful ones:<br>
<mark>
    describe<br>
    nunique<br>
    value_counts
</mark>

In [92]:
df.describe()  # lots of stats: one summary row per statistic, one column per numeric column
# Pull out the min value in column b by selecting the 'min' row of the summary.
# (describe().b.min() takes the smallest of ALL summary rows for b - count,
# mean, std, ... - which is not the column minimum whenever e.g. std is smaller.)
df.describe().loc['min', 'b']

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,-0.403305,-0.625797,-0.104009
std,0.66588,0.819751,0.633663
min,-1.304911,-1.282856,-1.022411
25%,-0.578074,-1.094179,-0.268265
50%,-0.304587,-0.889884,0.115699
75%,-0.129818,-0.421501,0.279955
max,0.300866,0.559436,0.374975


-1.3049113670372299

In [107]:
# Uses df rather than df1: df1 is only created in a later cell, so on a fresh
# kernel (Restart & Run All) referencing it here raised NameError.
df.nunique(axis=0)  # how many unique values per column
df.nunique(axis=1)  # how many unique values per row

b    4
d    4
e    4
dtype: int64

In [104]:
# how many unique rows are there?
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported way to add rows and returns a new frame (df is left untouched).
# iloc[[3]] (list indexer) keeps the row as a one-row DataFrame with its label.
df1 = pd.concat([df, df.iloc[[3]]])  # add a duplicate of the last row
df1
df1.value_counts()  # notice the count is 2 for the duplicate row

Unnamed: 0,b,d,e
U,-1.304911,-1.031287,-1.022411
Oh,-0.335794,-1.282856,0.374975
T,0.300866,-0.74848,-0.016884
Or,-0.273379,0.559436,0.248281
Or,-0.273379,0.559436,0.248281


b          d          e        
-0.273379   0.559436   0.248281    2
-1.304911  -1.031287  -1.022411    1
-0.335794  -1.282856   0.374975    1
 0.300866  -0.748480  -0.016884    1
dtype: int64

In [93]:
df.count(axis='columns')  # non-NA values per row (axis='index' would count per column)
df.sum()  # column totals, same as df.sum(axis=0)
df.sum(axis='columns')  # row totals

U     3
Oh    3
T     3
Or    3
dtype: int64

b   -1.613219
d   -2.503187
e   -0.416038
dtype: float64

U    -3.358610
Oh   -1.243675
T    -0.464497
Or    0.534338
dtype: float64

### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. It is also used when determining which features are the most important to a model (we will get to this).</mark>

In [110]:
df.corr()  # correlation of every column with every other column
df.cov()  # covariance of every column with every other column
df.b.corr(df.d)  # correlation between just two columns

Unnamed: 0,b,d,e
b,1.0,0.237448,0.764317
d,0.237448,1.0,0.298099
e,0.764317,0.298099,1.0


Unnamed: 0,b,d,e
b,0.443396,0.129613,0.322499
d,0.129613,0.671992,0.154847
e,0.322499,0.154847,0.401529


0.23744819441726706