# Review, chapter 5 of 'Python for Data Analysis'

In [None]:
# imports
# Render matplotlib figures inline, directly below the cell that produced them
%matplotlib inline
# Emit inline figures as SVG
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs: every expression statement in a cell is shown,
# not only the last one. Later cells rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension: load it only if it has not been loaded already
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2 -- presumably reloads all imported modules before running code;
# confirm against the IPython autoreload documentation
%autoreload 2

In [None]:
# Install these packages if they are not already present.
# Once installed, they stay available in this environment, so the
# lines below only need to be uncommented and run once.
# !pip install hdbscan
# !pip install folium

In [None]:
# Let's see which python we are using and where it is.
# Notice it's in the data301 environment,
# the same environment we were in when we started jupyter lab.
!python -V
!which python

## Pandas Series

A one-dimensional, labeled array

In [None]:
# Build a Series of four integers, each labelled by a single-letter index
ds = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
ds

In [None]:
#apply a function to every value in a series
def sum1(x):
    """Return ``x + 1``."""
    return x+1

#or use a lambda
f=lambda x: x+2  # anonymous function that adds 2 to its argument

ds.map(sum1)  # result of calling sum1 on each value of ds
ds.map(f)     # result of calling the lambda on each value of ds

In [None]:
# changing index
ds.index  # the current index labels
# drop=True discards the old labels instead of keeping them as a column.
# Pass the boolean True, not the string 'True': any non-empty string
# (including 'False') is truthy, so a string here hides the real intent.
# The result is not assigned, so ds itself is left unchanged.
ds.reset_index(drop=True)

In [None]:
# Got any NaNs (missing data)?
pd.isnull(ds)  # top-level function form
ds.isnull()    # method form of the same check on the Series

In [None]:
# Type `ds.` (uncomment the line below) and then press Tab to get autocomplete suggestions
# ds.

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [None]:
ds>0  # boolean Series: True where the value is > 0
ds[ds>0]  # boolean selection: keep only the elements that meet the condition (> 0)

## Pandas Dataframe
A two-dimensional, labeled array

In [None]:
# create a dataframe
# Use a seeded generator so the same 4x3 table of standard-normal values is
# produced on every run; an unseeded draw makes all later outputs change
# each time the notebook is restarted.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(
    rng.standard_normal((4, 3)),
    columns=list('bde'),
    index=['U', 'Oh', 'T', 'Or'],
)
df

### Applying a lambda or a function to various parts of a dataframe

In [None]:
#manipulate it
# Keep values greater than 0 unchanged; replace everything else with 0
f=lambda x: x if (x>0.0) else 0
df.b.apply(f)  #df.b is a series, apply lambda to each value

# f is rebound here: it now returns min minus max of whatever it is given
f=lambda x: x.min()-x.max()
df.apply(f,axis=1)  #works on each row
df.apply(f,axis=0)  #works on each column

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [None]:
df[df.b>0]   # rows where column b is > 0 (a single condition)
df[(df.b>0) & (df.e>0)]  # multiple conditions: combine with & (not && or `and`) and wrap each condition in ()'s

### Sorting
Sort your DataFrame by its index labels or by its values.

In [None]:
df.sort_index()   # sort by index (row labels)
df.sort_index(axis='columns', ascending=False)   # sort by column labels, in descending order

In [None]:
df.sort_values(by='d')  # order the rows by the values in column d

### Descriptive statistics 

Lots of these; see Table 5-8 (~p. 160) in McKinney's book.<br>
Some really useful ones:<br>
<mark>
    info<br>
    describe<br>
    nunique<br>
    value_counts<br>
    isnull
</mark>

In [None]:
df.info()  # print a concise summary of the frame

In [None]:
df.describe()  # lots of stats
# Select the 'min' row explicitly. Calling .min() on the describe() column
# would take the smallest of ALL the summary statistics (count, mean, std,
# min, ...), which is not the column minimum whenever std or mean is smaller.
df.describe().loc['min', 'b']  # pull out min value in column b

In [None]:
# how many unique rows are there?
df1 = df.copy()
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# iloc[[3]] (list indexer) keeps the row as a one-row DataFrame, so its
# index label and columns carry over unchanged.
df1 = pd.concat([df1, df1.iloc[[3]]])  # add a duplicate row
df1
df1.value_counts()   # notice the count is 2 for the duplicate row

In [None]:

df1.nunique(axis=0)  # how many unique values per column
df1.nunique(axis=1)  # how many unique values per row

In [None]:
df.count(axis=1)  # how many non-NA values per row (axis=0 gives the count per column)
df.sum()  # column totals; same as df.sum(axis=0)
df.sum(axis=1)  # row totals

In [None]:
df.isnull()  # boolean frame: True where a value is null

df.isnull().sum().sum()  # total number of null values in the whole frame (0 means none at all)
df.isnull().sum(axis=1)  # number of null values in each row (axis=1 sums across the columns)


### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. It is also used when determining which features are the most important to a model (we will get to this).</mark>

In [None]:
df.corr()   # correlation of every column with every other column (all to all)
df.cov()    # covariance of every column with every other column
df['b'].corr(df['d'])  # correlation between columns b and d only