# Review, chapter 5 of 'Python for Data Analysis'

In [2]:
# imports
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

In [3]:
#lets see which python we are using and where it is
#notice its in the data301 environment
#the same environment we were in when we started jupyter lab
!python -V
!which python

Python 3.9.7
/home/keith/anaconda3/envs/data301/bin/python


## Pandas Series

A one dimensional array

In [4]:
#create a series
ds=pd.Series([4,7,-5,3],index=['d','b','a','c'])
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Map - used for series
"The function you pass to map() should expect a single value from the Series (a point value), and return a transformed version of that value. map() returns a new Series where all the values have been transformed by your function."*<br>

*https://www.kaggle.com/residentmario/summary-functions-and-maps

In [5]:
%%time
#apply a function to every value in a series
def sum1(x):
    return x+1

#or use a lambda
f=lambda x: x+2

#use map for series
ds.map(sum1)
ds.map(f)

CPU times: user 318 µs, sys: 0 ns, total: 318 µs
Wall time: 322 µs


d    6
b    9
a   -3
c    5
dtype: int64

In [6]:
# I did not set ds equal to above maps
# so orginal series remains intact
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Reseting the Index
Use this to create a new index, you can save the old one or not<br>
Also useful to turn a pd.Series into a pd.DataFrame with the index and values being the columns<br>
Helpful for turning a groupby object into a dataframe

In [7]:
#changing index
# ds.index
# ds.reset_index()  #moves index to 'index' column
ds1 = ds.reset_index(drop='True')  # stays a pd.Series
ds1 = ds.reset_index()             # becaomes a pd.DataFrame
ds1.head()

Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


### Find NaNs

In [8]:
#got any NaN's? (missing data)
pd.isnull(ds)
ds.isnull()

d    False
b    False
a    False
c    False
dtype: bool

d    False
b    False
a    False
c    False
dtype: bool

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [9]:
ds>0  #show which are > 0
ds[ds>0] #boolean selection, return series elements that meet the condition(>0)

d     True
b     True
a    False
c     True
dtype: bool

d    4
b    7
c    3
dtype: int64

### Sorting

In [10]:
ds.sort_values()

a   -5
c    3
d    4
b    7
dtype: int64

## Pandas Dataframe
A 2 dimensional array

In [11]:
# create a dataframe
df = pd.DataFrame(np.random.randn(4,3), columns = list('bde'), index = ['U','Oh','T','Or'])
df

Unnamed: 0,b,d,e
U,-0.431735,-0.243378,1.598943
Oh,0.519031,-1.121719,0.981223
T,-0.157759,1.10355,0.596289
Or,-2.334816,-0.850763,0.278608


### Look at the first few rows and get the size of it (rows and columns)

In [12]:
df.tail(n=1)

Unnamed: 0,b,d,e
Or,-2.334816,-0.850763,0.278608


In [13]:
#how many rows and columns
df.shape

(4, 3)

### Select items by index
Pandas has its own way of indexing a dataframe;<br>
by string (like a column name), use loc<br>
by number (like a column or row location), use iloc
    

In [14]:
df

Unnamed: 0,b,d,e
U,-0.431735,-0.243378,1.598943
Oh,0.519031,-1.121719,0.981223
T,-0.157759,1.10355,0.596289
Or,-2.334816,-0.850763,0.278608


In [15]:
#first row
# df.iloc[0]
df1 = df.iloc[:, [0,2]]
df1=df.copy()
# print(id(df1.iloc[0,0]))
# print(id(df.iloc[0,0]))


# # #first row second column
# df.iloc[0,1]

df.b  #first column
# df['b']
# type(df.b.U)  #first column, row U
# df.loc['U','b']

# df.loc['Oh':'Or',:]  #last 3 rows all columns
# df.loc[:,['b','d']]  #all rows, last 2 columns

U    -0.431735
Oh    0.519031
T    -0.157759
Or   -2.334816
Name: b, dtype: float64

### Map- applying a function to a single row,column value at a time.  - the function has access to a single value 


In [32]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfm=df.copy()

In [33]:
#manipulate it
f=lambda x: x if (x>0.0) else 0

def fun(x):
    return x if (x>0.0) else 0

# df.b.map(f)  #df.b is a series, apply lambda to each value
dfm.b.map(fun) 

# f=lambda x: x.min()-x.max()
# dfm.apply(f,axis=1)  #works on each value in each row
# dfm.apply(f,axis=0)  #works on each value in each column

U     0.000000
Oh    0.519031
T     0.000000
Or    0.000000
Name: b, dtype: float64

### Apply- applying a function to an entire row (or column) at a time - the function will have access to every value in that row or column

<mark>Use this only if you need other values in a row (or column) otherwise prefer map.


In [34]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfa=df.copy()
dfa['smallest_val']=np.NaN
dfa

Unnamed: 0,b,d,e,smallest_val
U,-0.431735,-0.243378,1.598943,
Oh,0.519031,-1.121719,0.981223,
T,-0.157759,1.10355,0.596289,
Or,-2.334816,-0.850763,0.278608,


In [36]:
def fun(ser):
    '''
    find the smallest value in the series that is pas
    '''
    return min(ser[0], ser[1], ser[2])


dfa['smallest_val'] = dfa.apply(fun, axis=1)  #operate on each row
dfa

Unnamed: 0,b,d,e,smallest_val
U,-0.431735,-0.243378,1.598943,-0.431735
Oh,0.519031,-1.121719,0.981223,-1.121719
T,-0.157759,1.10355,0.596289,-0.157759
Or,-2.334816,-0.850763,0.278608,-2.334816


### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [40]:
df

Unnamed: 0,b,d,e
U,-0.238594,0.824264,-1.27694
Oh,2.500316,0.119824,0.95575
T,0.575126,0.706261,-1.351273
Or,0.017349,0.911556,-0.378077


In [45]:
# df[df.b>0.1]   #on a single condition
df[(df.b>1.1) & (df.e>0)]  #on multiple conditions (note the & not && or and, note the ()'s)
df

Unnamed: 0,b,d,e
Oh,2.500316,0.119824,0.95575


Unnamed: 0,b,d,e
U,-0.238594,0.824264,-1.27694
Oh,2.500316,0.119824,0.95575
T,0.575126,0.706261,-1.351273
Or,0.017349,0.911556,-0.378077


### Sorting
sort your dataframe

In [47]:
# df.sort_index()   #sort by index (rows)
 df.sort_index(axis='rows') 
df.sort_index(axis='columns', ascending=False)   #sort by index (columns) 

Unnamed: 0,e,d,b
U,-1.27694,0.824264,-0.238594
Oh,0.95575,0.119824,2.500316
T,-1.351273,0.706261,0.575126
Or,-0.378077,0.911556,0.017349


In [48]:
df.sort_values(by='d')

Unnamed: 0,b,d,e
Oh,2.500316,0.119824,0.95575
T,0.575126,0.706261,-1.351273
U,-0.238594,0.824264,-1.27694
Or,0.017349,0.911556,-0.378077


### Descriptive statistics 

Lots of these, see table5-8 (~p.160) in McKinney book<br>
Some really useful ones <br>
<mark>
    info<br>
    describe<br>
    nunique<br>
    value_counts<br>
    is_null

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Or
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       4 non-null      float64
 1   d       4 non-null      float64
 2   e       4 non-null      float64
dtypes: float64(3)
memory usage: 300.0+ bytes


In [50]:
df.describe() #lots of stats 
df.describe().b.min()  #pull out min value in column b

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,0.713549,0.640476,-0.512635
std,1.238678,0.35715,1.074203
min,-0.238594,0.119824,-1.351273
25%,-0.046637,0.559651,-1.295523
50%,0.296237,0.765262,-0.827508
75%,1.056424,0.846087,-0.04462
max,2.500316,0.911556,0.95575


-0.2385937739275817

In [53]:
#how many unique rows are there?
df1=df.copy()
df1=df1.append(df1.iloc[3]) #add a duplicate row
df1
df1.value_counts()   #notice the count is 2 for the duplicate row

Unnamed: 0,b,d,e
U,-0.238594,0.824264,-1.27694
Oh,2.500316,0.119824,0.95575
T,0.575126,0.706261,-1.351273
Or,0.017349,0.911556,-0.378077
Or,0.017349,0.911556,-0.378077


b          d         e        
 0.017349  0.911556  -0.378077    2
-0.238594  0.824264  -1.276940    1
 0.575126  0.706261  -1.351273    1
 2.500316  0.119824   0.955750    1
dtype: int64

In [None]:
df1.nunique(axis=0)  #how many unique values per column 
df1.nunique(axis=1)  #how many unique values per row 

In [None]:
df.count(axis=1)  #how many non na per row, axis=0 per column
df.sum()  #same as df.sum(axis=0) 
df.sum(axis=1)

### Find NaNs

In [54]:
df.isnull()  # see if null

df.isnull().sum().sum()  #any at all
df.isnull().sum(axis=1) #any null values in columns?


Unnamed: 0,b,d,e
U,False,False,False
Oh,False,False,False
T,False,False,False
Or,False,False,False


0

U     0
Oh    0
T     0
Or    0
dtype: int64

### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. Also used when determining which features are the most important to a model (we will get to this).

In [None]:
df.corr()   #all to all
df.cov()
df['b'].corr(df['d'])