# Review, chapter 5 of 'Python for Data Analysis'

In [2]:
# imports
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

In [3]:
#lets see which python we are using and where it is
#notice its in the data301 environment
#the same environment we were in when we started jupyter lab
!python -V
!which python

Python 3.11.9


~/anaconda3/envs/p311/bin/python


## Pandas Series

A one dimensional array

In [4]:
#create a series
ds=pd.Series([4,7,-5,3],index=['d','b','a','c'])
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Map - used for series
"The function you pass to map() should expect a single value from the Series (a point value), and return a transformed version of that value. map() returns a new Series where all the values have been transformed by your function."*<br>
This means the original series is unaltered.<br>

*https://www.kaggle.com/residentmario/summary-functions-and-maps

In [5]:
%%time
#apply a function to every value in a series
def sum1(x):
    return x+1

#or use a lambda
sum2=lambda x: x+2

#use map for series
# ds.map(sum1)
ds.map(sum2)

CPU times: user 37 μs, sys: 94 μs, total: 131 μs
Wall time: 137 μs


d    6
b    9
a   -3
c    5
dtype: int64

In [6]:
#do not see a difference in these 2
%timeit ds.map(sum1)
%timeit ds.map(sum2)

24.9 μs ± 139 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
25.9 μs ± 147 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [7]:
# I did not set ds equal to above maps
# so orginal series remains intact
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Reseting the Index
Use this to create a new index, you can save the old one or not<br>
Also useful to turn a pd.Series into a pd.DataFrame with the index and values being the columns<br>
Helpful for turning a groupby object into a dataframe

In [8]:
ds.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
#changing index
# ds.index
ds.reset_index()  #moves index to 'index' column, returns a dataframe
ds.reset_index(drop='True')  # drop original index, stays a pd.Series


Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


0    4
1    7
2   -5
3    3
dtype: int64

In [10]:
ds2 = ds.reset_index()             # becaomes a pd.DataFrame
ds2.head()

Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


In [11]:
#dropping columns, or rows
df = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['A', 'B', 'C', 'D'])

df.drop(['A','B'],axis=1)   #columns
# df.drop(columns=['A','B'])
df.drop([0,1])              #rows

Unnamed: 0,C,D
0,2,3
1,6,7
2,10,11


Unnamed: 0,A,B,C,D
2,8,9,10,11


### Find NaNs

In [12]:
#got any NaN's? (missing data)
pd.isnull(ds)
ds.isnull()
ds.isnull().sum()

d    False
b    False
a    False
c    False
dtype: bool

d    False
b    False
a    False
c    False
dtype: bool

0

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [13]:
ds>0  #show which are > 0
ds[ds>0] #boolean selection, return series elements that meet the condition(>0)

d     True
b     True
a    False
c     True
dtype: bool

d    4
b    7
c    3
dtype: int64

### Sorting

In [14]:
ds.sort_values()

a   -5
c    3
d    4
b    7
dtype: int64

## Pandas Dataframe
A 2 dimensional array

In [15]:
# create a dataframe
df = pd.DataFrame(np.arange(1,13).reshape(4,3), columns = list('bde'), index = ['U','Oh','T','Or'])
df


Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


### Look at the first few rows and get the size of it (rows and columns)

In [16]:
#how many rows and columns
df.shape

(4, 3)

### Select items by index
Pandas has its own way of indexing a dataframe;<br>
by string (like a column name), use loc<br>
by number (like a column or row location), use iloc
    

In [31]:
#first row
# df.iloc[0]

#1st and 3rd columns
df1 = df.iloc[:, [0,2]]
df1
df1=df.copy()

#are they different (copy or a slice?)
#id is guarenteed unique for each object but the same id can be reused 
#after the original object is deleted
print(id(df1.iloc[0,0]))
print(id(df.iloc[0,0]))
print(hex(id(df1))) #can yiurn it into hex number but it is not the real address
print(hex(id(df)))

# # #first row second column
df.iloc[0,1]

df.b  #first column, (pandas series)
df['b']
type(df.b.U)  #first column, row U

#using loc
df.loc['U','b']
# df.loc['Oh':'Or',:]  #last 3 rows all columns
# df3=df.loc[:,['b','d']]  #all rows, last 2 columns

Unnamed: 0,b,e
U,1,3
Oh,4,6
T,7,9
Or,10,12


140459818991888
140459818991824
0x7fbf599e1950
0x7fbf599b2410


2

U      1
Oh     4
T      7
Or    10
Name: b, dtype: int64

U      1
Oh     4
T      7
Or    10
Name: b, dtype: int64

numpy.int64

1

### Map- applying a function to a single row,column value at a time.  - the function has access to a single value 


In [68]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfm=df.copy()
dfm

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


In [56]:
#manipulate it
f=lambda x: x if (x%2==0) else 0

def fun(x):
    print(type(x))
    return x if (x>0.0) else 0

dfm.b=dfm.b.map(f)  #df.b is a series, apply lambda to each value
dfm
# dfm.b.map(fun) 

dfm=dfm.map(f)  #map lambda to each value in dfm
dfm

Unnamed: 0,b,d,e
U,0,2,3
Oh,4,5,6
T,0,8,9
Or,10,11,12


Unnamed: 0,b,d,e
U,0,2,0
Oh,4,0,6
T,0,8,0
Or,10,0,12


### Apply- applying a function to an entire row (or column) at a time - the function will have access to every value in that row or column

<mark>Use this only if you need other values in a row (or column) otherwise prefer map.


In [45]:
dfm

Unnamed: 0,b,d,e
U,0,2,3
Oh,4,5,6
T,0,8,9
Or,10,11,12


In [67]:
def fun1(ser):
    # print(type(ser))
    return ser-ser.mean()
dfm.apply(fun1,axis=1)  ##normalize each row
# dfm.apply(fun1,axis=0)  ##normalize each column

#and but original dfm is unchanged
dfm

#change to dfm stick below
dfm=dfm.apply(fun1,axis=0)
dfm

Unnamed: 0,b,d,e
U,0.0,0.0,0.0
Oh,0.0,0.0,0.0
T,0.0,0.0,0.0
Or,0.0,0.0,0.0


Unnamed: 0,b,d,e
U,-4.5,-4.5,-4.5
Oh,-1.5,-1.5,-1.5
T,1.5,1.5,1.5
Or,4.5,4.5,4.5


Unnamed: 0,b,d,e
U,-4.5,-4.5,-4.5
Oh,-1.5,-1.5,-1.5
T,1.5,1.5,1.5
Or,4.5,4.5,4.5


In [76]:
dfm

f=lambda x: x.max()-x.min() #lambda function to find range of values in a series

# the following will return series
dfm.apply(f,axis=1)  #works on each value in each row, will return 1 value per row
dfm.apply(f,axis=0)  #works on each value in each column, will return 1 value per column
dfm

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


U     2
Oh    2
T     2
Or    2
dtype: int64

b    9
d    9
e    9
dtype: int64

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


In [77]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfa=df.copy()
dfa['smallest_val']=np.NaN
dfa

Unnamed: 0,b,d,e,smallest_val
U,1,2,3,
Oh,4,5,6,
T,7,8,9,
Or,10,11,12,


In [79]:
def fun(ser):
    '''
    find the smallest value in the series that is pas
    '''
    return min(ser)

%time dfa['smallest_val'] = dfa.apply(fun, axis=1)  #operate on each row
dfa

CPU times: user 597 μs, sys: 0 ns, total: 597 μs
Wall time: 599 μs


Unnamed: 0,b,d,e,smallest_val
U,1,2,3,1.0
Oh,4,5,6,4.0
T,7,8,9,7.0
Or,10,11,12,10.0


In [80]:
#whats the fastest?  See test_vectorize notebook for more details
%timeit dfa.apply(fun, axis=1);             #pass in a series
%timeit dfa.apply(fun, axis=1,raw=True);    #pass in a np.array instead of a series
%timeit dfa.min(axis=1);                    #use built in min function

#note that operating on raw numpy arrays gives roughly a 2x speedup

171 μs ± 311 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
98.3 μs ± 240 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
199 μs ± 652 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [81]:
df

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


In [83]:
# df[df.b>0.1]   #on a single condition
df[(df.b>4) & (df.d>8)]  #on multiple conditions (note the & not && or and, note the ()'s)
# df

Unnamed: 0,b,d,e
Or,10,11,12


### Sorting
sort your dataframe

In [86]:
df

# df.sort_index()   #sort by index (rows)
df.sort_index(axis='index') 
df.sort_index(axis='columns', ascending=False)   #sort by index (columns) 

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


Unnamed: 0,b,d,e
Oh,4,5,6
Or,10,11,12
T,7,8,9
U,1,2,3


Unnamed: 0,e,d,b
U,3,2,1
Oh,6,5,4
T,9,8,7
Or,12,11,10


In [88]:
df.sort_values(by='d',ascending=False)  #sort by values in column 'd'

Unnamed: 0,b,d,e
Or,10,11,12
T,7,8,9
Oh,4,5,6
U,1,2,3


### Descriptive statistics 

Lots of these, see table5-8 (~p.160) in McKinney book<br>
Some really useful ones <br>
<mark>
    info<br>
    describe<br>
    nunique<br>
    value_counts<br>
    is_null

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Or
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   b       4 non-null      int64
 1   d       4 non-null      int64
 2   e       4 non-null      int64
dtypes: int64(3)
memory usage: 300.0+ bytes


In [90]:
df.describe() #lots of stats 
# df.describe().b.min()  #pull out min value in column b

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,5.5,6.5,7.5
std,3.872983,3.872983,3.872983
min,1.0,2.0,3.0
25%,3.25,4.25,5.25
50%,5.5,6.5,7.5
75%,7.75,8.75,9.75
max,10.0,11.0,12.0


In [95]:
df
df.iloc[3,:].to_frame()
df.index
df.columns
# df.iloc[3].to_frame()

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12


Unnamed: 0,Or
b,10
d,11
e,12


Index(['U', 'Oh', 'T', 'Or'], dtype='object')

Index(['b', 'd', 'e'], dtype='object')

In [97]:
#how many unique rows are there?
df1=df.copy()
df1=pd.concat([df1,df1.iloc[3].to_frame().T],axis=0) #add a duplicate row
df1
# df1.value_counts()   #notice the count is 2 for the duplicate row

Unnamed: 0,b,d,e
U,1,2,3
Oh,4,5,6
T,7,8,9
Or,10,11,12
Or,10,11,12


b   d   e 
10  11  12    2
1   2   3     1
4   5   6     1
7   8   9     1
Name: count, dtype: int64

In [99]:
df1.nunique(axis=0)  #how many unique values per column 
df1.nunique(axis=1)  #how many unique values per row 

b    4
d    4
e    4
dtype: int64

U     3
Oh    3
T     3
Or    3
Or    3
dtype: int64

In [100]:
df1.b.nunique()

4

In [101]:
df.count(axis=1)  #how many non na per row, axis=0 per column
df.sum()  #same as df.sum(axis=0) 
df.sum(axis=1)

U     3
Oh    3
T     3
Or    3
dtype: int64

b    22
d    26
e    30
dtype: int64

U      6
Oh    15
T     24
Or    33
dtype: int64

### Find NaNs

In [102]:
df.isnull()  # see if null

df.isnull().sum().sum()  #any at all
# df.isnull().sum(axis=1) #any null values in columns?


Unnamed: 0,b,d,e
U,False,False,False
Oh,False,False,False
T,False,False,False
Or,False,False,False


0

In [104]:
df.isnull().any(axis=1)  #any null values in rows?
df.isnull().any(axis=0)  #any null values in columns?
df.isnull().any().sum()  #how many columns have null values

U     False
Oh    False
T     False
Or    False
dtype: bool

b    False
d    False
e    False
dtype: bool

0

### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. Also used when determining which features are the most important to a model (we will get to this).

In [98]:
df.corr()   #all to all
df.cov()
df['b'].corr(df['d'])

Unnamed: 0,b,d,e
b,1.0,0.075322,0.486277
d,0.075322,1.0,-0.209485
e,0.486277,-0.209485,1.0


Unnamed: 0,b,d,e
b,0.028468,0.007158,0.038206
d,0.007158,0.31721,-0.054941
e,0.038206,-0.054941,0.216842


0.07532200643939645

In [50]:
df['b']

U     0.551198
Oh   -0.215282
T     0.374409
Or    0.099456
Name: b, dtype: float64