# Review, chapter 5 of 'Python for Data Analysis'

In [13]:
# imports
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

In [14]:
#lets see which python we are using and where it is
#notice its in the data301 environment
#the same environment we were in when we started jupyter lab
!python -V
!which python

Python 3.9.15
/home/keith/anaconda3/envs/data301s23/bin/python


## Pandas Series

A one dimensional array

In [15]:
#create a series
ds=pd.Series([4,7,-5,3],index=['d','b','a','c'])
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Map - used for series
"The function you pass to map() should expect a single value from the Series (a point value), and return a transformed version of that value. map() returns a new Series where all the values have been transformed by your function."*<br>
This means the original series is unaltered.<br>

*https://www.kaggle.com/residentmario/summary-functions-and-maps

In [16]:
%%time
#apply a function to every value in a series
def sum1(x):
    return x+1

#or use a lambda
f=lambda x: x+2

#use map for series
ds.map(sum1)
ds.map(f)

CPU times: user 205 µs, sys: 31 µs, total: 236 µs
Wall time: 240 µs


d    6
b    9
a   -3
c    5
dtype: int64

In [17]:
#cannot really see a difference in these 2
%time
ds.map(sum1)

# %time
# ds.map(f)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


d    5
b    8
a   -4
c    4
dtype: int64

In [18]:
# I did not set ds equal to above maps
# so orginal series remains intact
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Reseting the Index
Use this to create a new index, you can save the old one or not<br>
Also useful to turn a pd.Series into a pd.DataFrame with the index and values being the columns<br>
Helpful for turning a groupby object into a dataframe

In [19]:
#changing index
# ds.index
# ds.reset_index()  #moves index to 'index' column
ds1 = ds.reset_index(drop='True')  # stays a pd.Series
ds1

0    4
1    7
2   -5
3    3
dtype: int64

In [20]:
ds2 = ds.reset_index()             # becaomes a pd.DataFrame
ds2.head()

Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


In [21]:
#dropping columns, or rows
df = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['A', 'B', 'C', 'D'])

df.drop(['A','B'],axis=1)
# df.drop(columns=['A','B'])
df.drop([0,1])

Unnamed: 0,C,D
0,2,3
1,6,7
2,10,11


Unnamed: 0,A,B,C,D
2,8,9,10,11


### Find NaNs

In [22]:
#got any NaN's? (missing data)
pd.isnull(ds)
ds.isnull()

d    False
b    False
a    False
c    False
dtype: bool

d    False
b    False
a    False
c    False
dtype: bool

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [23]:
ds>0  #show which are > 0
ds[ds>0] #boolean selection, return series elements that meet the condition(>0)

d     True
b     True
a    False
c     True
dtype: bool

d    4
b    7
c    3
dtype: int64

### Sorting

In [24]:
ds.sort_values()

a   -5
c    3
d    4
b    7
dtype: int64

## Pandas Dataframe
A 2 dimensional array

In [25]:
# create a dataframe
df = pd.DataFrame(np.random.randn(4,3), columns = list('bde'), index = ['U','Oh','T','Or'])
df

Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196


### Look at the first few rows and get the size of it (rows and columns)

In [26]:
df.tail(n=1)

Unnamed: 0,b,d,e
Or,-0.216719,0.453543,-0.759196


In [27]:
#how many rows and columns
df.shape

(4, 3)

### Select items by index
Pandas has its own way of indexing a dataframe;<br>
by string (like a column name), use loc<br>
by number (like a column or row location), use iloc
    

In [28]:
df

Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196


In [29]:
#first row
# df.iloc[0]
df1 = df.iloc[:, [0,2]]
df1=df.copy()
# print(id(df1.iloc[0,0]))
# print(id(df.iloc[0,0]))


# # #first row second column
# df.iloc[0,1]

df.b  #first column
# df['b']
# type(df.b.U)  #first column, row U
# df.loc['U','b']

# df.loc['Oh':'Or',:]  #last 3 rows all columns
# df.loc[:,['b','d']]  #all rows, last 2 columns

U     0.510980
Oh   -0.227593
T    -0.598520
Or   -0.216719
Name: b, dtype: float64

### Map- applying a function to a single row,column value at a time.  - the function has access to a single value 


In [37]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfm=df.copy()
dfm

Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196


In [33]:
#manipulate it
f=lambda x: x if (x>0.0) else 0

def fun(x):
    return x if (x>0.0) else 0

# df.b.map(f)  #df.b is a series, apply lambda to each value
dfm.b.map(fun) 

U     0.51098
Oh    0.00000
T     0.00000
Or    0.00000
Name: b, dtype: float64

U    -2.103315
Oh   -1.066428
T    -1.162478
Or   -1.212738
dtype: float64

### Apply- applying a function to an entire row (or column) at a time - the function will have access to every value in that row or column

<mark>Use this only if you need other values in a row (or column) otherwise prefer map.


In [60]:
def fun1(ser):return ser-ser.mean()
dfm.apply(fun1,axis=1)  ##normalize each row
dfm.apply(fun1,axis=0)  ##normalize each column

Unnamed: 0,b,d,e
U,-0.003516,1.053416,-1.0499
Oh,-0.169997,0.618213,-0.448215
T,-0.662022,0.500457,0.161565
Or,-0.042595,0.627667,-0.585072


Unnamed: 0,b,d,e
U,0.643943,0.781404,-0.141567
Oh,-0.09463,-0.225891,-0.111975
T,-0.465557,-0.222549,0.618902
Or,-0.083756,-0.332965,-0.36536


In [62]:
f=lambda x: x.min()-x.max()
# the following will return series
dfm.apply(f,axis=1)  #works on each value in each row, will return 1 value per row
# dfm.apply(f,axis=0)  #works on each value in each column, use this form to normalize values (x-x.mean())

U    -2.103315
Oh   -1.066428
T    -1.162478
Or   -1.212738
dtype: float64

In [63]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfa=df.copy()
dfa['smallest_val']=np.NaN
dfa

Unnamed: 0,b,d,e,smallest_val
U,0.51098,1.567912,-0.535403,
Oh,-0.227593,0.560617,-0.505811,
T,-0.59852,0.563959,0.225066,
Or,-0.216719,0.453543,-0.759196,


In [65]:
def fun(ser):
    '''
    find the smallest value in the series that is pas
    '''
    return min(ser)

dfa['smallest_val'] = dfa.apply(fun, axis=1)  #operate on each row
dfa

Unnamed: 0,b,d,e,smallest_val
U,0.51098,1.567912,-0.535403,-0.535403
Oh,-0.227593,0.560617,-0.505811,-0.505811
T,-0.59852,0.563959,0.225066,-0.59852
Or,-0.216719,0.453543,-0.759196,-0.759196


### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [66]:
df

Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196


In [67]:
# df[df.b>0.1]   #on a single condition
df[(df.b>1.1) & (df.e>0)]  #on multiple conditions (note the & not && or and, note the ()'s)
df

Unnamed: 0,b,d,e


Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196


### Sorting
sort your dataframe

In [69]:
# df.sort_index()   #sort by index (rows)
df.sort_index(axis='rows') 
df.sort_index(axis='columns', ascending=False)   #sort by index (columns) 

Unnamed: 0,b,d,e
Oh,-0.227593,0.560617,-0.505811
Or,-0.216719,0.453543,-0.759196
T,-0.59852,0.563959,0.225066
U,0.51098,1.567912,-0.535403


Unnamed: 0,e,d,b
U,-0.535403,1.567912,0.51098
Oh,-0.505811,0.560617,-0.227593
T,0.225066,0.563959,-0.59852
Or,-0.759196,0.453543,-0.216719


In [70]:
df.sort_values(by='d')

Unnamed: 0,b,d,e
Or,-0.216719,0.453543,-0.759196
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
U,0.51098,1.567912,-0.535403


### Descriptive statistics 

Lots of these, see table5-8 (~p.160) in McKinney book<br>
Some really useful ones <br>
<mark>
    info<br>
    describe<br>
    nunique<br>
    value_counts<br>
    is_null

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Or
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       4 non-null      float64
 1   d       4 non-null      float64
 2   e       4 non-null      float64
dtypes: float64(3)
memory usage: 300.0+ bytes


In [72]:
df.describe() #lots of stats 
df.describe().b.min()  #pull out min value in column b

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,-0.132963,0.786508,-0.393836
std,0.464534,0.523454,0.427827
min,-0.59852,0.453543,-0.759196
25%,-0.320325,0.533848,-0.591351
50%,-0.222156,0.562288,-0.520607
75%,-0.034794,0.814947,-0.323092
max,0.51098,1.567912,0.225066


-0.5985198960584742

In [82]:
#how many unique rows are there?
df1=df.copy()
df1=df1.append(df1.iloc[3]) #add a duplicate row
df1
df1.value_counts()   #notice the count is 2 for the duplicate row

  df1=df1.append(df1.iloc[3]) #add a duplicate row


Unnamed: 0,b,d,e
U,0.51098,1.567912,-0.535403
Oh,-0.227593,0.560617,-0.505811
T,-0.59852,0.563959,0.225066
Or,-0.216719,0.453543,-0.759196
Or,-0.216719,0.453543,-0.759196


b          d         e        
-0.216719  0.453543  -0.759196    2
-0.598520  0.563959   0.225066    1
-0.227593  0.560617  -0.505811    1
 0.510980  1.567912  -0.535403    1
dtype: int64

In [83]:
df1.nunique(axis=0)  #how many unique values per column 
df1.nunique(axis=1)  #how many unique values per row 

b    4
d    4
e    4
dtype: int64

U     3
Oh    3
T     3
Or    3
Or    3
dtype: int64

In [None]:
df.count(axis=1)  #how many non na per row, axis=0 per column
df.sum()  #same as df.sum(axis=0) 
df.sum(axis=1)

### Find NaNs

In [84]:
df.isnull()  # see if null

df.isnull().sum().sum()  #any at all
df.isnull().sum(axis=1) #any null values in columns?


Unnamed: 0,b,d,e
U,False,False,False
Oh,False,False,False
T,False,False,False
Or,False,False,False


0

U     0
Oh    0
T     0
Or    0
dtype: int64

### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. Also used when determining which features are the most important to a model (we will get to this).

In [85]:
df.corr()   #all to all
df.cov()
df['b'].corr(df['d'])

Unnamed: 0,b,d,e
b,1.0,0.899335,-0.567069
d,0.899335,1.0,-0.150945
e,-0.567069,-0.150945,1.0


Unnamed: 0,b,d,e
b,0.215792,0.218684,-0.112699
d,0.218684,0.274004,-0.033804
e,-0.112699,-0.033804,0.183036


0.8993347093582175