# Review, chapter 5 of 'Python for Data Analysis'

In [52]:
# imports
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
#lets see which python we are using and where it is
#notice its in the data301 environment
#the same environment we were in when we started jupyter lab
!python -V
!which python

Python 3.11.5


/home/keith/anaconda3/envs/p311/bin/python


## Pandas Series

A one dimensional array

In [54]:
#create a series
ds=pd.Series([4,7,-5,3],index=['d','b','a','c'])
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Map - used for series
"The function you pass to map() should expect a single value from the Series (a point value), and return a transformed version of that value. map() returns a new Series where all the values have been transformed by your function."*<br>
This means the original series is unaltered.<br>

*https://www.kaggle.com/residentmario/summary-functions-and-maps

In [57]:
%%time
#apply a function to every value in a series
def sum1(x):
    return x+1

#or use a lambda
f=lambda x: x+2

#use map for series
ds.map(sum1)
ds.map(f)

CPU times: user 595 µs, sys: 17 µs, total: 612 µs
Wall time: 623 µs


d    6
b    9
a   -3
c    5
dtype: int64

In [60]:
#do not see a difference in these 2
%timeit ds.map(sum1)

%timeit ds.map(f)

30.9 µs ± 555 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
30.4 µs ± 187 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [61]:
# I did not set ds equal to above maps
# so orginal series remains intact
ds

d    4
b    7
a   -5
c    3
dtype: int64

### Reseting the Index
Use this to create a new index, you can save the old one or not<br>
Also useful to turn a pd.Series into a pd.DataFrame with the index and values being the columns<br>
Helpful for turning a groupby object into a dataframe

In [64]:
#changing index
# ds.index
ds.reset_index()  #moves index to 'index' column
ds1 = ds.reset_index(drop='True')  # stays a pd.Series
ds1

Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


0    4
1    7
2   -5
3    3
dtype: int64

In [65]:
ds2 = ds.reset_index()             # becaomes a pd.DataFrame
ds2.head()

Unnamed: 0,index,0
0,d,4
1,b,7
2,a,-5
3,c,3


In [66]:
#dropping columns, or rows
df = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['A', 'B', 'C', 'D'])

df.drop(['A','B'],axis=1)
# df.drop(columns=['A','B'])
df.drop([0,1])

Unnamed: 0,C,D
0,2,3
1,6,7
2,10,11


Unnamed: 0,A,B,C,D
2,8,9,10,11


### Find NaNs

In [72]:
#got any NaN's? (missing data)
pd.isnull(ds)
ds.isnull()
ds.isnull().sum()

d    False
b    False
a    False
c    False
dtype: bool

d    False
b    False
a    False
c    False
dtype: bool

0

### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [14]:
ds

d    4
b    7
a   -5
c    3
dtype: int64

In [15]:
ds>0  #show which are > 0
ds[ds>0] #boolean selection, return series elements that meet the condition(>0)

d     True
b     True
a    False
c     True
dtype: bool

d    4
b    7
c    3
dtype: int64

### Sorting

In [16]:
ds.sort_values()

a   -5
c    3
d    4
b    7
dtype: int64

## Pandas Dataframe
A 2 dimensional array

In [73]:
# create a dataframe
df = pd.DataFrame(np.random.randn(4,3), columns = list('bde'), index = ['U','Oh','T','Or'])
df

Unnamed: 0,b,d,e
U,-0.618374,0.54733,-0.886016
Oh,-0.320729,1.271309,-0.133125
T,-0.64749,0.673557,-0.003218
Or,-0.365562,-0.102006,0.170648


### Look at the first few rows and get the size of it (rows and columns)

In [74]:
#how many rows and columns
df.shape

(4, 3)

### Select items by index
Pandas has its own way of indexing a dataframe;<br>
by string (like a column name), use loc<br>
by number (like a column or row location), use iloc
    

In [19]:
df

Unnamed: 0,b,d,e
U,0.551198,-1.461235,-2.316674
Oh,-0.215282,0.763288,0.395152
T,0.374409,-0.968805,-1.327059
Or,0.099456,0.710524,-0.820877


In [20]:
#first row
# df.iloc[0]

#1st and 3rd columns
df1 = df.iloc[:, [0,2]]
df1=df.copy()
# print(id(df1.iloc[0,0]))
# print(id(df.iloc[0,0]))


# # #first row second column
# df.iloc[0,1]

# df.b  #first column
# df['b']
type(df.b.U)  #first column, row U
df.loc['U','b']

df.loc['Oh':'Or',:]  #last 3 rows all columns
df3=df.loc[:,['b','d']]  #all rows, last 2 columns

numpy.float64

0.551198249752694

Unnamed: 0,b,d,e
Oh,-0.215282,0.763288,0.395152
T,0.374409,-0.968805,-1.327059
Or,0.099456,0.710524,-0.820877


Unnamed: 0,b,d
U,0.551198,-1.461235
Oh,-0.215282,0.763288
T,0.374409,-0.968805
Or,0.099456,0.710524


### Map- applying a function to a single row,column value at a time.  - the function has access to a single value 


In [75]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfm=df.copy()
dfm

Unnamed: 0,b,d,e
U,-0.618374,0.54733,-0.886016
Oh,-0.320729,1.271309,-0.133125
T,-0.64749,0.673557,-0.003218
Or,-0.365562,-0.102006,0.170648


In [77]:
#manipulate it
f=lambda x: x if (x>0.0) else 0

def fun(x):
    print(type(x))
    return x if (x>0.0) else 0

# df.b.map(f)  #df.b is a series, apply lambda to each value
dfm.b.map(fun) 

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>


U     0
Oh    0
T     0
Or    0
Name: b, dtype: int64

### Apply- applying a function to an entire row (or column) at a time - the function will have access to every value in that row or column

<mark>Use this only if you need other values in a row (or column) otherwise prefer map.


In [79]:
dfm

Unnamed: 0,b,d,e
U,-0.618374,0.54733,-0.886016
Oh,-0.320729,1.271309,-0.133125
T,-0.64749,0.673557,-0.003218
Or,-0.365562,-0.102006,0.170648


In [80]:
def fun1(ser):
    # print(type(ser))
    return ser-ser.mean()
# dfm.apply(fun1,axis=1)  ##normalize each row
dfm.apply(fun1,axis=0)  ##normalize each column

Unnamed: 0,b,d,e
U,-0.130335,-0.050217,-0.673088
Oh,0.16731,0.673761,0.079802
T,-0.159451,0.07601,0.20971
Or,0.122477,-0.699554,0.383576


In [24]:
f=lambda x: x.min()-x.max()
# the following will return series
dfm.apply(f,axis=1)  #works on each value in each row, will return 1 value per row
# dfm.apply(f,axis=0)  #works on each value in each column, use this form to normalize values (x-x.mean())

U    -2.867873
Oh   -0.978570
T    -1.701467
Or   -1.531402
dtype: float64

In [81]:
#make a copy and create a column that will hold the smallest value of the other 3 columns
dfa=df.copy()
dfa['smallest_val']=np.NaN
dfa

Unnamed: 0,b,d,e,smallest_val
U,-0.618374,0.54733,-0.886016,
Oh,-0.320729,1.271309,-0.133125,
T,-0.64749,0.673557,-0.003218,
Or,-0.365562,-0.102006,0.170648,


In [87]:
def fun(ser):
    '''
    find the smallest value in the series that is pas
    '''
    return min(ser)

%timeit dfa['smallest_val'] = dfa.apply(fun, axis=1)  #operate on each row
dfa

220 µs ± 17.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Unnamed: 0,b,d,e,smallest_val
U,-0.618374,0.54733,-0.886016,-0.886016
Oh,-0.320729,1.271309,-0.133125,-0.320729
T,-0.64749,0.673557,-0.003218,-0.64749
Or,-0.365562,-0.102006,0.170648,-0.365562


In [88]:
#whats the fastest?  See test_vectorize notebook for more details
%timeit dfa.apply(fun, axis=1);             #pass in a series
%timeit dfa.apply(fun, axis=1,raw=True);    #pass in a np.array instead of a series
%timeit dfa.min(axis=1);                    #use built in min function

165 µs ± 967 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
88.5 µs ± 1.88 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
205 µs ± 2.53 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### <mark>Boolean selection</mark>

Selecting rows based on a boolean condition

In [40]:
df

Unnamed: 0,b,d,e
U,0.551198,-1.461235,-2.316674
Oh,-0.215282,0.763288,0.395152
T,0.374409,-0.968805,-1.327059
Or,0.099456,0.710524,-0.820877


In [44]:
# df[df.b>0.1]   #on a single condition
df[(df.b>0) & (df.d>0)]  #on multiple conditions (note the & not && or and, note the ()'s)
# df

Unnamed: 0,b,d,e
Or,0.099456,0.710524,-0.820877


### Sorting
sort your dataframe

In [45]:
# df.sort_index()   #sort by index (rows)
df.sort_index(axis='rows') 
df.sort_index(axis='columns', ascending=False)   #sort by index (columns) 

Unnamed: 0,b,d,e
Oh,-0.215282,0.763288,0.395152
Or,0.099456,0.710524,-0.820877
T,0.374409,-0.968805,-1.327059
U,0.551198,-1.461235,-2.316674


Unnamed: 0,e,d,b
U,-2.316674,-1.461235,0.551198
Oh,0.395152,0.763288,-0.215282
T,-1.327059,-0.968805,0.374409
Or,-0.820877,0.710524,0.099456


In [46]:
df.sort_values(by='d')

Unnamed: 0,b,d,e
U,0.551198,-1.461235,-2.316674
T,0.374409,-0.968805,-1.327059
Or,0.099456,0.710524,-0.820877
Oh,-0.215282,0.763288,0.395152


### Descriptive statistics 

Lots of these, see table5-8 (~p.160) in McKinney book<br>
Some really useful ones <br>
<mark>
    info<br>
    describe<br>
    nunique<br>
    value_counts<br>
    is_null

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Or
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       4 non-null      float64
 1   d       4 non-null      float64
 2   e       4 non-null      float64
dtypes: float64(3)
memory usage: 300.0+ bytes


In [90]:
df.describe() #lots of stats 
# df.describe().b.min()  #pull out min value in column b

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,-0.488039,0.597547,-0.212928
std,0.168726,0.563214,0.465663
min,-0.64749,-0.102006,-0.886016
25%,-0.625653,0.384996,-0.321348
50%,-0.491968,0.610444,-0.068172
75%,-0.354354,0.822995,0.040249
max,-0.320729,1.271309,0.170648


In [35]:
df

Unnamed: 0,b,d,e
U,0.569476,0.886046,0.813224
Oh,-0.838372,-1.22553,-1.131952
T,1.217232,0.33248,-0.528114
Or,0.328523,-0.192828,-1.076879


In [51]:
df.iloc[3].to_frame()

Unnamed: 0,Or
b,0.328523
d,-0.192828
e,-1.076879


In [91]:
#how many unique rows are there?
df1=df.copy()
df1=pd.concat([df1,df1.iloc[3].to_frame().T],axis=0) #add a duplicate row
df1
# df1.value_counts()   #notice the count is 2 for the duplicate row

Unnamed: 0,b,d,e
U,-0.618374,0.54733,-0.886016
Oh,-0.320729,1.271309,-0.133125
T,-0.64749,0.673557,-0.003218
Or,-0.365562,-0.102006,0.170648
Or,-0.365562,-0.102006,0.170648


In [92]:
df1.value_counts()   #notice the count is 2 for the duplicate row

b          d          e        
-0.365562  -0.102006   0.170648    2
-0.647490   0.673557  -0.003218    1
-0.618374   0.547330  -0.886016    1
-0.320729   1.271309  -0.133125    1
Name: count, dtype: int64

In [93]:
df1.nunique(axis=0)  #how many unique values per column 
df1.nunique(axis=1)  #how many unique values per row 

b    4
d    4
e    4
dtype: int64

U     3
Oh    3
T     3
Or    3
Or    3
dtype: int64

In [96]:
df1.b.nunique()

4

In [49]:
df.count(axis=1)  #how many non na per row, axis=0 per column
df.sum()  #same as df.sum(axis=0) 
df.sum(axis=1)

U     3
Oh    3
T     3
Or    3
dtype: int64

b    0.809781
d   -0.956227
e   -4.069458
dtype: float64

U    -3.226711
Oh    0.943158
T    -1.921455
Or   -0.010897
dtype: float64

### Find NaNs

In [97]:
df.isnull()  # see if null

df.isnull().sum().sum()  #any at all
# df.isnull().sum(axis=1) #any null values in columns?


Unnamed: 0,b,d,e
U,False,False,False
Oh,False,False,False
T,False,False,False
Or,False,False,False


0

### Correlation and Covariance

Useful to see if features (columns) are related.  <br>
<mark>This information can be used to remove redundant features, which simplifies a model. Also used when determining which features are the most important to a model (we will get to this).

In [98]:
df.corr()   #all to all
df.cov()
df['b'].corr(df['d'])

Unnamed: 0,b,d,e
b,1.0,0.075322,0.486277
d,0.075322,1.0,-0.209485
e,0.486277,-0.209485,1.0


Unnamed: 0,b,d,e
b,0.028468,0.007158,0.038206
d,0.007158,0.31721,-0.054941
e,0.038206,-0.054941,0.216842


0.07532200643939645

In [50]:
df['b']

U     0.551198
Oh   -0.215282
T     0.374409
Or    0.099456
Name: b, dtype: float64