# Andrew Podhorecki
## 11/17/2020 WK13
### Python Pandas Grouping and Mapping

In [1]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build the example frame: one row per town, with its name and altitude.
df = pd.DataFrame(
    data={
        'City': ['Alma', 'Brian Head', 'Fox Park', 'Lake Mary'],
        'Altitude': [10355, 9800, 9062, 8966],
    },
)

df

Unnamed: 0,City,Altitude
0,Alma,10355
1,Brian Head,9800
2,Fox Park,9062
3,Lake Mary,8966


In [4]:
# Lookup table: town name -> two-letter state abbreviation.
st_map = {
    'Alma': 'CO',
    'Brian Head': 'UT',
    'Fox Park': 'WY',
    'Lake Mary': 'CA',
}

st_map

{'Alma': 'CO', 'Brian Head': 'UT', 'Fox Park': 'WY', 'Lake Mary': 'CA'}

In [5]:
# Look each City value up in st_map and store the result as a new 'State' column.
df['State'] = df['City'].map(st_map)

In [6]:
# Display the frame, which now carries the mapped 'State' column.
df

Unnamed: 0,City,Altitude,State
0,Alma,10355,CO
1,Brian Head,9800,UT
2,Fox Park,9062,WY
3,Lake Mary,8966,CA


In [7]:
# apply a function to dataframe

def meter(x, feet_per_meter=3.2808):
    """Divide ``x`` by a feet-per-meter factor (i.e. convert feet to meters).

    Parameters
    ----------
    x : number, numpy array or pandas Series
        Length(s) to convert, assumed to be expressed in feet. Anything that
        supports true division by a float works, so the function can be used
        element-wise through ``Series.apply`` or on a whole column at once.
    feet_per_meter : float, optional
        Conversion factor. Defaults to 3.2808, the rounded value this
        notebook has always used; pass ``1 / 0.3048`` for the exact factor.

    Returns
    -------
    Same kind of object as ``x``
        ``x / feet_per_meter``.
    """
    return x / feet_per_meter

In [9]:
# Pass the function object itself (no parentheses) to .apply(), which calls it
# on the 'Altitude' column and returns the converted values.
# NOTE: the result overwrites the same column it was read from, so running this
# cell a second time converts the already-converted values again.

df['Altitude'] = df['Altitude'].apply(meter)

In [10]:
# Display the frame after 'Altitude' has been passed through meter().
df

Unnamed: 0,City,Altitude,State
0,Alma,3156.24238,CO
1,Brian Head,2987.076323,UT
2,Fox Park,2762.131188,WY
3,Lake Mary,2732.870032,CA


In [11]:
# Draw the values from a seeded generator so that re-running the notebook
# always produces the same frame instead of new numbers on every run.
# NOTE: this rebinds `df`; the city/altitude frame is no longer reachable.
rng = np.random.default_rng(42)
df = DataFrame(
    rng.standard_normal((4, 3)),
    columns=list('bde'),
    index=['UT', 'OH', 'TX', 'OR'],
)

In [12]:
# Display the 4x3 frame of random values (columns b/d/e, state-code index).
df

Unnamed: 0,b,d,e
UT,0.143008,0.424562,1.842612
OH,-0.161691,0.594121,0.489201
TX,-0.916113,-0.369746,1.693674
OR,1.936249,0.393999,-0.430631


In [13]:
def dif(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` may be any object exposing ``.max()`` and ``.min()`` (for example
    a pandas Series or a numpy array).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [14]:
# Spread (max minus min) of each column: with axis=0, every column is
# handed to dif as a whole and one number per column comes back.

df.apply(dif, axis=0)

b    2.852361
d    0.963867
e    2.273243
dtype: float64

In [15]:
# Same spread, but computed across the columns of each row,
# giving one number per index label.

df.apply(dif, axis='columns')

UT    1.699605
OH    0.755811
TX    2.609786
OR    2.366880
dtype: float64

In [16]:
# A pre-defined function works with .apply() too: count how often each value
# occurs, column by column.
# The top-level pd.value_counts helper is deprecated in recent pandas releases;
# the Series method is the supported spelling and builds the same table.
df.apply(pd.Series.value_counts)

Unnamed: 0,b,d,e
-0.916113,1.0,,
-0.430631,,,1.0
-0.369746,,1.0,
-0.161691,1.0,,
0.143008,1.0,,
0.393999,,1.0,
0.424562,,1.0,
0.489201,,,1.0
0.594121,,1.0,
1.693674,,,1.0


In [17]:
# group by

# Two key columns (K1, K2) plus two columns of random data to aggregate.
# The data comes from a seeded generator so that re-running the notebook
# always produces the same frame instead of new numbers on every run.
rng = np.random.default_rng(42)
df = DataFrame({
    'K1': ['a', 'a', 'b', 'b', 'c'],
    'K2': ['one', 'two', 'one', 'two', 'one'],
    'dataset1': rng.standard_normal(5),
    'dataset2': rng.standard_normal(5),
})

In [18]:
# Display the grouping example frame (keys K1/K2, data in dataset1/dataset2).
df

Unnamed: 0,K1,K2,dataset1,dataset2
0,a,one,-0.445545,1.914505
1,a,two,0.146189,-1.479429
2,b,one,-0.550533,-1.305074
3,b,two,-1.703287,1.387333
4,c,one,1.884028,-0.871182


In [20]:
# Per-K1 tally of entries in each of the other columns.
df.groupby(by='K1').count()

Unnamed: 0_level_0,K2,dataset1,dataset2
K1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,2,2
b,2,2,2
c,1,1,1


In [21]:
# Number of rows in each K1 group, returned as a single Series.
df.groupby(by='K1').size()

K1
a    2
b    2
c    1
dtype: int64

In [22]:
# Tally of entries in the data columns for every (K1, K2) pair.
df.groupby(by=['K1', 'K2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
K1,K2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1,1
a,two,1,1
b,one,1,1
b,two,1,1
c,one,1,1


In [23]:
# Show the ungrouped frame again as a reference for the mean() examples that follow.
df

Unnamed: 0,K1,K2,dataset1,dataset2
0,a,one,-0.445545,1.914505
1,a,two,0.146189,-1.479429
2,b,one,-0.550533,-1.305074
3,b,two,-1.703287,1.387333
4,c,one,1.884028,-0.871182


In [25]:
# The method that ends the chain decides what is displayed
# (e.g. mean() shows averages, count() shows tallies).
# Here: average of dataset1 within each K1 group.

df.groupby('K1')['dataset1'].mean()

K1
a   -0.149678
b   -1.126910
c    1.884028
Name: dataset1, dtype: float64

In [28]:
# Average of every numeric column within each K1 group.
# numeric_only=True leaves out the string column K2; without it pandas 2.x
# tries to average K2 as well and raises a TypeError.
df.groupby('K1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
K1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.149678,0.217538
b,-1.12691,0.041129
c,1.884028,-0.871182


In [29]:
# Average of the data columns within each (K1, K2) pair.
df.groupby(by=['K1', 'K2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
K1,K2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.445545,1.914505
a,two,0.146189,-1.479429
b,one,-0.550533,-1.305074
b,two,-1.703287,1.387333
c,one,1.884028,-0.871182
