# Advanced Dataframes

## Part 3 - Reshaping
- crosstab
- pivot_table

In [1]:
#standard imports
import pandas as pd
import numpy as np

# Seed NumPy's global RNG once so the randomly generated data below is
# the same on every Restart & Run All.
SEED = 1349
np.random.seed(SEED)

In [2]:
# Roster of student names; every generated column below is sized to match it.
students = [
    'Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John',
    'Thomas', 'Marie', 'Albert', 'Richard', 'Isaac', 'Alan',
]
n_students = len(students)

# One random integer score per student per subject. `high` is exclusive,
# so scores fall in 60..99. The draw order (math, english, reading, room)
# is kept fixed so a seeded run always produces the same values.
math_grades = np.random.randint(60, 100, size=n_students)
english_grades = np.random.randint(60, 100, size=n_students)
reading_grades = np.random.randint(60, 100, size=n_students)

# Randomly place each student in classroom A or classroom B.
classroom = np.random.choice(['A', 'B'], size=n_students)

df = pd.DataFrame(
    {
        'name': students,
        'math': math_grades,
        'reading': reading_grades,
        'english': english_grades,
        'room': classroom,
    }
)

# Overall grade: row-wise mean of the three subjects, rounded to one decimal.
subject_cols = ['math', 'reading', 'english']
df['overall_grade'] = df[subject_cols].mean(axis=1).round(1)

# Letter standing: the first matching condition wins, so >= 90 is 'A',
# otherwise >= 80 is 'B', and everything else is 'C'.
df['letter_standing'] = np.select(
    [df['overall_grade'] >= 90, df['overall_grade'] >= 80],
    ['A', 'B'],
    default='C',
)

df

Unnamed: 0,name,math,reading,english,room,overall_grade,letter_standing
0,Sally,78,73,84,B,78.3,C
1,Jane,77,98,79,B,84.7,B
2,Suzie,96,91,91,A,92.7,A
3,Billy,62,72,88,B,74.0,C
4,Ada,98,92,88,B,92.7,A
5,John,95,64,73,A,77.3,C
6,Thomas,87,82,80,A,83.0,B
7,Marie,99,94,85,A,92.7,A
8,Albert,91,87,77,B,85.0,B
9,Richard,84,94,61,A,79.7,C


In [None]:
# two layer groupby:
# how many students have a standing grade
# of A and are in classroom B?

In [4]:
# Keep only the rows where the student is in room B AND has an A standing,
# then count how many rows are left.
len(df.loc[(df['room'] == 'B') & (df['letter_standing'] == 'A')])

1

In [6]:
# Count the students (non-null names) in every room / letter-standing pair.
group_keys = ['room', 'letter_standing']
df.groupby(group_keys)['name'].count()

room  letter_standing
A     A                  2
      B                  2
      C                  2
B     A                  1
      B                  2
      C                  3
Name: name, dtype: int64

## Using `.crosstab`
count the number of occurrences of each combination of subgroups
- format: `pd.crosstab(index=_, columns=_)`

#### find all counts of the room and overall letter grade

In [8]:
# pd.crosstab?

In [10]:
# pd.crosstab(index=<series for the rows>, columns=<series for the columns>)
# Each cell is the number of students with that room / letter-standing pair.
pd.crosstab(index=df['room'], columns=df['letter_standing'])

letter_standing,A,B,C
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2,2,2
B,1,2,3


#### add total counts

In [11]:
# margins=True appends an "All" row and an "All" column holding the totals.
pd.crosstab(
    index=df['room'],
    columns=df['letter_standing'],
    margins=True,
)

letter_standing,A,B,C,All
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2,2,2,6
B,1,2,3,6
All,3,4,5,12


#### give percentages instead of counts

In [13]:
# Share of students in each room (proportions instead of raw counts).
df['room'].value_counts(normalize=True)

room
B    0.5
A    0.5
Name: proportion, dtype: float64

In [15]:
# normalize=True divides every cell by the grand total, so the whole table
# sums to 1; margins=True adds the "All" row/column of those proportions.
pd.crosstab(
    index=df['room'],
    columns=df['letter_standing'],
    normalize=True,
    margins=True,
)

letter_standing,A,B,C,All
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.166667,0.166667,0.166667,0.5
B,0.083333,0.166667,0.25,0.5
All,0.25,0.333333,0.416667,1.0


In [16]:
# normalize='index' divides by the row totals, so each room's row sums to 1.
pd.crosstab(
    index=df['room'],
    columns=df['letter_standing'],
    normalize='index',
)

letter_standing,A,B,C
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.333333,0.333333,0.333333
B,0.166667,0.333333,0.5


In [17]:
# normalize='columns' divides by the column totals, so each letter-standing
# column sums to 1.
pd.crosstab(
    index=df['room'],
    columns=df['letter_standing'],
    normalize='columns',
)

letter_standing,A,B,C
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.666667,0.5,0.4
B,0.333333,0.5,0.6


### Using `.pivot_table`

creates a summary table similar to excel pivot table 

need to include 3 of the following:
- which values will make up the rows (the index)
- which values will make up the columns
- the values we are aggregating
- an aggregation method (aggfunc); we can omit this, and mean will be used by default

format: `pd.pivot_table(data=_, index=_, columns=_, values=_, aggfunc=_)`

#### show the average math grade by overall letter grade and room

In [21]:
# Select the math scores of students in room A with an overall letter
# grade of B, then average them.
df.loc[
    (df['room'] == 'A') & (df['letter_standing'] == 'B'),
    'math',
].mean()

82.0

In [23]:
# Average math grade for every room (rows) / letter standing (columns) pair.
# aggfunc is omitted, so pivot_table falls back to its default: mean.
df.pivot_table(
    index='room',
    columns='letter_standing',
    values='math',
)

letter_standing,A,B,C
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,97.5,82.0,89.5
B,98.0,84.0,74.333333


#### show the MAX math grade by overall letter grade and room

In [26]:
# Highest math grade for every room / letter-standing pair.
df.pivot_table(
    # row labels, down the left side of the output
    index='room',
    # column labels, across the top of the output
    columns='letter_standing',
    # the column whose values are aggregated inside each cell
    values='math',
    # how to aggregate those values (max here; the default is mean)
    aggfunc='max',
)

letter_standing,A,B,C
room,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,99,87,95
B,98,91,83


> `index` and `columns` should be categorical values, and `values` should be numeric

### Using `.map`

Use a dictionary to assign new values based on current values

- format: `.map({'old_value':'new_value'})`

#### create new df of restaurant data

In [27]:
# Number of simulated orders.
n = 40

# 'Water' is listed twice so a uniform draw picks it about twice as often
# as 'Tea'. Drinks are drawn before meals to keep seeded runs reproducible.
drink_choices = np.random.choice(['Tea', 'Water', 'Water'], size=n)
meal_choices = np.random.choice(['Curry', 'Yakisoba Noodle', 'Pad Thai'], size=n)

df = pd.DataFrame({'drink': drink_choices, 'meal': meal_choices})

# Peek at 10 randomly chosen orders.
df.sample(n=10)

Unnamed: 0,drink,meal
32,Water,Yakisoba Noodle
27,Water,Curry
21,Water,Yakisoba Noodle
10,Tea,Pad Thai
38,Water,Pad Thai
1,Water,Curry
24,Water,Yakisoba Noodle
16,Tea,Pad Thai
34,Water,Yakisoba Noodle
13,Water,Pad Thai


#### how many unique drink values are there

In [28]:
# Number of distinct values in the drink column.
df['drink'].nunique()

2

#### what are all unique meal values

In [29]:
# Array of the distinct values in the meal column, in order of first appearance.
df['meal'].unique()

array(['Pad Thai', 'Curry', 'Yakisoba Noodle'], dtype=object)

#### create dictionary of prices for all current items

In [30]:
# Price lookup keyed by the exact strings that appear in df.meal and df.drink,
# so the same dictionary can be passed to .map() for either column.
prices = {
    "Yakisoba Noodle": 9, "Curry": 11, "Pad Thai": 10,  # meals
    "Tea": 2, "Water": 0,                               # drinks
}

In [33]:
# Show each meal next to its mapped price, side by side, to check that
# .map(prices) looked up the right value. Both columns are named 'meal'.
meal_names = df['meal']
meal_costs = meal_names.map(prices)
pd.concat([meal_names, meal_costs], axis='columns')

Unnamed: 0,meal,meal.1
0,Pad Thai,10
1,Curry,11
2,Pad Thai,10
3,Pad Thai,10
4,Curry,11
5,Curry,11
6,Curry,11
7,Pad Thai,10
8,Pad Thai,10
9,Curry,11


#### map drink prices and meal prices to create bill

In [34]:
# make a new column called bill
# assign it to the added series of:
# 1: the mapped prices of the food
# 2: the mapped prices of drinks
# New 'bill' column: the price of the meal plus the price of the drink,
# each looked up from the prices dictionary and added row by row.
df['bill'] = (
    df['meal'].map(prices)
    + df['drink'].map(prices)
)

In [36]:
# Last five orders, now including the computed bill.
df.tail(5)

Unnamed: 0,drink,meal,bill
35,Tea,Pad Thai,12
36,Water,Pad Thai,10
37,Tea,Pad Thai,12
38,Water,Pad Thai,10
39,Water,Curry,11


#### how many orders of each combination of meal and drink are there?

In [None]:
# if they were lists maybe itertools.product

In [38]:
# but its pandas!
# Count the orders for every meal (rows) / drink (columns) combination.
pd.crosstab(index=df['meal'], columns=df['drink'])

drink,Tea,Water
meal,Unnamed: 1_level_1,Unnamed: 2_level_1
Curry,1,8
Pad Thai,5,12
Yakisoba Noodle,6,8


#### what percentage of each?

In [39]:
# Same meal / drink table, with each cell as a share of all orders
# (normalize=True divides every count by the grand total).
pd.crosstab(
    index=df['meal'],
    columns=df['drink'],
    normalize=True,
)

drink,Tea,Water
meal,Unnamed: 1_level_1,Unnamed: 2_level_1
Curry,0.025,0.2
Pad Thai,0.125,0.3
Yakisoba Noodle,0.15,0.2


#### what's the average bill for each combination?

In [40]:
# Average bill for each drink (rows) / meal (columns) combination.
# aggfunc is omitted, so the default (mean) is used.
df.pivot_table(values='bill', index='drink', columns='meal')

meal,Curry,Pad Thai,Yakisoba Noodle
drink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tea,13,12,11
Water,11,10,9


#### we can also get this by using a groupby

In [42]:
# Same averages in long form: one row per (drink, meal) pair.
df.groupby(['drink', 'meal'])['bill'].mean()

drink  meal           
Tea    Curry              13.0
       Pad Thai           12.0
       Yakisoba Noodle    11.0
Water  Curry              11.0
       Pad Thai           10.0
       Yakisoba Noodle     9.0
Name: bill, dtype: float64