# Grouping for Aggregation, Filtration, and Transformation

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 15, 'display.max_rows', 10, 'display.max_colwidth', 12)

## Introduction

* 데이터분석의 기본 : Split - Apply - Combine (분할 - 적용 - 병합)
* Pandas에서 제공하는 .groupby 메서드를 활용하면 데이터를 다양한 방식으로 그룹화하고 각 그룹에 독립적인 함수를 적용한 다음 단일 데이터셋을 반환할 수 있음
---
df.groupby(['list', 'of', 'grouping', 'columns'])  
df.groupby('single_column')   

In [2]:
# .groupby 메서드 호출 결과는 groupby 객체, 다른 메서드 처럼 직관적이지는 않으며
# 메서드를 체인시켜야 그룹화 결과를 확인할 수 있다.
flights = pd.read_csv('data/flights.csv')
flights.groupby('MONTH')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A2C0EBF370>

### Defining an Aggregation

* .groupby 메서드의 가장 일반적인 용도는 집계(aggregation)을 수행하는 것
* 집계 : 많은 입력 시퀀스를 요약하거나 결합해 단일값으로 출력하는것 
* ex) 열 전체 합산, 최대값 찾기, 분산 및 평균 구하기 등등

In [3]:
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [4]:
# .agg 메서드 : groupby 객체로 집계를 수행하기 위한 메서드 
# 집계 열과 집계 함수 쌍의 딕셔너리 형태 사용 예시
(flights
     .groupby('AIRLINE')
     .agg({'ARR_DELAY':'mean'})
)

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.034580
...,...
OO,7.593463
UA,7.765755
US,1.681105
VX,5.348884


In [5]:
# 집계 열을 인덱스 연산자에, 집계 함수를 .agg내에 str 형태로 전달하는 형태 예시 
(flights
     .groupby('AIRLINE')
     ['ARR_DELAY']
     .agg('mean')
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

In [6]:
# 집계 열을 인덱스 연산자에, 집계 함수를 .agg에 집계함수를 직접 전달하는 형태 예시 
(flights
    .groupby('AIRLINE')
    ['ARR_DELAY']
    .agg(np.mean)
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

In [7]:
# 집계 열을 인덱스 연산자에 넣고, .agg를 생략하고 집계함수를 직접 사용 가능
# 대부분의 집계함수를 이렇게 사용 가능하다
(flights
    .groupby('AIRLINE')
    ['ARR_DELAY']
    .mean()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

### There's more...

In [8]:
# .agg에 집계함수가 아닌 함수는 사용이 불가능한 예시
(flights
   .groupby('AIRLINE')
   ['ARR_DELAY']
   .agg(np.sqrt)
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: Must produce aggregated value

## Grouping and aggregating with multiple columns and functions

#1 요일별 모든 항공사의 취소된 항공편 수   

In [9]:
flights['CANCELLED'].unique() # 취소된거는 1인가 보다

array([0, 1], dtype=int64)

In [10]:
(flights
    .groupby(['AIRLINE', 'WEEKDAY']) # '요일별' '항공사' 그룹화 
    ['CANCELLED']  # 취소 항공편을 집계할 열로 지정
    .agg('sum') # 취소 항공편이 1 의 값을 가지니 그 값들의 합을 모으면 원하는 결과 도출
)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64

#2 요일별 모든 항공사의 취소 또는 우회한 항공편의 수와 비율   

In [11]:
(flights
    .groupby(['AIRLINE', 'WEEKDAY']) # 요일별 모든 항공사 그룹화
    ['CANCELLED', 'DIVERTED'] # 취소와 우회여부를 대해 집계할 열로 지정 
    .agg(['sum', 'mean']) # sum으로 취소, 우회 횟수를 구하고, mean으로 비율 구하여 결과 도출 
)

  (flights


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
...,...,...,...,...,...
WN,3,18,0.014118,2,0.001569
WN,4,10,0.007911,4,0.003165
WN,5,7,0.005828,0,0.000000
WN,6,10,0.010132,3,0.003040


#3 각 출발지와 도착지에 대해 총 항공편 수, 취소된 항공편의 수와 비율, 비행시간의 평균과 분산

In [12]:
(flights
    .groupby(['ORG_AIR', 'DEST_AIR']) # 출발지, 도착지 기준 그룹화
    .agg({'CANCELLED':['sum', 'mean', 'size'], # 그룹의 취소 열에 대한 합계, 평균, 크기(몇개의 행이 있는지)
          'AIR_TIME':['mean', 'var']}) # 비행시간 열에 대해 평균과 분산 계산
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [13]:
# 앞선 방법으로 다중 집계를 시도하면 결과 컬럼이 계층 구조로 만들어짐
# 이를 피하려면 다음과 같이 NamedAgg를 사용하는 방법도 가능
# '새로운 집계 열 정의' = pd.NamedAgg(column='집계 열', aggfunc='집계 함수')  
(flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg(sum_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='sum'),
         mean_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='mean'),
         size_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='size'),
         mean_air_time=pd.NamedAgg(column='AIR_TIME', aggfunc='mean'),
         var_air_time=pd.NamedAgg(column='AIR_TIME', aggfunc='var'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_cancelled,mean_cancelled,size_cancelled,mean_air_time,var_air_time
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [14]:
# 혹은 계층 구조로 만들어진 컬럼을 .to_flat_index 메서드를 사용해 펼칠수 도 있음
res = (flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
print(res.columns.to_flat_index()) 
res.columns = ['_'.join(x) for x in
    res.columns.to_flat_index()]
res

Index([ ('CANCELLED', 'sum'), ('CANCELLED', 'mean'), ('CANCELLED', 'size'),
        ('AIR_TIME', 'mean'),   ('AIR_TIME', 'var')],
      dtype='object')


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,CANCELLED_size,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [15]:
# 이것도 따로 처리하지 않고 한번에 체인 메서드로 묶어주고 싶다면
# pipe 메서드 사용가능
# pipe 메서드 -> DataFrame이나 Series를 넣으면 DataFrame이나 Series를 리턴하는 function을 적용시키는 메서드
def flatten_cols(df):
    df.columns = ['_'.join(x) for x in
        df.columns.to_flat_index()]
    return df

In [16]:
(flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
    .pipe(flatten_cols)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,CANCELLED_size,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [17]:
# 그룹화 하는 컬럼이 category 타입이라면 그룹화 하는 과정에서 모든 조합을 다 출력함
# ex) 아래 예시에선 ATL에서 출발한 비행기가 ABI에 도착하는 노선이 없지만 
# 출발 노선을 카테고리 타입으로 바꿔서 모든 조합을 확인하는 인덱스를 가지는 것을 볼수 있음
# 이는 카디널리티가 높은(unique 수가 많은) 카테고리 타입일수록 양이 폭발적으로 늘어남
res = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABI,0,,0,,
ATL,ABQ,0,0.0,16,170.500000,87.866667
ATL,ABR,0,,0,,
ATL,ABY,0,0.0,19,28.578947,6.590643
...,...,...,...,...,...,...
SFO,TYS,0,,0,,
SFO,VLD,0,,0,,
SFO,VPS,0,,0,,
SFO,XNA,0,0.0,2,173.500000,0.500000


In [18]:
# 이를 막기 위해 observe=True 매개변수를 사용하면 
# 일반적인 문자열 형태에 대한 그룹화처럼 작동하여 실제로 관측이 가능한 값만 표시된다.
res = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=True)
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
LAX,ABQ,1,0.018182,55,89.259259,29.403215
LAX,ANC,0,0.000000,7,307.428571,78.952381
LAX,ASE,1,0.038462,26,102.920000,102.243333
LAX,ATL,0,0.000000,174,224.201149,127.155837
LAX,AUS,0,0.000000,80,150.537500,57.897310
...,...,...,...,...,...,...
MSP,TTN,1,0.125000,8,124.428571,57.952381
MSP,TUL,0,0.000000,18,91.611111,63.075163
MSP,TUS,0,0.000000,2,176.000000,32.000000
MSP,TVC,0,0.000000,5,56.600000,10.300000


## Removing the MultiIndex after grouping

In [19]:
# 요렇게 다중 인덱스가 발생하면 처리하기 어려움
# Descriptive한 인덱스를 만들어야한다
flights = pd.read_csv('data/flights.csv')
airline_info = (flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg({'DIST':['sum', 'mean'],
          'ARR_DELAY':['min', 'max']}) 
    .astype(int)
)
airline_info

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
...,...,...,...,...,...
WN,3,997213,782,-38,262
WN,4,1024854,810,-52,284
WN,5,981036,816,-44,244
WN,6,823946,834,-41,290


In [20]:
# get_level_values 메서드로 한 레벨씩 읽어낼수 있음
airline_info.columns.get_level_values(0)

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [21]:
airline_info.columns.get_level_values(1)

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [22]:
# 앞에서 했던 내용 반복
# 다층 구조로 된 열을 풀 떄는 to_flat_index() 쓰면 좋다
airline_info.columns.to_flat_index()

Index([('DIST', 'sum'), ('DIST', 'mean'), ('ARR_DELAY', 'min'),
       ('ARR_DELAY', 'max')],
      dtype='object')

In [23]:
airline_info.columns = ['_'.join(x) for x in
    airline_info.columns.to_flat_index()]
airline_info

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
...,...,...,...,...,...
WN,3,997213,782,-38,262
WN,4,1024854,810,-52,284
WN,5,981036,816,-44,244
WN,6,823946,834,-41,290


In [24]:
# 다층 구조의 인덱스를 풀어버리는 제일 쉬운 방법은 reset_index() 메서드를 사용하는 것
airline_info.reset_index()

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
...,...,...,...,...,...,...
93,WN,3,997213,782,-38,262
94,WN,4,1024854,810,-52,284
95,WN,5,981036,816,-44,244
96,WN,6,823946,834,-41,290


In [25]:
# .NamedAgg를 사용할 수 도 있다. 열은 이걸로 풀고 인덱스는 reset_index로 풀어준것에 주목
(flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg(dist_sum=pd.NamedAgg(column='DIST', aggfunc='sum'),
         dist_mean=pd.NamedAgg(column='DIST', aggfunc='mean'),
         arr_delay_min=pd.NamedAgg(column='ARR_DELAY', aggfunc='min'),
         arr_delay_max=pd.NamedAgg(column='ARR_DELAY', aggfunc='max'))
    .astype(int)
    .reset_index()
)

Unnamed: 0,AIRLINE,WEEKDAY,dist_sum,dist_mean,arr_delay_min,arr_delay_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
...,...,...,...,...,...,...
93,WN,3,997213,782,-38,262
94,WN,4,1024854,810,-52,284
95,WN,5,981036,816,-44,244
96,WN,6,823946,834,-41,290


In [26]:
# 멀티 인덱스를 처음부터 안 만드는 방법은 as_index 매개변수를 False로 설정하면된다.
(flights
    .groupby(['AIRLINE'], as_index=False)
    ['DIST']
    .agg('mean')
    .round(0) # 정수레벨로 반올림
)

Unnamed: 0,AIRLINE,DIST
0,AA,1114.0
1,AS,1066.0
2,B6,1772.0
3,DL,866.0
4,EV,460.0
...,...,...
9,OO,511.0
10,UA,1231.0
11,US,1181.0
12,VX,1240.0


## Grouping with a custom aggregation function

* 이미 만들어진 집계함수 외에 다른 집계를 수행해야 하는 경우가 발생할 수 있다.
* college 데이터셋 에서 주(state)당 학부생 수의 평균과 표준편차를 계산하고자 하고, 이 정보를 사용해 각 주별 평균들에서의 최대표준편차를 찾는 예시

In [27]:
# 각 주 별로 학부생 수의 평균과 표준편차를 찾음
college = pd.read_csv('data/college.csv')

In [28]:
(college
    .groupby('STABBR') # 아마도 주 
    ['UGDS'] # 아마도 Undergraduated students? 학부생 수
    .agg(['mean', 'std'])
    .round(0) 
)
# 요건 주 별 전체기관들의 학부생 수 평균과 표준편차
# 찾고자 하는건 각 기관들의 편차를 해당 주의 표준편차와 비교했을때 최고 비율을 구하고자 하는것
# 각각의 주를 표준화 시켜서 어떤 주가 최대 편차값을 가지고 있는지 비교하고자한다는것

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0
...,...,...
VT,1513.0,2194.0
WA,2271.0,4124.0
WI,2655.0,4615.0
WV,1758.0,5957.0


In [29]:
# 그래서 각 주별로 평균에서의 최대 표준 편차를 구하는 집계 함수 정의
def max_deviation(s):
    std_score = (s - s.mean()) / s.std()
    return std_score.abs().max()

In [30]:
# 요렇게 정의한 집계함수를 써먹을 수 있다는 것
(college
    .groupby('STABBR')
    ['UGDS']
    .agg(max_deviation)
    .round(1)
)

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
     ... 
VT    3.8
WA    6.6
WI    5.8
WV    7.2
WY    2.8
Name: UGDS, Length: 59, dtype: float64

In [31]:
# 여러개의 열에다가 적용시켜도 잘 작동한다. 
# 다만 AK 주의 UGDS 처럼 수치적으로 의미가 있어야 잘동작하고 아니면 NaN
(college
    .groupby('STABBR')
    ['UGDS', 'SATVRMID', 'SATMTMID']
    .agg(max_deviation)
    .round(1)
)

  (college


Unnamed: 0_level_0,UGDS,SATVRMID,SATMTMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4
...,...,...,...
VT,3.8,1.9,1.9
WA,6.6,2.2,2.0
WI,5.8,2.4,2.2
WV,7.2,1.7,2.1


In [32]:
college[college['STABBR'] == 'AK'].loc[:,['UGDS','SATVRMID','SATMTMID']]

Unnamed: 0,UGDS,SATVRMID,SATMTMID
60,12865.0,,
61,27.0,,
62,5536.0,,
63,1428.0,,
64,275.0,555.0,503.0
65,889.0,,
66,3256.0,,
67,479.0,,
5171,109.0,,
5417,68.0,,


In [33]:
# 기존의 집계 함수들과도 같이 쓸수 있다.
(college
    .groupby(['STABBR', 'RELAFFIL']) 
    ['UGDS', 'SATVRMID', 'SATMTMID'] 
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
)

  (college


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,mean,std,max_deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0
...,...,...,...,...,...,...,...,...,...,...
WI,0,5.3,2879.1,5031.5,1.3,558.8,47.5,1.3,591.2,85.7
WI,1,3.4,1716.2,1934.6,2.1,500.1,66.0,1.8,526.6,42.5
WV,0,6.9,1873.9,6271.7,1.6,466.7,27.9,1.8,480.0,27.7
WV,1,1.3,716.4,503.6,1.9,485.7,14.6,1.7,484.8,17.7


In [34]:
# 함수 이름을 열의 이름으로 사용하게 되는데
max_deviation.__name__

'max_deviation'

In [35]:
# 그래서 rename메서드나 내부메서드를 통해 함수 이름을 바꿔주면 
# 출력 열의 이름도 바뀌는 것을 확인할 수 있다.
max_deviation.__name__ = 'Max Deviation!!!!'
(college
    .groupby(['STABBR', 'RELAFFIL']) 
    ['UGDS', 'SATVRMID', 'SATMTMID'] 
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
)

  (college


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max Deviation!!!!,mean,std,Max Deviation!!!!,mean,std,Max Deviation!!!!,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0
...,...,...,...,...,...,...,...,...,...,...
WI,0,5.3,2879.1,5031.5,1.3,558.8,47.5,1.3,591.2,85.7
WI,1,3.4,1716.2,1934.6,2.1,500.1,66.0,1.8,526.6,42.5
WV,0,6.9,1873.9,6271.7,1.6,466.7,27.9,1.8,480.0,27.7
WV,1,1.3,716.4,503.6,1.9,485.7,14.6,1.7,484.8,17.7


## Customizing aggregating functions with *args and **kwargs

In [36]:
# 직접 커스텀한 집계함수에 매개변수를 필요로 할 수 도 있음
# 아래 함수는 학부생 비율이 1000~3000사이인 학교의 비율을 반환하는 함수
def pct_between_1_3k(s):
    return (s
        .between(1_000, 3_000)
        .mean()
        * 100
    )

In [37]:
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between_1_3k)
    .round(1)
)

STABBR  RELAFFIL
AK      0           14.3
        1            0.0
AL      0           23.6
        1           33.3
AR      0           27.9
                    ... 
WI      0           13.8
        1           36.0
WV      0           24.6
        1           37.5
WY      0           54.5
Name: UGDS, Length: 112, dtype: float64

In [38]:
# 학부생 비율의 하한과 상한을 사용자가 지정할 수 있게 만든 함수
def pct_between(s, low, high):
    return s.between(low, high).mean() * 100

In [39]:
# .agg 메서드의 구조는 agg(func, *arg, **karg)
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between, 1_000, 10_000)
    .round(1)
)

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64

In [40]:
# 매개변수를 직접 열에 출력하고 싶다면 다음과 같이 파이썬의 클로져 기능을 사용할 수 있음
def between_n_m(n, m): 
    def wrapper(ser): # pct_between을 커버하는 wrapper 함수 정의
        return pct_between(ser, n, m)
    # 함수의 name 속성을 함수 이름을 매개변수를 포함하도록 지정
    wrapper.__name__ = f'between_{n}_{m}'
    # wrapper를 리턴 -> wrapper는 pct_between을 return 하므로 사실상 pct_between을 return하는것이다.
    return wrapper

In [41]:
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg([between_n_m(1_000, 10_000), 'max', 'mean'])
    .round(1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,between_1000_10000,max,mean
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,42.9,12865.0,3508.9
AK,1,0.0,275.0,123.3
AL,0,45.8,29851.0,3248.8
AL,1,37.5,3033.0,979.7
AR,0,39.7,21405.0,1793.7
...,...,...,...,...
WI,0,31.0,29302.0,2879.1
WI,1,44.0,8212.0,1716.2
WV,0,29.2,44924.0,1873.9
WV,1,37.5,1375.0,716.4


## Examining the groupby object

### How to do it...

In [42]:
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [43]:
# groupby 객체의 속성 종류
print([attr for attr in dir(grouped) if not
    attr.startswith('_')])

['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'ewm', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sample', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'value_counts', 'var']


In [44]:
# groupby로 만들어진 그룹 개수
grouped.ngroups

112

In [45]:
# .groups를 통해서 각 group들을 딕셔너리 형태로 추출할 수 있다.
groups = list(grouped.groups)
len(groups[:]) # 요건 키 값들 나온거임

112

In [46]:
grouped.groups[('IL', 1)]

Int64Index([ 986,  992, 1004, 1009, 1019, 1024, 1030, 1031, 1034, 1038, 1043,
            1047, 1052, 1064, 1077, 1078, 1082, 1083, 1085, 1088, 1089, 1091,
            1092, 1095, 1096, 1097, 1105, 1106, 1107, 1114, 1119, 1129, 1131,
            1132, 1133, 1134, 1140, 1145, 1150, 1153, 1154, 1159, 1161, 1828,
            4321, 4369, 5427, 5482, 5606, 5684, 6624, 6650, 7278, 7349, 7477,
            7478, 7479],
           dtype='int64')

In [47]:
grouped.get_group(('AK', 0)) # get_group 메서드는 단일 그룹을 데이터프레임 형태로 추출

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,Universi...,Anchorage,AK,0.0,0.0,0.0,0,...,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,Universi...,Fairbanks,AK,0.0,0.0,0.0,0,...,0.3887,1,0.2263,0.255,0.4519,36200,19355
63,Universi...,Juneau,AK,0.0,0.0,0.0,0,...,0.5112,1,0.1769,0.1996,0.555,37400,16875
65,AVTEC-Al...,Seward,AK,0.0,0.0,0.0,0,...,0.6817,1,0.0737,0.0664,0.7127,33500,PrivacyS...
66,Charter ...,Anchorage,AK,0.0,0.0,0.0,0,...,0.0,1,0.8307,0.7503,0.5472,39200,13875
67,Alaska C...,Anchorage,AK,0.0,0.0,0.0,0,...,0.0,1,0.7078,0.786,0.5612,28700,8994
5171,Ilisagvi...,Barrow,AK,0.0,0.0,0.0,0,...,0.6239,1,0.1323,0.0,0.6498,24900,PrivacyS...


In [48]:
# 꿀팁? 하나의 셀에서 dataframe 테이블을 여러개 출력하고 싶다면 display를 활용
from IPython.display import display
for name, group in grouped:
    print(name)
    display(group.head(3))

('AK', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,Universi...,Anchorage,AK,0.0,0.0,0.0,0,...,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,Universi...,Fairbanks,AK,0.0,0.0,0.0,0,...,0.3887,1,0.2263,0.255,0.4519,36200,19355.0
63,Universi...,Juneau,AK,0.0,0.0,0.0,0,...,0.5112,1,0.1769,0.1996,0.555,37400,16875.0


('AK', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
61,Alaska B...,Palmer,AK,0.0,0.0,0.0,1,...,0.1481,1,0.3571,0.2857,0.4286,,PrivacyS...
64,Alaska P...,Anchorage,AK,0.0,0.0,0.0,1,...,0.3745,1,0.3152,0.5297,0.491,47000.0,23250
5417,Alaska C...,Soldotna,AK,0.0,0.0,0.0,1,...,0.0735,1,0.8868,0.6792,0.2264,,PrivacyS...


('AL', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama ...,Normal,AL,1.0,0.0,0.0,0,...,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,Universi...,Birmingham,AL,0.0,0.0,0.0,0,...,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
3,Universi...,Huntsville,AL,0.0,0.0,0.0,0,...,0.2146,1,0.3072,0.4596,0.264,45500,24097.0


('AL', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2,Amridge ...,Montgomery,AL,0.0,0.0,0.0,1,...,0.4536,1,0.6801,0.7795,0.854,40100,23370
10,Birmingh...,Birmingham,AL,0.0,0.0,0.0,1,...,0.0017,1,0.192,0.4809,0.0152,44200,27000
12,Concordi...,Selma,AL,1.0,0.0,0.0,1,...,0.1056,1,0.8667,0.9333,0.2367,19900,PrivacyS...


('AR', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
128,Universi...,Little Rock,AR,0.0,0.0,0.0,0,...,0.4126,1,0.3941,0.4775,0.4062,33900,21736
129,Universi...,Little Rock,AR,0.0,0.0,0.0,0,...,0.2433,1,0.3944,0.6144,0.5133,61400,12500
130,ABC Beau...,Arkadelphia,AR,0.0,0.0,0.0,0,...,0.2105,1,0.9815,1.0,0.4688,PrivacyS...,16500


('AR', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
131,Arkansas...,Little Rock,AR,1.0,0.0,0.0,1,...,0.1127,1,0.8306,0.8695,0.2833,22000,38000.0
134,Lyon Col...,Batesville,AR,0.0,0.0,0.0,1,...,0.0101,1,0.4578,0.674,0.0524,38600,25000.0
144,Baptist ...,Little Rock,AR,0.0,0.0,0.0,1,...,0.0848,1,0.5033,0.7266,0.3791,43200,13393.5


('AS', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4138,American...,Pago Pago,AS,0.0,0.0,0.0,0,...,0.4389,1,0.7245,0.0,0.1774,19800,PrivacyS...


('AZ', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
69,Collins ...,Phoenix,AZ,0.0,0.0,0.0,0,...,0.3373,0,0.7205,0.8228,0.4764,25700,47000
71,Empire B...,Tucson,AZ,0.0,0.0,0.0,0,...,0.2222,1,0.7962,0.6615,0.4229,18200,9833
72,Thunderb...,Glendale,AZ,0.0,0.0,0.0,0,...,1.0,0,0.0,0.0,0.0,118900,PrivacyS...


('AZ', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
68,Everest ...,Phoenix,AZ,0.0,0.0,0.0,1,...,0.4749,0,0.8291,0.7151,0.67,28600,9500
70,Empire B...,Phoenix,AZ,0.0,0.0,0.0,1,...,0.16,0,0.6349,0.5873,0.4651,17800,9588
73,American...,Phoenix,AZ,0.0,0.0,0.0,1,...,0.0787,0,0.75,0.5375,0.4684,PrivacyS...,PrivacyS...


('CA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
192,Academy ...,San Fran...,CA,0.0,0.0,0.0,0,...,0.4334,1,0.4008,0.5524,0.4043,36000.0,35093
193,ITT Tech...,Rancho C...,CA,0.0,0.0,0.0,0,...,0.254,0,0.7137,0.7667,0.7235,38800.0,25827.5
194,Academy ...,Oakland,CA,0.0,0.0,0.0,0,...,,1,,,,,PrivacyS...


('CA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
200,American...,Berkeley,CA,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...
210,Azusa Pa...,Azusa,CA,0.0,0.0,0.0,1,...,0.0875,1,0.2898,0.5172,0.1467,50000,22500
214,Bethesda...,Anaheim,CA,0.0,0.0,0.0,1,...,0.1609,1,0.3686,0.2078,0.4672,PrivacyS...,PrivacyS...


('CO', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
565,The Salo...,Grand Ju...,CO,0.0,0.0,0.0,0,...,0.0,1,0.3806,0.4925,0.2778,PrivacyS...,9570
566,Adams St...,Alamosa,CO,0.0,0.0,0.0,0,...,0.1474,1,0.5175,0.6305,0.2106,32800,16255
567,Aims Com...,Greeley,CO,0.0,0.0,0.0,0,...,0.5246,1,0.4105,0.3197,0.3941,31400,8773


('CO', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
575,Colorado...,Lakewood,CO,0.0,0.0,0.0,1,...,0.5835,1,0.3087,0.5214,0.45,36900.0,25808
589,Prince I...,Westminster,CO,0.0,0.0,0.0,1,...,,0,0.6923,0.9487,0.8824,33400.0,20992
592,Denver S...,Littleton,CO,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...


('CT', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
629,Paul Mit...,Danbury,CT,0.0,0.0,0.0,0,...,0.2774,1,0.4078,0.5475,0.2913,19000,10486
630,Asnuntuc...,Enfield,CT,0.0,0.0,0.0,0,...,0.4795,1,0.379,0.095,0.3959,30900,5500
631,Branford...,Branford,CT,0.0,0.0,0.0,0,...,0.3002,1,0.7103,0.7351,0.5725,27900,9800


('CT', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
628,Albertus...,New Haven,CT,0.0,0.0,0.0,1,...,0.0825,1,0.442,0.8002,0.5133,52100.0,27763.5
645,Fairfiel...,Fairfield,CT,0.0,0.0,0.0,1,...,0.0406,1,0.1322,0.5092,0.0604,68500.0,26852.5
652,Holy Apo...,Cromwell,CT,0.0,0.0,0.0,1,...,0.3621,1,0.1379,0.2241,0.7241,,PrivacyS...


('DC', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
698,Universi...,Washington,DC,1.0,0.0,0.0,0,...,0.5554,1,0.5401,0.4832,0.5662,34800,22393.5
700,Gallaude...,Washington,DC,0.0,0.0,0.0,0,...,0.05,1,0.5278,0.388,0.2451,26000,17750.0
701,George W...,Washington,DC,0.0,0.0,0.0,0,...,0.0666,1,0.142,0.4003,0.0783,65400,25350.0


('DC', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
696,American...,Washington,DC,0.0,0.0,0.0,1,...,0.0293,1,0.1666,0.5213,0.0252,55900.0,24589
697,Catholic...,Washington,DC,0.0,0.0,0.0,1,...,0.053,1,0.1252,0.5615,0.094,53900.0,26000
699,Pontific...,Washington,DC,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...


('DE', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
685,Margaret...,Lewes,DE,0.0,0.0,0.0,0,...,0.0,1,0.4545,0.4727,0.4909,PrivacyS...,PrivacyS...
686,Dawn Car...,Wilmington,DE,0.0,0.0,0.0,0,...,0.0,1,0.6677,0.7505,0.6003,22400,9500
688,Delaware...,Dover,DE,0.0,0.0,0.0,0,...,0.5365,1,0.4606,0.2013,0.4075,30700,8000


('DE', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
687,Delaware...,Georgetown,DE,0.0,0.0,0.0,1,...,0.5327,1,0.4674,0.1411,0.3561,28800,6750
689,Delaware...,Wilmington,DE,0.0,0.0,0.0,1,...,0.6461,1,0.3511,0.1504,0.3842,34000,7508
694,Wesley C...,Dover,DE,0.0,0.0,0.0,1,...,0.0342,1,0.501,0.8632,0.1319,41600,31000


('FL', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
709,Wyotech-...,Ormond B...,FL,0.0,0.0,0.0,0,...,0.0,1,0.6402,0.6978,0.598,31800,11600
710,The Art ...,Fort Lau...,FL,0.0,0.0,0.0,0,...,0.3692,1,0.6263,0.7942,0.4132,28800,29983
711,Atlantic...,Coconut ...,FL,0.0,0.0,0.0,0,...,0.0,1,0.1363,0.0,0.5044,31900,PrivacyS...


('FL', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Bapt...,Graceville,FL,0.0,0.0,0.0,1,...,0.2291,1,0.5878,0.5602,0.3531,30800.0,20052
713,Barry Un...,Miami,FL,0.0,0.0,0.0,1,...,0.1518,1,0.5045,0.6733,0.4361,44100.0,28250
714,Gooding ...,Panama City,FL,0.0,0.0,0.0,1,...,,0,,,,,PrivacyS...


('FM', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4214,College ...,Pohnpei,FM,0.0,0.0,0.0,0,...,0.3157,1,0.8778,0.0,0.1631,15700,PrivacyS...


('GA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
860,Abraham ...,Tifton,GA,0.0,0.0,0.0,0,...,0.2441,1,0.4629,0.4493,0.1523,32000,15085.5
862,Interact...,Chamblee,GA,0.0,0.0,0.0,0,...,0.0174,1,0.562,0.4236,0.7937,21100,7376.0
863,Interact...,Morrow,GA,0.0,0.0,0.0,0,...,0.0,1,0.6129,0.6452,0.7778,21100,7376.0


('GA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
785,Luther R...,Lithonia,GA,0.0,0.0,0.0,1,...,0.8224,1,0.5832,0.5362,0.8748,39400,29500
861,Agnes Sc...,Decatur,GA,0.0,0.0,1.0,1,...,0.005,1,0.4404,0.647,0.0459,38800,27000
867,Andrew C...,Cuthbert,GA,0.0,0.0,0.0,1,...,0.0034,1,0.6139,0.8544,0.0095,27500,12875


('GU', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4139,Guam Com...,Mangilao,GU,0.0,0.0,0.0,0,...,0.5453,1,0.6354,0.0,0.3058,22000,PrivacyS...
4140,Universi...,Mangilao,GU,0.0,0.0,0.0,0,...,0.2265,1,0.5269,0.276,0.2064,29900,PrivacyS...


('GU', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
5289,Pacific ...,Mangilao,GU,0.0,0.0,0.0,1,...,0.1846,1,0.973,0.0,0.2533,PrivacyS...,PrivacyS...


('HI', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
951,Universi...,Hilo,HI,0.0,0.0,0.0,0,...,0.1733,1,0.4706,0.4515,0.269,33500,19197
952,Universi...,Honolulu,HI,0.0,0.0,0.0,0,...,0.1601,1,0.3015,0.3746,0.1755,43000,19000
953,Hawaii I...,Honolulu,HI,0.0,0.0,0.0,0,...,0.0,1,0.8112,0.3776,0.5529,17300,5868


('HI', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
949,Heald Co...,Honolulu,HI,0.0,0.0,0.0,1,...,0.2198,0,0.7283,0.692,0.5262,35000,11676
950,Chaminad...,Honolulu,HI,0.0,0.0,0.0,1,...,0.1765,1,0.4641,0.4425,0.3237,38400,22000
3805,Brigham ...,Laie,HI,0.0,0.0,0.0,1,...,0.0523,1,0.2415,0.1749,0.2224,41500,8291


('IA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1247,Allen Co...,Waterloo,IA,0.0,0.0,0.0,0,...,0.3588,1,0.3317,0.8065,0.3945,49100,17090.5
1248,AIB Coll...,Des Moines,IA,0.0,0.0,0.0,0,...,0.249,1,0.4132,0.7125,0.3209,37000,19732.5
1251,Capri Co...,Dubuque,IA,0.0,0.0,0.0,0,...,0.0,1,0.4845,0.5103,0.2295,19400,8477.0


('IA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1249,Briar Cl...,Sioux City,IA,0.0,0.0,0.0,1,...,0.1971,1,0.4549,0.7975,0.238,38100,24000
1250,Buena Vi...,Storm Lake,IA,0.0,0.0,0.0,1,...,0.1147,1,0.4911,0.8195,0.3999,38300,23877.5
1253,American...,Cedar Ra...,IA,0.0,0.0,0.0,1,...,0.0,0,0.8889,0.8889,0.4545,PrivacyS...,PrivacyS...


('ID', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
965,Carringt...,Boise,ID,0.0,0.0,0.0,0,...,0.0789,1,0.6654,0.5804,0.558,25000,9500
967,Boise St...,Boise,ID,0.0,0.0,0.0,0,...,0.2519,1,0.3464,0.4669,0.3182,35600,23500
968,Eastern ...,Idaho Falls,ID,0.0,0.0,0.0,0,...,0.5037,1,0.5421,0.5393,0.6041,26600,11375


('ID', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
966,Boise Bi...,Boise,ID,0.0,0.0,0.0,1,...,0.093,1,0.6793,0.6576,0.1613,25500,19596
977,Northwes...,Nampa,ID,0.0,0.0,0.0,1,...,0.1085,1,0.3382,0.6081,0.2991,35900,25500
979,Brigham ...,Rexburg,ID,0.0,0.0,0.0,1,...,0.3462,1,0.4733,0.2138,0.371,38800,11000


('IL', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
43,Prince I...,Elmhurst,IL,0.0,0.0,0.0,0,...,0.0,1,0.7857,0.9375,0.6569,PrivacyS...,20992
981,Adler Un...,Chicago,IL,0.0,0.0,0.0,0,...,,1,,,,,PrivacyS...
982,Alvareit...,Edwardsv...,IL,0.0,0.0,0.0,0,...,0.3404,0,0.6364,0.7792,0.3111,PrivacyS...,9911


('IL', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
986,Augustan...,Rock Island,IL,0.0,0.0,0.0,1,...,0.0061,1,0.231,0.6985,0.0115,47900.0,27000
992,Blackbur...,Carlinville,IL,0.0,0.0,0.0,1,...,0.0433,1,0.5064,0.7937,0.0534,37100.0,26000
1004,Catholic...,Chicago,IL,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...


('IN', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1165,Apex Aca...,Anderson,IN,0.0,0.0,0.0,0,...,0.0,1,0.5,0.3333,0.3333,PrivacyS...,PrivacyS...
1166,Ball Sta...,Muncie,IN,0.0,0.0,0.0,0,...,0.0618,1,0.3399,0.5917,0.0715,38800,25000
1168,Butler U...,Indianap...,IN,0.0,0.0,0.0,0,...,0.0135,1,0.1649,0.5742,0.0185,55000,27000


('IN', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
990,Bethany ...,Richmond,IN,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...
1163,Ancilla ...,Donaldson,IN,0.0,0.0,0.0,1,...,0.1878,1,0.69,0.74,0.2925,29400.0,17000
1164,Anderson...,Anderson,IN,0.0,0.0,0.0,1,...,0.0863,1,0.2118,0.4688,0.1215,35600.0,27000


('KS', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1326,Allen Co...,Iola,KS,0.0,0.0,0.0,0,...,0.5441,1,0.4001,0.2703,0.2886,29100,6900
1328,Barton C...,Great Bend,KS,0.0,0.0,0.0,0,...,0.3534,1,0.1424,0.0922,0.4148,32200,8976
1332,Brown Ma...,Lenexa,KS,0.0,0.0,0.0,0,...,0.0,1,0.644,0.6975,0.6296,25200,16000


('KS', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1327,Baker Un...,Baldwin ...,KS,0.0,0.0,0.0,1,...,0.3511,1,0.3774,0.7296,0.4418,48800,25250
1329,Benedict...,Atchison,KS,0.0,0.0,0.0,1,...,0.006,1,0.2073,0.5367,0.0208,39600,26000
1330,Bethany ...,Lindsborg,KS,0.0,0.0,0.0,1,...,0.0126,1,0.3988,0.1533,0.0316,38100,27000


('KY', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1389,Alice Ll...,Pippa Pa...,KY,0.0,0.0,0.0,0,...,0.0457,1,0.5737,0.6815,0.046,33500,16495
1390,Asbury U...,Wilmore,KY,0.0,0.0,0.0,0,...,0.1741,1,0.3316,0.7001,0.1448,33600,25250
1392,Ashland ...,Ashland,KY,0.0,0.0,0.0,0,...,0.3288,1,0.5259,0.4076,0.3974,23700,11780


('KY', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1391,Asbury T...,Wilmore,KY,0.0,0.0,0.0,1,...,,1,,,,42500,PrivacyS...
1394,Bellarmi...,Louisville,KY,0.0,0.0,0.0,1,...,0.0536,1,0.2364,0.6213,0.0941,46600,25000
1398,Brescia ...,Owensboro,KY,0.0,0.0,0.0,1,...,0.2403,1,0.4989,0.8219,0.4903,37500,30500


('LA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1461,Central ...,Alexandria,LA,0.0,0.0,0.0,0,...,0.3929,1,0.3466,0.0,0.4799,PrivacyS...,PrivacyS...
1462,American...,Shreveport,LA,0.0,0.0,0.0,0,...,0.2951,0,0.6032,0.5159,0.8353,19400,9500
1463,Ayers Ca...,Shreveport,LA,0.0,0.0,0.0,0,...,0.0,1,0.9108,0.841,0.6816,25100,9500


('LA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1470,Centenar...,Shreveport,LA,0.0,0.0,0.0,1,...,0.0091,1,0.3515,0.6092,0.0307,40400,25000.0
1478,Dillard ...,New Orleans,LA,1.0,0.0,0.0,1,...,0.0517,1,0.7422,0.8833,0.0904,32800,35000.0
1492,Louisian...,Pineville,LA,0.0,0.0,0.0,1,...,0.0408,1,0.4319,0.5896,0.1487,39100,23743.5


('MA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1619,Hult Int...,Cambridge,MA,0.0,0.0,0.0,0,...,,1,,,,,PrivacyS...
1620,New Engl...,Boston,MA,0.0,0.0,0.0,0,...,0.7696,1,0.2574,0.4426,0.8543,,18450
1621,American...,Springfield,MA,0.0,0.0,0.0,0,...,0.0659,1,0.5063,0.848,0.2102,38900.0,27000


('MA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1623,Andover ...,Newton C...,MA,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...
1624,Anna Mar...,Paxton,MA,0.0,0.0,0.0,1,...,0.306,1,0.35,0.7725,0.2948,41900.0,25361
1626,Assumpti...,Worcester,MA,0.0,0.0,0.0,1,...,0.0812,1,0.2295,0.7268,0.0781,53600.0,27000


('MD', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1556,Aaron's ...,Waldorf,MD,0.0,0.0,0.0,0,...,0.1786,1,0.7222,0.6481,0.4359,PrivacyS...,PrivacyS...
1557,Aestheti...,Gaithers...,MD,0.0,0.0,0.0,0,...,0.5581,1,0.5833,0.4583,0.65,PrivacyS...,6333
1558,Allegany...,Cumberland,MD,0.0,0.0,0.0,0,...,0.359,1,0.4782,0.4525,0.2946,29300,14072


('MD', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1570,Washingt...,Takoma Park,MD,0.0,0.0,0.0,1,...,0.1443,1,0.5727,0.7854,0.3225,44500,27000
1587,Loyola U...,Baltimore,MD,0.0,0.0,0.0,1,...,0.0086,1,0.1426,0.5335,0.0072,63000,27000
1599,Mount St...,Emmitsburg,MD,0.0,0.0,0.0,1,...,0.046,1,0.2252,0.6215,0.0781,49900,25995


('ME', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1526,Kaplan U...,S Portland,ME,0.0,0.0,0.0,0,...,0.6448,1,0.7368,0.6812,0.752,33400,29493
1527,College ...,Bar Harbor,ME,0.0,0.0,0.0,0,...,0.0219,1,0.3876,0.6152,0.0387,26400,19000
1528,Bates Co...,Lewiston,ME,0.0,0.0,0.0,0,...,0.0,1,0.1167,0.2859,0.0034,51600,16297


('ME', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1535,Husson U...,Bangor,ME,0.0,0.0,0.0,1,...,0.1501,1,0.434,0.7447,0.2332,36900,26250
1549,Saint Jo...,Standish,ME,0.0,0.0,0.0,1,...,0.4766,1,0.2452,0.6548,0.4171,39100,27000
4515,New Engl...,Bangor,ME,0.0,0.0,0.0,1,...,0.1347,1,,,0.1007,27400,27000


('MH', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4561,College ...,Majuro,MH,0.0,0.0,0.0,0,...,0.2727,1,0.8923,0.0,0.231,PrivacyS...,PrivacyS...


('MI', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1224,West Mic...,Kalamazoo,MI,0.0,0.0,0.0,0,...,0.8085,1,0.7807,0.0,0.4368,14800,PrivacyS...
1755,Hillsdal...,Hillsdale,MI,0.0,0.0,0.0,0,...,0.027,1,0.7442,0.7326,0.2,PrivacyS...,PrivacyS...
1756,Northwes...,Southfield,MI,0.0,0.0,0.0,0,...,0.0,1,0.8547,0.8659,0.6478,30200,9500


('MI', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1753,Adrian C...,Adrian,MI,0.0,0.0,0.0,1,...,0.0153,1,0.4142,0.7884,0.0231,37100,27000
1754,Albion C...,Albion,MI,0.0,0.0,0.0,1,...,0.008,1,0.2502,0.6129,0.013,44900,27000
1757,Alma Col...,Alma,MI,0.0,0.0,0.0,1,...,0.0081,1,0.2882,0.907,0.0113,43200,27000


('MN', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
550,Walden U...,Minneapolis,MN,0.0,0.0,0.0,0,...,0.9061,1,0.465,0.6614,0.8741,59700,29125
1863,Academy ...,Bloomington,MN,0.0,0.0,0.0,0,...,0.4034,1,0.5369,0.6913,0.6779,38500,29069
1864,Alexandr...,Alexandria,MN,0.0,0.0,0.0,0,...,0.2249,1,0.2773,0.4067,0.2576,35100,12000


('MN', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1868,Augsburg...,Minneapolis,MN,0.0,0.0,0.0,1,...,0.223,1,0.433,0.7043,0.3108,45700,27000
1872,Bethany ...,Mankato,MN,0.0,0.0,0.0,1,...,0.0318,1,0.3972,0.7599,0.0311,34200,25000
1873,Bethel U...,Saint Paul,MN,0.0,0.0,0.0,1,...,0.1522,1,0.2725,0.6636,0.1991,45000,24069


('MO', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1357,Concorde...,Kansas City,MO,0.0,0.0,0.0,0,...,0.0,1,0.4003,0.4502,0.6181,22100,9500.0
1999,ITT Tech...,Earth City,MO,0.0,0.0,0.0,0,...,0.2456,0,0.8007,0.8339,0.701,38800,25827.5
2001,House of...,Blue Spr...,MO,0.0,0.0,0.0,0,...,0.1765,1,0.4966,0.551,0.3556,11600,9088.5


('MO', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1996,Aquinas ...,Saint Louis,MO,0.0,0.0,0.0,1,...,,1,,,,,PrivacyS...
1997,Assembli...,Springfield,MO,0.0,0.0,0.0,1,...,,1,,,,PrivacyS...,22062
1998,Avila Un...,Kansas City,MO,0.0,0.0,0.0,1,...,0.1794,1,0.4226,0.7059,0.3298,41100,26625


('MP', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4141,Northern...,Saipan,MP,0.0,0.0,0.0,0,...,0.1509,1,0.9125,0.0,0.2002,19600,PrivacyS...


('MS', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1956,Alcorn S...,Alcorn S...,MS,1.0,0.0,0.0,0,...,0.145,1,0.7646,0.8423,0.254,30400,28000
1959,Chris Be...,Gulfport,MS,0.0,0.0,0.0,0,...,0.0,1,0.5649,0.0,0.299,15300,PrivacyS...
1960,Coahoma ...,Clarksdale,MS,1.0,0.0,0.0,0,...,0.0443,1,0.8843,0.0,0.302,21100,PrivacyS...


('MS', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1957,Belhaven...,Jackson,MS,0.0,0.0,0.0,1,...,0.4965,1,0.486,0.6792,0.5435,36800,29656
1958,Blue Mou...,Blue Mou...,MS,0.0,0.0,0.0,1,...,0.0743,1,0.5551,0.5691,0.1692,29200,PrivacyS...
1963,Creation...,Tupelo,MS,0.0,0.0,0.0,1,...,0.0,0,0.8113,0.0,0.4902,17900,PrivacyS...


('MT', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2108,Academy ...,Bozeman,MT,0.0,0.0,0.0,0,...,0.0,1,0.449,0.4286,0.2619,PrivacyS...,PrivacyS...
2109,Blackfee...,Browning,MT,0.0,0.0,0.0,0,...,0.1452,1,0.7279,0.0,0.48,15600,PrivacyS...
2110,Butte Ac...,Butte,MT,0.0,0.0,0.0,0,...,0.0,1,0.7455,0.6364,0.4054,PrivacyS...,9500


('MT', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2113,Carroll ...,Helena,MT,0.0,0.0,0.0,1,...,0.0199,1,0.2103,0.5759,0.0741,45500,27000
2121,Universi...,Great Falls,MT,0.0,0.0,0.0,1,...,0.3971,1,0.3834,0.5315,0.4283,30700,24000
2130,Rocky Mo...,Billings,MT,0.0,0.0,0.0,1,...,0.0266,1,0.3381,0.7996,0.1053,38900,25626


('NC', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2642,College ...,Elizabet...,NC,0.0,0.0,0.0,0,...,0.5136,1,0.4029,0.0,0.3617,22300,PrivacyS...
2643,The Art ...,Charlotte,NC,0.0,0.0,0.0,0,...,0.4588,1,0.7099,0.822,0.2754,28800,25167
2644,South Pi...,Polkton,NC,0.0,0.0,0.0,0,...,0.6648,1,0.415,0.0,0.3595,21700,PrivacyS...


('NC', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2647,Barton C...,Wilson,NC,0.0,0.0,0.0,1,...,0.1082,1,0.4664,0.7843,0.2271,36000,27000
2649,Belmont ...,Belmont,NC,0.0,0.0,0.0,1,...,0.0634,1,0.5086,0.7485,0.4347,36000,27000
2650,Bennett ...,Greensboro,NC,1.0,0.0,1.0,1,...,0.0145,1,0.7265,0.8574,0.0235,26900,37000


('ND', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2772,Rasmusse...,Fargo,ND,0.0,0.0,0.0,0,...,0.4453,1,0.5516,0.6948,0.6286,30900,21163
2773,Bismarck...,Bismarck,ND,0.0,0.0,0.0,0,...,0.2874,1,0.2029,0.3516,0.3351,38400,11588
2774,Dickinso...,Dickinson,ND,0.0,0.0,0.0,0,...,0.2063,1,0.2519,0.4479,0.2436,38800,19500


('ND', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2778,Universi...,Jamestown,ND,0.0,0.0,0.0,1,...,0.0323,1,0.2681,0.6408,0.0806,39600,27000
2782,Universi...,Bismarck,ND,0.0,0.0,0.0,1,...,0.1043,1,0.2402,0.6492,0.1698,45100,22722
2792,Trinity ...,Ellendale,ND,0.0,0.0,0.0,1,...,0.0943,1,0.6768,0.9091,0.1515,25500,27592


('NE', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2133,La'James...,Fremont,NE,0.0,0.0,0.0,0,...,0.0,1,0.6863,0.7451,0.2424,15900,PrivacyS...
2134,Bellevue...,Bellevue,NE,0.0,0.0,0.0,0,...,0.1185,1,0.2914,0.4368,0.8125,52600,17188
2136,Bryan Co...,Lincoln,NE,0.0,0.0,0.0,0,...,0.4745,1,0.3477,0.7049,0.3174,50900,24280.5


('NE', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2135,Clarkson...,Omaha,NE,0.0,0.0,0.0,1,...,0.4628,1,0.3564,0.7205,0.4744,47000,26000
2140,Concordi...,Seward,NE,0.0,0.0,0.0,1,...,0.0373,1,0.2129,0.5046,0.0405,36100,26000
2141,Creighto...,Omaha,NE,0.0,0.0,0.0,1,...,0.0357,1,0.1757,0.683,0.0775,57100,23250


('NH', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2183,Colby-Sa...,New London,NH,0.0,0.0,0.0,0,...,0.0266,1,0.355,0.7536,0.0142,38800,27000
2184,Continen...,Hudson,NH,0.0,0.0,0.0,0,...,0.0,1,0.59,0.9,0.1129,23200,9075
2185,Daniel W...,Nashua,NH,0.0,0.0,0.0,0,...,0.1025,1,0.4034,0.7864,0.1377,50500,26999


('NH', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2193,Northeas...,Warner,NH,0.0,0.0,0.0,1,...,0.0,1,0.0,0.0,,,PrivacyS...
2210,Rivier U...,Nashua,NH,0.0,0.0,0.0,1,...,0.4501,1,0.2753,0.6757,0.4104,41700.0,25500
2211,Saint An...,Manchester,NH,0.0,0.0,0.0,1,...,0.0185,1,0.1747,0.7572,0.0146,52800.0,27000


('NJ', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2215,Eastwick...,Hackensack,NJ,0.0,0.0,0.0,0,...,0.3067,1,0.6667,0.8457,0.6667,27300,12519
2216,Atlantic...,Mays Lan...,NJ,0.0,0.0,0.0,0,...,0.5139,1,0.5204,0.1618,0.3129,28100,10005
2217,Fortis I...,Wayne,NJ,0.0,0.0,0.0,0,...,0.0,1,0.6561,0.731,0.328,30400,10305


('NJ', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2221,Bloomfie...,Bloomfield,NJ,0.0,0.0,0.0,1,...,0.0988,1,0.6733,0.8275,0.2044,36100,30500.0
2224,Caldwell...,Caldwell,NJ,0.0,0.0,0.0,1,...,0.1516,1,0.4214,0.6597,0.2186,44400,26040.0
2226,Centenar...,Hacketts...,NJ,0.0,0.0,0.0,1,...,0.0546,1,0.3202,0.6293,0.3138,41100,25437.5


('NM', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
114,Pima Med...,Albuquerque,NM,0.0,0.0,0.0,0,...,0.0,1,0.59,0.663,0.5387,28200,8708
2303,Olympian...,Alamogordo,NM,0.0,0.0,0.0,0,...,0.0,1,0.8292,0.8134,0.4169,17200,11705
2304,Central ...,Albuquerque,NM,0.0,0.0,0.0,0,...,0.6419,1,0.3945,0.1769,0.4726,29500,10000


('NM', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
7419,Computer...,Las Cruces,NM,,,,1,...,,1,,,,21300,14250


('NV', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2170,Academy ...,Las Vegas,NV,0.0,0.0,0.0,0,...,0.0,1,0.5652,0.7283,0.2468,17200,9500.0
2171,Career C...,Sparks,NV,0.0,0.0,0.0,0,...,0.0,1,0.7853,0.8492,0.5845,23800,14020.5
2172,College ...,Las Vegas,NV,0.0,0.0,0.0,0,...,0.7165,1,0.3228,0.1504,0.4493,31700,10500.0


('NV', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
6439,Touro Un...,Henderson,NV,0.0,0.0,0.0,1,...,0.0323,1,0.0,0.2,0.4,,PrivacyS...
7352,Marinell...,Henderson,NV,,,,1,...,,0,,,,21200.0,9796.5


('NY', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
678,Tri-Stat...,New York,NY,0.0,0.0,0.0,0,...,,1,,,,PrivacyS...,PrivacyS...
2334,Vaughn C...,Flushing,NY,0.0,0.0,0.0,0,...,0.2143,1,0.652,0.6792,0.4142,48700,22625
2335,Adelphi ...,Garden City,NY,0.0,0.0,0.0,0,...,0.0913,1,0.3079,0.5982,0.1562,51300,25000


('NY', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2375,Canisius...,Buffalo,NY,0.0,0.0,0.0,1,...,0.0132,1,0.2964,0.6715,0.0373,45700.0,25000.0
2382,Christ t...,East Aurora,NY,0.0,0.0,0.0,1,...,,1,,,,,
2394,Concordi...,Bronxville,NY,0.0,0.0,0.0,1,...,0.0621,1,0.4329,0.7024,0.3393,43200.0,26000.0


('OH', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2796,ETI Tech...,Niles,OH,0.0,0.0,0.0,0,...,0.4737,1,0.9188,0.725,0.6894,22700,13964
2797,The Art ...,Cincinnati,OH,0.0,0.0,0.0,0,...,0.0556,1,0.6053,0.8947,0.3158,29700,PrivacyS...
2798,Miami-Ja...,Independ...,OH,0.0,0.0,0.0,0,...,0.2487,1,0.8447,0.8539,0.6173,26700,22940


('OH', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2803,Alleghen...,Salem,OH,0.0,0.0,0.0,1,...,0.0566,1,0.6744,0.6977,0.0465,PrivacyS...,PrivacyS...
2808,Ashland ...,Ashland,OH,0.0,0.0,0.0,1,...,0.2411,1,0.2855,0.5807,0.307,39000,27000
2812,Baldwin ...,Berea,OH,0.0,0.0,0.0,1,...,0.0995,1,0.341,0.8166,0.1393,44900,27000


('OK', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3009,American...,Oklahoma...,OK,0.0,0.0,0.0,0,...,0.0,1,0.625,0.5625,0.8333,27300,7023
3013,Broken A...,Broken A...,OK,0.0,0.0,0.0,0,...,0.0769,1,0.5758,0.5909,0.3556,16800,9259
3014,Pontotoc...,Ada,OK,0.0,0.0,0.0,0,...,0.283,1,0.5705,0.0,0.4957,28500,PrivacyS...


('OK', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3010,Bacone C...,Muskogee,OK,0.0,0.0,0.0,1,...,0.114,1,0.9392,0.892,0.1648,29700,26350.0
3011,Oklahoma...,Bartlesv...,OK,0.0,0.0,0.0,1,...,0.4796,1,0.3756,0.6845,0.4769,46100,21276.5
3012,Southern...,Bethany,OK,0.0,0.0,0.0,1,...,0.0411,1,0.4515,0.65,0.3551,45800,18750.0


('OR', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3069,Academy ...,Salem,OR,0.0,0.0,0.0,0,...,0.0,1,0.8642,0.9877,0.5536,14800,18519
3070,Abdill C...,Medford,OR,0.0,0.0,0.0,0,...,0.0,1,0.6486,0.5878,0.45,PrivacyS...,9500
3071,Paul Mit...,Portland,OR,0.0,0.0,0.0,0,...,0.0,1,0.3261,0.4402,0.2159,,10194


('OR', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3081,Concordi...,Portland,OR,0.0,0.0,0.0,1,...,0.1368,1,0.4568,0.7284,0.2839,40400,25000
3086,New Hope...,Eugene,OR,0.0,0.0,0.0,1,...,0.1235,1,0.5922,0.6648,0.2346,26400,24921
3087,George F...,Newberg,OR,0.0,0.0,0.0,1,...,0.0974,1,0.3448,0.6464,0.1426,41700,22000


('PA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3136,Abington...,Willow G...,PA,0.0,0.0,0.0,0,...,0.0,1,0.4286,0.8527,0.6696,63300,15836.0
3137,Jolie Ha...,Hazleton,PA,0.0,0.0,0.0,0,...,0.2593,1,0.726,0.7534,0.433,PrivacyS...,8847.5
3138,Keystone...,Harrisburg,PA,0.0,0.0,0.0,0,...,0.0,1,0.7164,0.7709,0.3578,24400,11677.5


('PA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3139,Bryn Ath...,Bryn Athyn,PA,0.0,0.0,0.0,1,...,0.0258,1,0.3726,0.635,0.0266,PrivacyS...,22294.5
3141,Albright...,Reading,PA,0.0,0.0,0.0,1,...,0.0043,1,0.461,0.8987,0.2452,45800,28750.0
3144,Alleghen...,Meadville,PA,0.0,0.0,0.0,1,...,0.0066,1,0.2628,0.6742,0.0088,48400,29046.0


('PR', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4142,Institut...,Arecibo,PR,0.0,0.0,0.0,0,...,0.0,1,0.8407,0.0,0.2821,12000,PrivacyS...
4143,Educatio...,Bayamon,PR,0.0,0.0,0.0,0,...,0.0,1,0.8189,0.0,0.2933,14500,PrivacyS...
4144,American...,Bayamon,PR,0.0,0.0,0.0,0,...,0.0889,1,0.8252,0.0699,0.2657,19300,3920


('PR', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4147,Universi...,Mayaguez,PR,0.0,0.0,0.0,1,...,0.0638,1,0.8687,0.5291,0.223,18900,13800
4149,Universi...,Bayamón,PR,0.0,0.0,0.0,1,...,0.3055,1,0.82,0.3063,0.2849,18500,8250
4154,Pontific...,Arecibo,PR,0.0,0.0,0.0,1,...,0.0763,1,0.7511,0.3725,0.2595,17900,13195


('PW', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4215,Palau Co...,Koror,PW,0.0,0.0,0.0,0,...,0.3887,1,0.856,0.0,0.2616,24700,PrivacyS...


('RI', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3402,Brown Un...,Providence,RI,0.0,0.0,0.0,0,...,0.0037,1,0.158,0.2291,0.0112,59700,15500
3403,Bryant U...,Smithfield,RI,0.0,0.0,0.0,0,...,0.0212,1,0.1664,0.6027,0.0216,64500,27000
3404,Johnson ...,Providence,RI,0.0,0.0,0.0,0,...,0.0678,1,0.3374,0.7882,0.1037,35300,27000


('RI', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3401,Empire B...,Providence,RI,0.0,0.0,0.0,1,...,0.2743,0,0.8081,0.6566,0.4667,21000,9833
3408,Providen...,Providence,RI,0.0,0.0,0.0,1,...,0.0587,1,0.1272,0.5562,0.0689,57700,27000
3414,Salve Re...,Newport,RI,0.0,0.0,0.0,1,...,0.0586,1,0.196,0.7631,0.0592,49700,27000


('SC', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3417,Aiken Te...,Granitev...,SC,0.0,0.0,0.0,0,...,0.6123,1,0.5611,0.2446,0.4413,24500,9625
3420,Technica...,Beaufort,SC,0.0,0.0,0.0,0,...,0.7032,1,0.6469,0.1796,0.5035,25300,7500
3422,Bob Jone...,Greenville,SC,0.0,0.0,0.0,0,...,0.0223,1,0.4133,0.3824,0.0384,PrivacyS...,19000


('SC', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3418,Allen Un...,Columbia,SC,1.0,0.0,0.0,1,...,0.0273,1,0.8955,0.9831,0.0783,21100,37676
3419,Charlest...,Charleston,SC,0.0,0.0,0.0,1,...,0.1055,1,0.4718,0.7246,0.2198,35700,27741
3421,Benedict...,Columbia,SC,1.0,0.0,0.0,1,...,0.016,1,0.8288,0.9013,0.0784,21400,44000


('SD', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3479,Black Hi...,Rapid City,SD,0.0,0.0,0.0,0,...,0.0,1,0.3931,0.4162,0.1339,16200,11790
3480,Black Hi...,Spearfish,SD,0.0,0.0,0.0,0,...,0.2416,1,0.3443,0.5587,0.2841,34400,25625
3481,Kilian C...,Sioux Falls,SD,0.0,0.0,0.0,0,...,0.8429,0,0.6838,0.751,0.5455,23100,17125


('SD', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3478,Augustan...,Sioux Falls,SD,0.0,0.0,0.0,1,...,0.0315,1,0.2062,0.6158,0.0424,41800,27000
3483,Dakota W...,Mitchell,SD,0.0,0.0,0.0,1,...,0.1176,1,0.3877,0.7815,0.1309,34500,27000
3486,Avera Mc...,Sioux Falls,SD,0.0,0.0,0.0,1,...,0.0,1,0.05,0.3,0.05,PrivacyS...,PrivacyS...


('TN', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1205,ITT Tech...,Nashville,TN,0.0,0.0,0.0,0,...,0.2047,0,0.7352,0.7219,0.8019,38800,25827.5
3507,Arnolds ...,Milan,TN,0.0,0.0,0.0,0,...,0.0,1,0.6311,0.2913,0.4444,16000,PrivacyS...
3508,Tennesse...,Athens,TN,0.0,0.0,0.0,0,...,0.0,1,0.4252,0.0,0.396,26600,PrivacyS...


('TN', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3506,American...,Nashville,TN,1.0,0.0,0.0,1,...,0.2994,1,0.6014,0.8252,0.7305,PrivacyS...,25000
3510,Baptist ...,Memphis,TN,0.0,0.0,0.0,1,...,0.55,1,0.5653,0.8339,0.5059,54100,30000
3511,Belmont ...,Nashville,TN,0.0,0.0,0.0,1,...,0.062,1,0.1729,0.609,0.0848,41800,22707


('TX', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3611,Alvin Co...,Alvin,TX,0.0,0.0,0.0,0,...,0.7123,1,0.1549,0.0625,0.2841,34500,6750
3612,Amarillo...,Amarillo,TX,0.0,0.0,0.0,0,...,0.6922,1,0.3786,0.1573,0.3431,31700,10950
3613,Angelina...,Lufkin,TX,0.0,0.0,0.0,0,...,0.56,1,0.5308,0.0,0.2603,26900,PrivacyS...


('TX', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3610,Abilene ...,Abilene,TX,0.0,0.0,0.0,1,...,0.0468,1,0.2595,0.5527,0.0381,40200,25985
3615,Arlingto...,Arlington,TX,0.0,0.0,0.0,1,...,0.1682,1,0.4978,0.4892,0.2251,34200,22905
3618,Austin C...,Sherman,TX,0.0,0.0,0.0,1,...,0.0016,1,0.2867,0.7581,0.0124,47800,26000


('UT', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3802,AmeriTec...,Provo,UT,0.0,0.0,0.0,0,...,0.0,1,0.7295,0.8074,0.3526,24700,24370
3803,Bridgerl...,Logan,UT,0.0,0.0,0.0,0,...,0.6525,1,0.2017,0.0,0.4148,24300,PrivacyS...
3806,Broadvie...,West Jordan,UT,0.0,0.0,0.0,0,...,0.3452,1,0.6943,0.8035,0.559,25500,28458


('UT', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3804,Brigham ...,Provo,UT,0.0,0.0,0.0,1,...,0.0981,1,0.3702,0.1921,0.122,57200,11000.0
3817,Latter-d...,Salt Lak...,UT,0.0,0.0,0.0,1,...,0.3365,1,0.3319,0.2144,0.2235,35100,5799.0
3818,Everest ...,West Val...,UT,0.0,0.0,0.0,1,...,0.2811,0,0.703,0.5619,0.5371,24400,10632.5


('VA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
704,Medtech ...,Falls Ch...,VA,0.0,0.0,0.0,0,...,0.0,1,0.7097,0.4198,0.2039,26300,9236
3850,Bar Palm...,Roanoke,VA,0.0,0.0,0.0,0,...,0.1831,1,0.4367,0.4177,0.6944,16900,9731
3851,Advanced...,Virginia...,VA,0.0,0.0,0.0,0,...,0.0,1,0.3374,0.301,0.5364,38000,16279


('VA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3852,Averett ...,Danville,VA,0.0,0.0,0.0,1,...,0.0227,1,0.512,0.7879,0.0992,42400,25000
3853,Bluefiel...,Bluefield,VA,0.0,0.0,0.0,1,...,0.1193,1,0.454,0.6989,0.4241,40000,18873
3854,Bridgewa...,Bridgewater,VA,0.0,0.0,0.0,1,...,0.0051,1,0.2754,0.7348,0.0114,40800,27000


('VI', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4216,Universi...,Charlott...,VI,1.0,0.0,0.0,0,...,0.3059,1,0.5161,0.3224,0.3196,31800,15150


('VI', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
7404,Universi...,St. Croix,VI,,,,1,...,,1,,,,31800,15150


('VT', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3829,Benningt...,Bennington,VT,0.0,0.0,0.0,0,...,0.0048,1,0.2342,0.5703,0.0097,24600,27000
3830,Burlingt...,Burlington,VT,0.0,0.0,0.0,0,...,0.0843,0,0.467,0.7123,0.2545,26000,25000
3831,Castleto...,Castleton,VT,0.0,0.0,0.0,0,...,0.091,1,0.2938,0.6399,0.0938,34900,25000


('VT', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3835,Green Mo...,Poultney,VT,0.0,0.0,0.0,1,...,0.0197,1,0.3847,0.7051,0.0407,30100,25449
3843,Saint Mi...,Colchester,VT,0.0,0.0,0.0,1,...,0.0108,1,0.1682,0.6456,0.022,46600,27400
3845,College ...,Rutland,VT,0.0,0.0,0.0,1,...,0.1,1,0.6193,0.7216,0.2557,34700,24127


('WA', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3943,Beauty A...,Wenatchee,WA,0.0,0.0,0.0,0,...,0.0,1,0.6385,0.6154,0.3896,PrivacyS...,8718.5
3944,The Art ...,Seattle,WA,0.0,0.0,0.0,0,...,0.3128,1,0.4272,0.6662,0.3795,34100,25937.5
3945,Evergree...,Bellevue,WA,0.0,0.0,0.0,0,...,0.0,1,0.3314,0.3314,0.44,PrivacyS...,7917.0


('WA', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3967,Gonzaga ...,Spokane,WA,0.0,0.0,0.0,1,...,0.0128,1,0.1832,0.5155,0.0298,53000,25500.0
3981,Trinity ...,Everett,WA,0.0,0.0,0.0,1,...,0.0478,0,0.4948,0.701,0.2165,37100,25000.0
3985,Northwes...,Kirkland,WA,0.0,0.0,0.0,1,...,0.1239,1,0.379,0.6787,0.3067,37700,23724.5


('WI', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4063,Advanced...,Glendale,WI,0.0,0.0,0.0,0,...,0.0,1,0.5089,0.5325,0.1736,24000,10314
4064,VICI Ave...,Greenfield,WI,0.0,0.0,0.0,0,...,0.0,1,0.5069,0.6406,0.2059,24000,10314
4066,Madison ...,Madison,WI,0.0,0.0,0.0,0,...,0.6223,1,0.3259,0.3565,0.508,35000,14250


('WI', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4065,Alverno ...,Milwaukee,WI,0.0,0.0,1.0,1,...,0.2481,1,0.6096,0.8743,0.3464,37100,32606.5
4070,Cardinal...,Milwaukee,WI,0.0,0.0,0.0,1,...,0.0638,1,0.4278,0.7356,0.6632,48500,27000.0
4071,Carroll ...,Waukesha,WI,0.0,0.0,0.0,1,...,0.0838,1,0.2709,0.7137,0.1119,41300,27000.0


('WV', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2842,Scott Co...,Wheeling,WV,0.0,0.0,0.0,0,...,0.0,1,0.5276,0.4961,0.1111,14800,9250
4019,B M Spur...,Glen Dale,WV,0.0,0.0,0.0,0,...,0.0,1,0.4722,0.0,0.4444,PrivacyS...,PrivacyS...
4020,Ben Fran...,Dunbar,WV,0.0,0.0,0.0,0,...,0.7885,1,0.1761,0.0,0.7568,20800,PrivacyS...


('WV', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4016,Alderson...,Philippi,WV,0.0,0.0,0.0,1,...,0.0342,1,0.48,0.8945,0.0722,46000,27000.0
4018,Appalach...,Mount Hope,WV,0.0,0.0,0.0,1,...,0.0612,1,0.4388,0.277,0.0899,28700,9300.0
4027,Davis & ...,Elkins,WV,0.0,0.0,0.0,1,...,0.0167,1,0.4188,0.6579,0.1133,35000,23840.5


('WY', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4128,Casper C...,Casper,WY,0.0,0.0,0.0,0,...,0.325,1,0.2188,0.1455,0.3447,34800,10764
4129,Central ...,Riverton,WY,0.0,0.0,0.0,0,...,0.4377,1,0.17,0.0967,0.3992,25200,8757
4130,Eastern ...,Torrington,WY,0.0,0.0,0.0,0,...,0.3275,1,0.1729,0.1166,0.2371,25900,10000


In [49]:
# 요렇게만 봐도 충분하긴하다
for name, group in grouped:
    print(name)
    print(group)
    break

('AK', 0)
           INSTNM       CITY STABBR  HBCU  MENONLY  WOMENONLY  RELAFFIL  ...  \
60    Universi...  Anchorage     AK   0.0      0.0        0.0         0  ...   
62    Universi...  Fairbanks     AK   0.0      0.0        0.0         0  ...   
63    Universi...     Juneau     AK   0.0      0.0        0.0         0  ...   
65    AVTEC-Al...     Seward     AK   0.0      0.0        0.0         0  ...   
66    Charter ...  Anchorage     AK   0.0      0.0        0.0         0  ...   
67    Alaska C...  Anchorage     AK   0.0      0.0        0.0         0  ...   
5171  Ilisagvi...     Barrow     AK   0.0      0.0        0.0         0  ...   

      PPTUG_EF  CURROPER  PCTPELL  PCTFLOAN  UG25ABV  MD_EARN_WNE_P10  \
60      0.4539         1   0.2385    0.2647   0.4386        42500       
62      0.3887         1   0.2263    0.2550   0.4519        36200       
63      0.5112         1   0.1769    0.1996   0.5550        37400       
65      0.6817         1   0.0737    0.0664   0.7127     

In [50]:
group

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,Universi...,Anchorage,AK,0.0,0.0,0.0,0,...,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,Universi...,Fairbanks,AK,0.0,0.0,0.0,0,...,0.3887,1,0.2263,0.255,0.4519,36200,19355
63,Universi...,Juneau,AK,0.0,0.0,0.0,0,...,0.5112,1,0.1769,0.1996,0.555,37400,16875
65,AVTEC-Al...,Seward,AK,0.0,0.0,0.0,0,...,0.6817,1,0.0737,0.0664,0.7127,33500,PrivacyS...
66,Charter ...,Anchorage,AK,0.0,0.0,0.0,0,...,0.0,1,0.8307,0.7503,0.5472,39200,13875
67,Alaska C...,Anchorage,AK,0.0,0.0,0.0,0,...,0.0,1,0.7078,0.786,0.5612,28700,8994
5171,Ilisagvi...,Barrow,AK,0.0,0.0,0.0,0,...,0.6239,1,0.1323,0.0,0.6498,24900,PrivacyS...


In [51]:
# 각 그룹별 하나씩 추출해서 볼수도 있음 
grouped.head(1)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama ...,Normal,AL,1.0,0.0,0.0,0,...,0.0656,1,0.7356,0.8284,0.1049,30300,33888
2,Amridge ...,Montgomery,AL,0.0,0.0,0.0,1,...,0.4536,1,0.6801,0.7795,0.8540,40100,23370
43,Prince I...,Elmhurst,IL,0.0,0.0,0.0,0,...,0.0000,1,0.7857,0.9375,0.6569,PrivacyS...,20992
60,Universi...,Anchorage,AK,0.0,0.0,0.0,0,...,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
61,Alaska B...,Palmer,AK,0.0,0.0,0.0,1,...,0.1481,1,0.3571,0.2857,0.4286,,PrivacyS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,College ...,Majuro,MH,0.0,0.0,0.0,0,...,0.2727,1,0.8923,0.0000,0.2310,PrivacyS...,PrivacyS...
5289,Pacific ...,Mangilao,GU,0.0,0.0,0.0,1,...,0.1846,1,0.9730,0.0000,0.2533,PrivacyS...,PrivacyS...
6439,Touro Un...,Henderson,NV,0.0,0.0,0.0,1,...,0.0323,1,0.0000,0.2000,0.4000,,PrivacyS...
7404,Universi...,St. Croix,VI,,,,1,...,,1,,,,31800,15150


In [52]:
# nth 메서드를 사용하면 각 그룹의 특정 행을 선택할 수 있음 
grouped.nth([1, -1])

Unnamed: 0_level_0,Unnamed: 1_level_0,INSTNM,CITY,HBCU,MENONLY,WOMENONLY,SATVRMID,SATMTMID,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AK,0,Universi...,Fairbanks,0.0,0.0,0.0,,,...,0.3887,1,0.2263,0.2550,0.4519,36200,19355
AK,0,Ilisagvi...,Barrow,0.0,0.0,0.0,,,...,0.6239,1,0.1323,0.0000,0.6498,24900,PrivacyS...
AK,1,Alaska P...,Anchorage,0.0,0.0,0.0,555.0,503.0,...,0.3745,1,0.3152,0.5297,0.4910,47000,23250
AK,1,Alaska C...,Soldotna,0.0,0.0,0.0,,,...,0.0735,1,0.8868,0.6792,0.2264,,PrivacyS...
AL,0,Universi...,Birmingham,0.0,0.0,0.0,570.0,565.0,...,0.2607,1,0.3460,0.5214,0.2422,39700,21941.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WV,0,BridgeVa...,South C...,0.0,0.0,0.0,,,...,0.3890,1,,,,,9429.5
WV,1,Appalach...,Mount Hope,0.0,0.0,0.0,514.0,487.0,...,0.0612,1,0.4388,0.2770,0.0899,28700,9300
WV,1,West Vir...,Nutter Fort,,,,,,...,,1,,,,16700,19258
WY,0,Central ...,Riverton,0.0,0.0,0.0,,,...,0.4377,1,0.1700,0.0967,0.3992,25200,8757


## Filtering for states with a minority majority

In [53]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups
# 그룹 갯수는 STABBR의 nunique()와 같아야함

59

In [54]:
college['STABBR'].nunique() 

59

In [55]:
def check_minority(df, threshold):
    minority_pct = 1 - df['UGDS_WHITE'] # 유색 인종 비율
    total_minority = (df['UGDS'] * minority_pct).sum() # 각 기관들 학부생 중 유색인종 수의 총합
    total_ugds = df['UGDS'].sum() # 각 기관들의 학부생 수의 총합
    total_minority_pct = total_minority / total_ugds # 전체 모든 학부생들 중 유색 인종의 비율 
    return total_minority_pct > threshold

In [56]:
minority_pct = 1 - college['UGDS_WHITE'] # 유색 인종 비율
minority_pct

INSTNM
Alabama A & M University                                  0.9667
University of Alabama at Birmingham                       0.4078
Amridge University                                        0.7010
University of Alabama in Huntsville                       0.3012
Alabama State University                                  0.9842
                                                           ...  
SAE Institute of Technology  San Francisco                   NaN
Rasmussen College - Overland Park                            NaN
National Personal Training Institute of Cleveland            NaN
Bay Area Medical Academy - San Jose Satellite Location       NaN
Excel Learning Center-San Antonio South                      NaN
Name: UGDS_WHITE, Length: 7535, dtype: float64

In [57]:
total_minority = (college['UGDS'] * minority_pct).sum() # 각 기관들 학부생 중 유색인종 수의 총합
total_minority

7898501.594700001

In [58]:
total_ugds = college['UGDS'].sum() # 각 기관들의 학부생 수의 총합
total_ugds

16200904.0

In [59]:
total_minority_pct = total_minority / total_ugds # 전체 모든 학부생들 중 유색 인종의 비율 
total_minority_pct

0.4875346211976814

In [60]:
# 여기서 필터의 역할은 threshold 값을 기준으로 각 그룹을 유지할지 여부를 판별
college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,...,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Everest College-Phoenix,Phoenix,AZ,0.0,0.0,0.0,1,,...,0.4749,0,0.8291,0.7151,0.6700,28600,9500
Collins College,Phoenix,AZ,0.0,0.0,0.0,0,,...,0.3373,0,0.7205,0.8228,0.4764,25700,47000
Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,...,0.1600,0,0.6349,0.5873,0.4651,17800,9588
Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,...,0.2222,1,0.7962,0.6615,0.4229,18200,9833
Thunderbird School of Global Management,Glendale,AZ,0.0,0.0,0.0,0,,...,1.0000,0,0.0000,0.0000,0.0000,118900,PrivacyS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WestMed College - Merced,Merced,CA,,,,1,,...,,1,,,,,15623.5
Vantage College,El Paso,TX,,,,1,,...,,1,,,,,9500
SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,...,,1,,,,,9500
Bay Area Medical Academy - San Jose Satellite Location,San Jose,CA,,,,1,,...,,1,,,,,PrivacyS...


In [61]:
print('college.shape',college.shape)
print('college_filtered.shape',college_filtered.shape)

college.shape (7535, 26)
college_filtered.shape (3028, 26)


In [62]:
college['STABBR'].nunique()

59

In [63]:
college_filtered['STABBR'].nunique()

20

In [64]:
college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape

(7461, 26)

In [65]:
college_filtered_20['STABBR'].nunique()

57

In [66]:
college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70.shape

(957, 26)

In [67]:
college_filtered_70['STABBR'].nunique()

10

## Transforming through a weight loss bet

In [68]:
# 두 사람의 체중 감소를 추적해 승자를 결정짓는 예제
weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"') # weight_loss[weight_loss.Month == 'Jan']

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [69]:
# Series 데이터의 체중 비교 함수
def percent_loss(s):
    return ((s - s.iloc[0]) / s.iloc[0]) * 100

In [70]:
(weight_loss
    .query('Name=="Bob" and Month=="Jan"')
    ['Weight']
    .pipe(percent_loss)
)

0    0.000000
2   -1.030928
4   -2.749141
6   -2.749141
Name: Weight, dtype: float64

In [71]:
# groupby의 transform -> 각각의 그룹에 대해 apply
# 기존 인덱스 유지
(weight_loss
    .groupby(['Name', 'Month'])
    ['Weight'] 
    .transform(percent_loss)
)

0     0.000000
1     0.000000
2    -1.030928
3    -4.060914
4    -2.749141
        ...   
27   -3.529412
28   -3.065134
29   -3.529412
30   -4.214559
31   -5.294118
Name: Weight, Length: 32, dtype: float64

In [72]:
# assign -> 열추가 -> .assign(열이름 = (Series 데이터))
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Name=="Bob" and Month in ["Jan", "Feb"]')
)

Unnamed: 0,Name,Month,Week,Weight,percent_loss
0,Bob,Jan,Week 1,291,0.0
2,Bob,Jan,Week 2,288,-1.0
4,Bob,Jan,Week 3,283,-2.7
6,Bob,Jan,Week 4,283,-2.7
8,Bob,Feb,Week 1,283,0.0
10,Bob,Feb,Week 2,275,-2.8
12,Bob,Feb,Week 3,268,-5.3
14,Bob,Feb,Week 4,268,-5.3


In [73]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"') # 각 월에 마지막 주의 데이터만 가져옴
)

Unnamed: 0,Name,Month,Week,Weight,percent_loss
6,Bob,Jan,Week 4,283,-2.7
7,Amy,Jan,Week 4,190,-3.6
14,Bob,Feb,Week 4,268,-5.3
15,Amy,Feb,Week 4,173,-8.9
22,Bob,Mar,Week 4,261,-2.6
23,Amy,Mar,Week 4,170,-1.7
30,Bob,Apr,Week 4,250,-4.2
31,Amy,Apr,Week 4,161,-5.3


In [74]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss') # pivot으로 데이터 테이블 형태 변경
)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-5.3,-4.2
Feb,-8.9,-5.3
Jan,-3.6,-2.7
Mar,-1.7,-2.6


In [75]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
) # 변경한 테이블에 winner column 추가

Name,Amy,Bob,winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-5.3,-4.2,Amy
Feb,-8.9,-5.3,Amy
Jan,-3.6,-2.7,Amy
Mar,-1.7,-2.6,Bob


In [77]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
    .style.highlight_min(subset = ['Amy', 'Bob'],axis=1)
) # 최소값 강조

Name,Amy,Bob,winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-5.3,-4.2,Amy
Feb,-8.9,-5.3,Amy
Jan,-3.6,-2.7,Amy
Mar,-1.7,-2.6,Bob


In [78]:
# 이긴 횟수 비교
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
    .winner
    .value_counts()
)

Amy    3
Bob    1
Name: winner, dtype: int64

In [79]:
# pivot의 기능을 unstack으로 따라해본거라고 합니다.
# groupby로 이중 인덱스 만들고 unstack으로 풀어버린것
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .groupby(['Month', 'Name'])
    ['percent_loss']
    .first()
    .unstack()
)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-5.3,-4.2
Feb,-8.9,-5.3
Jan,-3.6,-2.7
Mar,-1.7,-2.6


### There's more...

In [80]:
# 요건 Categorical 데이터가 순서를 지정할 수 있다는 점을 이용해
# Month 열을 category로 만들어 데이터 순서를 부여하여 
# 최종 출력의 Month를 실제 월 순서로 만들어버린 예시
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)),
            Month=pd.Categorical(weight_loss.Month,
                  categories=['Jan', 'Feb', 'Mar', 'Apr'],
                  ordered=True))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,-3.6,-2.7
Feb,-8.9,-5.3
Mar,-1.7,-2.6
Apr,-5.3,-4.2


## Calculating weighted mean SAT scores per state with apply

In [81]:
# 각 그룹에 대해 계산을 수행하는 함수를 받는 4가지 메서드
# agg : 스칼라값 반환
# filter : boolean 반환
# transform : 전달된 그룹과 길이가 같은 Series, DataFrame 반환
# apply : 유연한 return 가능

# agg, transform -> 각 집계 열마다 호출
# apply 그룹당 한번만 호출 

# college 데이터셋에서 주별로 수학과 언어 SAT 점수의 가중 평균을 계산하는 예제 
# 학교별 학부생 수를 이용해 점수 가중
college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
college.shape

(7535, 27)

In [82]:
college2.shape
# 결측치 싹다 제거

(1184, 27)

In [83]:
# 사용자 정의함수 생성, 가중평균 계산
def weighted_math_average(df):
    weighted_math = df['UGDS'] * df['SATMTMID']
    return int(weighted_math.sum() / df['UGDS'].sum())

In [84]:
# 그룹(주)별로 한번씩 함수가 호출되서 하나의 값으로 출력됨
college2.groupby('STABBR').apply(weighted_math_average)

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
     ... 
VT    566
WA    555
WI    593
WV    500
WY    540
Length: 53, dtype: int64

In [85]:
# 모든 열에 agg 적용해보면 각 집계열에 대해 값을 반환하려고 하기 때문에 오류 발생
# 저 그룹에 각 집계열은 UGDS나 SATMTMID 같은 다른 열에 접근권한을 가지고 있는게 아니기 때문에 에러
(college2
    .groupby('STABBR')
    .agg(weighted_math_average)
)

KeyError: 'UGDS'

In [None]:
# 마찬가지 SATMTMID로 열을 국한해도, UGDS라는 비집계열에 대한 접근 권한이 없어 에러발생
(college2
    .groupby('STABBR')
    ['SATMTMID'] 
    .agg(weighted_math_average)
)

In [86]:
# apply는 유연해서 Series를 반환해 다수의 새로운 열을 만들수도 있다
def weighted_average(df):
    weight_m = df['UGDS'] * df['SATMTMID']
    weight_v = df['UGDS'] * df['SATVRMID']
    wm_avg = weight_m.sum() / df['UGDS'].sum()
    wv_avg = weight_v.sum() / df['UGDS'].sum()
    data = {'w_math_avg': wm_avg,
           'w_verbal_avg': wv_avg,
           'math_avg': df['SATMTMID'].mean(),
           'verbal_avg': df['SATVRMID'].mean(),
           'count': len(df)
    }
    return pd.Series(data)


(college2
    .groupby('STABBR')
    .apply(weighted_average)
    .astype(int)
)

Unnamed: 0_level_0,w_math_avg,w_verbal_avg,math_avg,verbal_avg,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,503,555,503,555,1
AL,536,533,504,508,21
AR,529,504,515,491,16
AZ,569,557,536,538,6
CA,564,539,562,549,72
...,...,...,...,...,...
VT,566,564,526,527,8
WA,555,541,551,548,18
WI,593,556,545,516,14
WV,500,487,481,473,17


In [87]:
(college
    .groupby('STABBR')
    .apply(weighted_average)
)
# 이 예제에서 결측치를 제거하지 않으면 가중평균계산이 중단되어 잘못된 값을 출력하게 된다.

Unnamed: 0_level_0,w_math_avg,w_verbal_avg,math_avg,verbal_avg,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,5.548091,6.121651,503.000000,555.000000,10.0
AL,261.895658,260.550109,504.285714,508.476190,96.0
AR,301.054792,287.264872,515.937500,491.875000,86.0
AS,0.000000,0.000000,,,1.0
AZ,61.815821,60.511712,536.666667,538.333333,133.0
...,...,...,...,...,...
VT,389.967094,388.696848,526.875000,527.500000,27.0
WA,274.885878,267.880280,551.222222,548.333333,123.0
WI,153.803086,144.160115,545.071429,516.857143,112.0
WV,224.697582,218.843452,481.705882,473.411765,73.0


In [88]:
from scipy.stats import gmean, hmean
def calculate_means(df):
    df_means = pd.DataFrame(index=['Arithmetic', 'Weighted',
                                   'Geometric', 'Harmonic'])
    cols = ['SATMTMID', 'SATVRMID']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['UGDS'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        df_means[col] = [arithmetic, weighted,
                         geometric, harmonic]
    df_means['count'] = len(df)
    return df_means.astype(int)

# 평균, 가중평균, 기하평균, 조화평균에 대해 계산하는 사용자 정의함수를 
# apply를 통해 각 주별로 적용시킨 예시
(college2
    .groupby('STABBR')
    .apply(calculate_means)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,SATMTMID,SATVRMID,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,Arithmetic,503,555,1
AK,Weighted,503,555,1
AK,Geometric,503,555,1
AK,Harmonic,503,555,1
AL,Arithmetic,504,508,21
...,...,...,...,...
WV,Harmonic,480,472,17
WY,Arithmetic,540,535,1
WY,Weighted,540,535,1
WY,Geometric,540,534,1


## Grouping by continuous variables

In [89]:
# 이산(discrete) 값이 아닌 연속(continuous)한 값을 그룹화 하는 방법에 대한 예시
# 다양한 운항거리에 대한 항공사의 분포를 구하는 예시

flights = pd.read_csv('data/flights.csv')
flights

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58487,12,31,4,AA,SFO,DFW,515,5.0,166.0,1464,1045,-19.0,0,0
58488,12,31,4,F9,LAS,SFO,1910,13.0,71.0,414,2050,4.0,0,0
58489,12,31,4,OO,SFO,SBA,1846,-6.0,46.0,262,1956,-5.0,0,0
58490,12,31,4,WN,MSP,ATL,525,39.0,124.0,907,855,34.0,0,0


In [90]:
# 거리 값을 구분하기 위한 기준 구간 값 설정
bins = [-np.inf, 200, 500, 1000, 2000, np.inf] 
# flights['DIST']를 bins에 따라 5개의 구간으로 카테고리형 변환
cuts = pd.cut(flights['DIST'], bins=bins)
cuts

0        (500.0, ...
1        (1000.0,...
2        (500.0, ...
3        (1000.0,...
4        (1000.0,...
            ...     
58487    (1000.0,...
58488    (200.0, ...
58489    (200.0, ...
58490    (500.0, ...
58491    (500.0, ...
Name: DIST, Length: 58492, dtype: category
Categories (5, interval[float64, right]): [(-inf, 2... < (200.0, ... < (500.0, ... < (1000.0,... < (2000.0,...]

In [91]:
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

In [92]:
# 각 거리 구간으로 그룹을 만들고 전체 항공편 중 항공사별 비중 계산 
(flights
    .groupby(cuts)
    ['AIRLINE']
    .value_counts(normalize=True) 
    .round(3)
)

DIST           AIRLINE
(-inf, 200.0]  OO         0.326
               EV         0.289
               MQ         0.211
               DL         0.086
               AA         0.052
                          ...  
(2000.0, inf]  WN         0.046
               HA         0.028
               NK         0.019
               AS         0.012
               F9         0.004
Name: AIRLINE, Length: 57, dtype: float64

In [93]:
# 요렇게 하면 거리 그룹별  백분위별 비행시간을 찾을 수 있음 
(flights
  .groupby(cuts)
  ['AIR_TIME']
  .quantile(q=[.25, .5, .75]) 
  .div(60)
  .round(2)
)

DIST                  
(-inf, 200.0]     0.25    0.43
                  0.50    0.50
                  0.75    0.57
(200.0, 500.0]    0.25    0.77
                  0.50    0.92
                          ... 
(1000.0, 2000.0]  0.50    2.93
                  0.75    3.40
(2000.0, inf]     0.25    4.30
                  0.50    4.70
                  0.75    5.03
Name: AIR_TIME, Length: 15, dtype: float64

In [94]:
# 예시가 맞나..? cut은 거리로 해두고 라벨은 시간으로 했는데... 
# 암튼 이렇게 구간에 라벨을 부여할수도 있음. 본예시는 unstack으로 펼치기까지 함
labels=['Under an Hour', '1 Hour', '1-2 Hours',
        '2-4 Hours', '4+ Hours']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
(flights
   .groupby(cuts2)
   ['AIRLINE']
   .value_counts(normalize=True) 
   .round(3) 
   .unstack() 
)

AIRLINE,AA,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
DIST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Under an Hour,0.052,,,0.086,0.289,,,0.211,,0.326,0.027,,,0.009
1 Hour,0.071,0.001,0.007,0.189,0.156,0.005,,0.1,0.012,0.159,0.062,0.016,0.028,0.194
1-2 Hours,0.144,0.023,0.003,0.206,0.101,0.038,,0.051,0.03,0.106,0.131,0.025,0.004,0.138
2-4 Hours,0.264,0.016,0.003,0.165,0.016,0.031,,0.003,0.045,0.046,0.199,0.04,0.012,0.16
4+ Hours,0.212,0.012,0.08,0.171,,0.004,0.028,,0.019,,0.289,0.065,0.074,0.046


## Counting the total number of flights between cities

In [95]:
# 두 도시 사이의 총 비행횟수를 세는 예시 
flights = pd.read_csv('data/flights.csv')
flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
# 요로코롬 출발지와 도착지에 대해 그룹화 시키고 size로 몇회인지 반환
flights_ct

ORG_AIR  DEST_AIR
ATL      ABE          31
         ABQ          16
         ABY          19
         ACY           6
         AEX          40
                    ... 
SFO      SNA         122
         STL          20
         SUN          10
         TUS          20
         XNA           2
Length: 1130, dtype: int64

In [96]:
# 이렇게도 할수 있지만... 모든 항공편에 적용할수 있는 해법이 필요
flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]

ORG_AIR  DEST_AIR
ATL      IAH         121
IAH      ATL         148
dtype: int64

In [97]:
# 각 행의 출발지와 목적지 열을 알파벳 순으로 정렬
# 요렇게하면 열의 의미가 없어지고 두 열의 값이 각 행에대해 정렬되버림
f_part3 = (flights  # doctest: +SKIP
  [['ORG_AIR', 'DEST_AIR']] 
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
)
f_part3

Unnamed: 0,0,1
0,LAX,SLC
1,DEN,IAD
2,DFW,VPS
3,DCA,DFW
4,LAX,MCI
...,...,...
58487,DFW,SFO
58488,LAS,SFO
58489,SBA,SFO
58490,ATL,MSP


In [98]:
# 열이름 재지정해주고
# 다시 그룹화 시켜봄
rename_dict = {0:'AIR1', 1:'AIR2'}  
(flights     # doctest: +SKIP
  [['ORG_AIR', 'DEST_AIR']]
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
  .rename(columns=rename_dict)
  .groupby(['AIR1', 'AIR2'])
  .size()
)

AIR1  AIR2
ABE   ATL      31
      ORD      24
ABI   DFW      74
ABQ   ATL      16
      DEN      46
             ... 
SFO   SNA     122
      STL      20
      SUN      10
      TUS      20
      XNA       2
Length: 1085, dtype: int64

In [99]:
# 
(flights     # doctest: +SKIP
  [['ORG_AIR', 'DEST_AIR']]
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns') #
  .rename(columns=rename_dict)
  .groupby(['AIR1', 'AIR2'])
  .size()
  .loc[('ATL', 'IAH')]
)

269

In [92]:
# 행 별로 정렬되어있어 순서를 바꾸면 오류가 나게 된다.(해당 키가 없다고)
(flights     # doctest: +SKIP
  [['ORG_AIR', 'DEST_AIR']]
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
  .rename(columns=rename_dict)
  .groupby(['AIR1', 'AIR2'])
  .size()
  .loc[('IAH', 'ATL')]
)

KeyError: ('IAH', 'ATL')

### There's more ...

In [108]:
# axis = 'columns'로 .apply 호출하는 것은 성능이 낮은 작업으로 더 큰데이터에서 사용은 힘듬
# Numpy sort를 사용하면 속도를 크게 올릴수 있다는 예시
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]

array([['LAX', 'SLC'],
       ['DEN', 'IAD'],
       ['DFW', 'VPS'],
       ['DCA', 'DFW'],
       ['LAX', 'MCI'],
       ['IAH', 'SAN'],
       ['DFW', 'MSY'],
       ['PHX', 'SFO'],
       ['ORD', 'STL'],
       ['IAH', 'SJC']], dtype=object)

In [109]:
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
flights_sort2.equals(f_part3.rename(columns={'ORG_AIR':'AIR1',
    'DEST_AIR':'AIR2'}))

False

In [110]:
%%timeit
flights_sort = (flights   # doctest: +SKIP
    [['ORG_AIR', 'DEST_AIR']] 
   .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
)

9.72 s ± 68.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [111]:
%%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted,
    columns=['AIR1', 'AIR2'])

7.77 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Finding the longest streak of on-time flights

In [96]:
# 각 출발지에서 항공사 별로 가장 긴 연속 정시 비행을 찾는 예시
#요건 짧은 예시
s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0])
s

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [97]:
# .cumsum() 누적합
s1 = s.cumsum()
s1

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64

In [98]:
# 누적합과 기존데이터간의 곱
s.mul(s1)

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64

In [99]:
# 각 인덱스간의 차이
s.mul(s1).diff()

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64

In [100]:
# 저 차이에서 음수는 연속된 1이 끝났음을 의미
# 끝난 지점 외에는 죄다 nan 처리
(s
    .mul(s.cumsum())
    .diff()
    .where(lambda x: x < 0)
)

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64

In [101]:
# ffill을 쓰면 아래로 값을 전파
(s
    .mul(s.cumsum())
    .diff()
    .where(lambda x: x < 0)
    .ffill()
)

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64

In [102]:
# 전파한 값에 누적합산을 더하면 초과누적 상쇄
(s
    .mul(s.cumsum())
    .diff()
    .where(lambda x: x < 0)
    .ffill()
    .add(s.cumsum(), fill_value=0)
)

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64

In [103]:
# 이제 본 예제 시작
flights = pd.read_csv('data/flights.csv')
(flights
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    [['AIRLINE', 'ORG_AIR', 'ON_TIME']]
)

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
...,...,...,...
58487,AA,SFO,1
58488,F9,LAS,1
58489,OO,SFO,1
58490,WN,MSP,0


In [104]:
# 위에서 했던 예시를 고대로 적용해서 Series의 최대 연속을 반환하는 함수를 정의

def max_streak(s):
    s1 = s.cumsum()
    return (s
       .mul(s1)
       .diff()
       .where(lambda x: x < 0) 
       .ffill()
       .add(s1, fill_value=0)
       .max()
    )

In [105]:
# 항공사와 출발지 공항별 정시도착비율, 비행횟수, 최대연속 정시 비행횟수 구하는 코드
(flights
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP']) 
    .groupby(['AIRLINE', 'ORG_AIR'])
    ['ON_TIME'] 
    .agg(['mean', 'size', max_streak])
    .round(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.80,196,24
AA,LAS,0.79,374,29
...,...,...,...,...
WN,LAS,0.77,2031,39
WN,LAX,0.70,1135,23
WN,MSP,0.84,237,32
WN,PHX,0.77,1724,33


### There's more...

In [106]:
# 가장 긴 연착 찾기
# Ontime 아닌거 찾아서 똑같이 돌리면 됩니다.. 
def max_delay_streak(df):
    df = df.reset_index(drop=True)
    late = 1 - df['ON_TIME']
    late_sum = late.cumsum()
    streak = (late
        .mul(late_sum)
        .diff()
        .where(lambda x: x < 0) 
        .ffill()
        .add(late_sum, fill_value=0)
    )
    last_idx = streak.idxmax()
    first_idx = last_idx - streak.max() + 1
    res = (df
        .loc[[first_idx, last_idx], ['MONTH', 'DAY']]
        .assign(streak=streak.max())
    )
    res.index = ['first', 'last']
    return res

In [110]:
(flights
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP']) 
    .groupby(['AIRLINE', 'ORG_AIR']) 
    .apply(max_delay_streak) 
    .sort_values('streak', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MONTH,DAY,streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,DFW,first,2.0,26.0,38.0
AA,DFW,last,3.0,1.0,38.0
MQ,ORD,last,1.0,12.0,28.0
MQ,ORD,first,1.0,6.0,28.0
MQ,DFW,last,2.0,26.0,25.0
...,...,...,...,...,...
US,LAS,last,1.0,7.0,1.0
AS,ATL,first,5.0,4.0,1.0
OO,LAS,first,2.0,8.0,1.0
EV,PHX,last,8.0,1.0,0.0
