# 4. Data Transformation
## 4.1 Introduction 
This practice focuses on learning data transformation using the pandas package and a new dataset on flights that departed New York City in 2013.

In [37]:
import pandas as pd

# Then we need to download the data from the Internet


In [38]:
# Location of the flights CSV file in the data4python4ds GitHub repository.
url = "https://raw.githubusercontent.com/byuidatascience/data4python4ds/master/data-raw/flights/flights.csv"
# read_csv is given the URL directly, so the file is fetched and parsed in one step.
flights = pd.read_csv(url)

In [39]:
# Preview the first five rows to check that the data loaded as expected.
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z


In [40]:
# Print a summary of the frame: number of rows, column names, non-null counts and dtypes.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute    

In [41]:
# Inspect the time_hour column: it was read in as plain strings (dtype object).
flights["time_hour"]

0         2013-01-01T10:00:00Z
1         2013-01-01T10:00:00Z
2         2013-01-01T10:00:00Z
3         2013-01-01T10:00:00Z
4         2013-01-01T11:00:00Z
                  ...         
336771    2013-09-30T18:00:00Z
336772    2013-10-01T02:00:00Z
336773    2013-09-30T16:00:00Z
336774    2013-09-30T15:00:00Z
336775    2013-09-30T12:00:00Z
Name: time_hour, Length: 336776, dtype: object

In [42]:
# Parse the ISO-8601 strings in time_hour into proper datetime values.
# The trailing "Z" in the format is matched as a literal character, so the
# converted column carries no timezone information.
flights["time_hour"] = pd.to_datetime(
    flights["time_hour"],
    format="%Y-%m-%dT%H:%M:%SZ",
)

# 4.2. pandas basics
A DataFrame in pandas is a 2-dimensional data structure where each column can store a different data type. The index carries information about each row and the column names carry information about each column. The data are stored in the form of a collection of columns.

In [43]:
# A small made-up example that shows how a DataFrame is put together.
# Each dictionary key becomes a column name and its list holds that column's values;
# the index supplies one label per row.
df = pd.DataFrame(
    data={
        "time": [0] * 4,
        # Values stored in the same column do not need to share a type.
        "col1": [0, 0, 0, "nice"],
        "col2": ["a", "b", "b", "a"],
        "col4": ["alpha", "gamma", "gamma", "gamma"],
    },
    index=[f"row{i}" for i in range(4)],
)
df.head()

Unnamed: 0,time,col1,col2,col4
row0,0,0,a,alpha
row1,0,0,b,gamma
row2,0,0,b,gamma
row3,0,nice,a,gamma


# 4.3. Manipulating Rows in Data Frames 

In [44]:
import numpy as np

# Build a 6x6 grid holding the numbers 0..35 as floats, with rows labelled a-f
# and columns named col0..col5.
df = pd.DataFrame(
    data=np.reshape(range(36), (6, 6)),
    index=list("abcdef"),
    columns=[f"col{i}" for i in range(6)],
    dtype=float,
)
# Add a text column so the frame mixes numeric and string data.
df["col6"] = ["apple", "orange", "pineapple", "mango", "kiwi", "lemon"]
df

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
a,0.0,1.0,2.0,3.0,4.0,5.0,apple
b,6.0,7.0,8.0,9.0,10.0,11.0,orange
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple
d,18.0,19.0,20.0,21.0,22.0,23.0,mango
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon


## 4.3.1. Accessing Rows

In [45]:
# Select rows by their index labels: df.loc[["row_name1", "row_name2"]]
df.loc[["a", "b"]]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
a,0.0,1.0,2.0,3.0,4.0,5.0,apple
b,6.0,7.0,8.0,9.0,10.0,11.0,orange


In [46]:
# Select a single row by its integer position; position 0 is the first row ("a").
df.iloc[0]

col0      0.0
col1      1.0
col2      2.0
col3      3.0
col4      4.0
col5      5.0
col6    apple
Name: a, dtype: object

In [47]:
# A list of integer positions selects several rows at once (here the first and third).
df.iloc[[0, 2]]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
a,0.0,1.0,2.0,3.0,4.0,5.0,apple
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple


In [48]:
# query() filters rows with a condition written as a string; conditions can be
# combined with "or" / "and".
df.query("col6=='kiwi' or col6 =='pineapple'")

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi


In [49]:
# Keep only the rows whose col0 value is strictly greater than 6.
df.query("col0>6")

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple
d,18.0,19.0,20.0,21.0,22.0,23.0,mango
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon


In [50]:
# Apply query() to the flights data: keep the flights scheduled on January 1st.
flights.query("month ==1 and day ==1")

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 10:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 10:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 10:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 10:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 11:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837,2013,1,1,2356.0,2359,-3.0,425.0,437,-12.0,B6,727,N588JB,JFK,BQN,186.0,1576,23,59,2013-01-02 04:00:00
838,2013,1,1,,1630,,,1815,,EV,4308,N18120,EWR,RDU,,416,16,30,2013-01-01 21:00:00
839,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35,2013-01-02 00:00:00
840,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0,2013-01-01 20:00:00


## 4.3.3. Re-arranging Rows
Sometimes, we want to re-order the rows of the data frame according to the values in a particular column. pandas allows us to do this using the function .sort_values(). If you provide more than one column name, each additional column will be used to break ties in the values of the preceding columns.

In [51]:
# Sort the flights chronologically: by year, then month, then day, and finally by
# departure time, with each later column breaking ties in the earlier ones.
flights.sort_values(by=["year", "month", "day", "dep_time"])

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 10:00:00
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 10:00:00
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 10:00:00
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 10:00:00
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 11:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111291,2013,12,31,,705,,,931,,UA,1729,,EWR,DEN,,1605,7,5,2013-12-31 12:00:00
111292,2013,12,31,,825,,,1029,,US,1831,,JFK,CLT,,541,8,25,2013-12-31 13:00:00
111293,2013,12,31,,1615,,,1800,,MQ,3301,N844MQ,LGA,RDU,,431,16,15,2013-12-31 21:00:00
111294,2013,12,31,,600,,,735,,UA,219,,EWR,ORD,,719,6,0,2013-12-31 11:00:00


In [52]:
# Use ascending=False to order the rows from the largest departure delay to the smallest.
flights.sort_values(by="dep_delay", ascending=False)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
7072,2013,1,9,641.0,900,1301.0,1242.0,1530,1272.0,HA,51,N384HA,JFK,HNL,640.0,4983,9,0,2013-01-09 14:00:00
235778,2013,6,15,1432.0,1935,1137.0,1607.0,2120,1127.0,MQ,3535,N504MQ,JFK,CMH,74.0,483,19,35,2013-06-15 23:00:00
8239,2013,1,10,1121.0,1635,1126.0,1239.0,1810,1109.0,MQ,3695,N517MQ,EWR,ORD,111.0,719,16,35,2013-01-10 21:00:00
327043,2013,9,20,1139.0,1845,1014.0,1457.0,2210,1007.0,AA,177,N338AA,JFK,SFO,354.0,2586,18,45,2013-09-20 22:00:00
270376,2013,7,22,845.0,1600,1005.0,1044.0,1815,989.0,MQ,3075,N665MQ,JFK,CVG,96.0,589,16,0,2013-07-22 20:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30 18:00:00
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-10-01 02:00:00
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30 16:00:00
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30 15:00:00


# 4.4 Manipulating Columns

## 4.4.1. Creating New Columns 

In [53]:
# Assigning a list to a new column name creates that column, one value per row.
df["new_column0"] = list(range(6))
df

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,new_column0
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5


In [54]:
# We can create more than one column at a time: every row gets 5 in new_column1
# and 6 in new_column2.
df[["new_column1", "new_column2"]] = [5, 6]
df

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,new_column0,new_column1,new_column2
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0,5,6
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1,5,6
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2,5,6
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3,5,6
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4,5,6
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5,5,6


In [55]:
# Very often a new column is the result of an operation on existing columns;
# the subtraction below is applied row by row.
df["new_column3"] = df["col0"] - df["new_column0"]
df

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,new_column0,new_column1,new_column2,new_column3
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0,5,6,0.0
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1,5,6,5.0
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2,5,6,10.0
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3,5,6,15.0
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4,5,6,20.0
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5,5,6,25.0


## 4.4.2 Accessing Columns


In [56]:
# Access a single column by passing its name as a string.
df["col0"]

a     0.0
b     6.0
c    12.0
d    18.0
e    24.0
f    30.0
Name: col0, dtype: float64

In [57]:
# To access several columns, pass a list of names instead of a single string.
df[["col0", "col1", "col2"]]

Unnamed: 0,col0,col1,col2
a,0.0,1.0,2.0
b,6.0,7.0,8.0
c,12.0,13.0,14.0
d,18.0,19.0,20.0
e,24.0,25.0,26.0
f,30.0,31.0,32.0


In [58]:
# To pick particular rows and columns at the same time, give .loc the row labels
# first and the column names second.
df.loc[["a", "b"], ["col0", "col1", "col2"]]

Unnamed: 0,col0,col1,col2
a,0.0,1.0,2.0
b,6.0,7.0,8.0


In [59]:
# We can also access columns by their position using .iloc: the ":" keeps every
# row and the list picks the first two columns.
df.iloc[:, [0, 1]]

Unnamed: 0,col0,col1
a,0.0,1.0
b,6.0,7.0
c,12.0,13.0
d,18.0,19.0
e,24.0,25.0
f,30.0,31.0


In [60]:
# Sometimes we want to select columns based on a pattern in their names. Here the
# task is to select the columns of df whose names start with the word "new".
starts_with_new = df.columns.str.startswith("new")  # one True/False per column

print("The list of columns")
print(df.columns)

print("The list of true and false values: ")
print(starts_with_new)
print("\n")

print("The selection from the data frame is: ")
# Passing the boolean mask to .loc keeps only the columns marked True.
df.loc[:, starts_with_new]

The list of columns
Index(['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'new_column0',
       'new_column1', 'new_column2', 'new_column3'],
      dtype='object')
The list of true and false values: 
[False False False False False False False  True  True  True  True]


The selection from the data frame is: 


Unnamed: 0,new_column0,new_column1,new_column2,new_column3
a,0,5,6,0.0
b,1,5,6,5.0
c,2,5,6,10.0
d,3,5,6,15.0
e,4,5,6,20.0
f,5,5,6,25.0


## 4.4.3. Renaming Columns
There are three ways that you can rename a column in Python. The first is to use the rename() function with an object called a dictionary. A dictionary in Python consists of curly brackets containing comma-separated pairs of values, where the first value maps into the second value.

In [61]:
# rename() takes a dictionary mapping each old column name to its new name and
# returns a renamed frame; df itself is not modified.
# Each old name must appear only once as a key: Python keeps only the last value
# when a dictionary literal repeats a key, so a duplicate entry is silently ignored.
df.rename(columns={"col3": "letters", "col4": "names", "col6": "fruit"})

Unnamed: 0,col0,col1,col2,letters,names,col5,fruit,new_column0,new_column1,new_column2,new_column3
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0,5,6,0.0
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1,5,6,5.0
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2,5,6,10.0
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3,5,6,15.0
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4,5,6,20.0
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5,5,6,25.0


The second method is for when you want to rename all of the columns. For that you simply set df.columns equal to the new set of columns that you'd like to have. 

In [62]:
# .str.capitalize() capitalizes every column name (col0 -> Col0); assigning the
# result back to df.columns renames all of the columns at once.
df.columns = df.columns.str.capitalize()
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,New_column0,New_column1,New_column2,New_column3
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0,5,6,0.0
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1,5,6,5.0
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2,5,6,10.0
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3,5,6,15.0
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4,5,6,20.0
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5,5,6,25.0


Finally, we might be interested in just replacing specific parts of the column names. In this case, we can use .str.replace(). As an example, let's add the word "Original" ahead of the original columns.

In [66]:
# .str.replace() returns the modified names rather than changing them in place,
# so the result has to be assigned back to df.columns.
df.columns = df.columns.str.replace("Col", "Original_column")
df

Unnamed: 0,Original_column0,Original_column1,Original_column2,Original_column3,Original_column4,Original_column5,Original_column6,New_column0,New_column1,New_column2,New_column3
a,0.0,1.0,2.0,3.0,4.0,5.0,apple,0,5,6,0.0
b,6.0,7.0,8.0,9.0,10.0,11.0,orange,1,5,6,5.0
c,12.0,13.0,14.0,15.0,16.0,17.0,pineapple,2,5,6,10.0
d,18.0,19.0,20.0,21.0,22.0,23.0,mango,3,5,6,15.0
e,24.0,25.0,26.0,27.0,28.0,29.0,kiwi,4,5,6,20.0
f,30.0,31.0,32.0,33.0,34.0,35.0,lemon,5,5,6,25.0


## 4.4.4. Re-ordering Columns 
The simplest way to re-order (all) columns is to create a new list of their names with them in the order that you'd like them. 

In [67]:
# Start again from a fresh 6x6 frame of floats (0..35) with rows a-f and
# columns col0..col5.
df = pd.DataFrame(
    data=np.reshape(range(36), (6, 6)),
    index=list("abcdef"),
    columns=[f"col{i}" for i in range(6)],
    dtype=float,
)
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
a,0.0,1.0,2.0,3.0,4.0,5.0
b,6.0,7.0,8.0,9.0,10.0,11.0
c,12.0,13.0,14.0,15.0,16.0,17.0
d,18.0,19.0,20.0,21.0,22.0,23.0
e,24.0,25.0,26.0,27.0,28.0,29.0
f,30.0,31.0,32.0,33.0,34.0,35.0


In [69]:
# Re-order the columns by selecting them in the desired order.
df = df[
    ["col5", "col3", "col1", "col4", "col2", "col0"]
]
df

Unnamed: 0,col5,col3,col1,col4,col2,col0
a,5.0,3.0,1.0,4.0,2.0,0.0
b,11.0,9.0,7.0,10.0,8.0,6.0
c,17.0,15.0,13.0,16.0,14.0,12.0
d,23.0,21.0,19.0,22.0,20.0,18.0
e,29.0,27.0,25.0,28.0,26.0,24.0
f,35.0,33.0,31.0,34.0,32.0,30.0


# 4.6 Grouping, changing the index, and applying summary statistics
pandas gets even more powerful when you add in the ability to work with groups. Creating groups will often mean a change of index. And because groups tend to imply an aggregation or pooling of data, they often go hand-in-hand with the application of a summary statistic. This can be a way to sort data.

## 4.6.1. Grouping and Aggregating
We use the .groupby() function, followed by selecting a column and applying a summary statistic via an aggregation (.agg()). The .agg() function always produces a new index because we have collapsed the information down to the group level, and the new index is made of those levels.

The key point to remember is: use .groupby() and .agg() when you want your groups to become the new index.

In [70]:
# Syntax: DataFrame.groupby("column_to_group_by")[["column_to_aggregate"]].aggregation_function()
# The result is the mean departure delay for each month. Notice that the index has
# changed: it is now month, where we originally had an index that was the row number.
flights.groupby("month")[["dep_delay"]].mean()

Unnamed: 0_level_0,dep_delay
month,Unnamed: 1_level_1
1,10.036665
2,10.816843
3,13.227076
4,13.938038
5,12.986859
6,20.846332
7,21.727787
8,12.61104
9,6.722476
10,6.243988


In [71]:
# Passing a list to .agg() computes several summary statistics for the column at once.
flights.groupby("month")["dep_delay"].agg(["mean", "max", "min"])

Unnamed: 0_level_0,mean,max,min
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10.036665,1301.0,-30.0
2,10.816843,853.0,-33.0
3,13.227076,911.0,-25.0
4,13.938038,960.0,-21.0
5,12.986859,878.0,-24.0
6,20.846332,1137.0,-21.0
7,21.727787,1005.0,-22.0
8,12.61104,520.0,-26.0
9,6.722476,1014.0,-24.0
10,6.243988,702.0,-25.0


In [72]:
# A dictionary passed to .agg() maps each column to its own list of statistics,
# so several columns can be summarised with several statistics in one call.
flights.groupby("month").agg(
    {
        "dep_delay": ["mean", "max"],
        "arr_delay": ["mean", "std"],
    }
)

Unnamed: 0_level_0,dep_delay,dep_delay,arr_delay,arr_delay
Unnamed: 0_level_1,mean,max,mean,std
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,10.036665,1301.0,6.129972,40.423898
2,10.816843,853.0,5.613019,39.528619
3,13.227076,911.0,5.807577,44.119192
4,13.938038,960.0,11.176063,47.491151
5,12.986859,878.0,3.521509,44.237613
6,20.846332,1137.0,16.48133,56.130866
7,21.727787,1005.0,16.711307,57.117088
8,12.61104,520.0,6.040652,42.595142
9,6.722476,1014.0,-4.018364,39.710309
10,6.243988,702.0,-0.167063,32.649858


In [74]:
# Named aggregation: each keyword is the name of an output column and its tuple
# gives (input column, statistic to apply).
# NOTE(review): "count" on dep_delay presumably skips rows with a missing delay,
# so count_flights would not include flights without a recorded departure - confirm.
flights.groupby(["month"]).agg(
    mean_delay=("dep_delay", "mean"),
    count_flights=("dep_delay", "count"),
)

Unnamed: 0_level_0,mean_delay,count_flights
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.036665,26483
2,10.816843,23690
3,13.227076,27973
4,13.938038,27662
5,12.986859,28233
6,20.846332,27234
7,21.727787,28485
8,12.61104,28841
9,6.722476,27122
10,6.243988,28653


## 4.6.2. Grouping by multiple variables

This is as simple as passing .groupby () a list representing multiple columns instead of a string representing a single column.

In [75]:
# Group by month and year together, then compute the mean departure delay and the
# number of recorded departure delays for every (month, year) pair.
month_year_delay = flights.groupby(["month", "year"]).agg(
    mean_delay=("dep_delay", "mean"),
    count_flights=("dep_delay", "count"),
)
month_year_delay

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_delay,count_flights
month,year,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2013,10.036665,26483
2,2013,10.816843,23690
3,2013,13.227076,27973
4,2013,13.938038,27662
5,2013,12.986859,28233
6,2013,20.846332,27234
7,2013,21.727787,28485
8,2013,12.61104,28841
9,2013,6.722476,27122
10,2013,6.243988,28653


This time, we have a multi-index, that is, an index with more than one column. That's because we asked for something with multiple groups, and the index tracks what's going on within each group: so we need more than one dimension of index to do this efficiently.

In [77]:
# reset_index() moves the grouping keys (month, year) out of the index and back
# into ordinary columns, leaving a plain 0, 1, 2, ... row index.
month_year_delay.reset_index()

Unnamed: 0,month,year,mean_delay,count_flights
0,1,2013,10.036665,26483
1,2,2013,10.816843,23690
2,3,2013,13.227076,27973
3,4,2013,13.938038,27662
4,5,2013,12.986859,28233
5,6,2013,20.846332,27234
6,7,2013,21.727787,28485
7,8,2013,12.61104,28841
8,9,2013,6.722476,27122
9,10,2013,6.243988,28653
