In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Load the UFO sightings CSV into a DataFrame
ufo_df = pd.read_csv('Resources/ufoSightings.csv', low_memory=False)

# Keep only the rows in which every column has a value
clean_ufo_df = ufo_df.dropna(how="any")

# Cast "duration (seconds)" to float on a new DataFrame;
# assign() returns a new object, so clean_ufo_df is left untouched
duration_as_float = clean_ufo_df["duration (seconds)"].astype(float)
converted_ufo_df = clean_ufo_df.assign(**{"duration (seconds)": duration_as_float})

# Preview the first rows of the converted DataFrame
converted_ufo_df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
3,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889
7,10/10/1965 23:45,norwalk,ct,us,disk,1200.0,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175,-73.408333


In [3]:
# Select the column with double brackets so that the aggregation returns a DataFrame.
duration_by_shape = converted_ufo_df.groupby("shape")[["duration (seconds)"]]
grouped_ufo_duration_shape = duration_by_shape.mean()
grouped_ufo_duration_shape.head(10)

Unnamed: 0_level_0,duration (seconds)
shape,Unnamed: 1_level_1
changed,3600.0
changing,2111.616031
chevron,472.417782
cigar,2148.050379
circle,3650.816269
cone,1643.70428
crescent,37800.0
cross,752.025381
cylinder,3954.055607
delta,2307.857143


In [4]:
# The agg() function can be used for aggregation. 


Unnamed: 0_level_0,duration (seconds)
shape,Unnamed: 1_level_1
changed,3600.0
changing,2111.616031
chevron,472.417782
cigar,2148.050379
circle,3650.816269
cone,1643.70428
crescent,37800.0
cross,752.025381
cylinder,3954.055607
delta,2307.857143


### Multiple Aggregations

In [5]:
# The agg() function can be used to pass more than one aggregation.


Unnamed: 0_level_0,duration (seconds),duration (seconds)
Unnamed: 0_level_1,mean,sum
shape,Unnamed: 1_level_2,Unnamed: 2_level_2
changed,3600.0,3600.0
changing,2111.616031,3490501.3
chevron,472.417782,402499.95
cigar,2148.050379,3688202.5
circle,3650.816269,23383478.2
cone,1643.70428,422432.0
crescent,37800.0,37800.0
cross,752.025381,148149.0
cylinder,3954.055607,4266426.0
delta,2307.857143,16155.0


### Grouping on Multiple Columns with Multiple Aggregations

In [6]:
# It is also possible to group a DataFrame by multiple columns
# This returns an object with multiple indices, however, which can be harder to deal with.
# Get the average duration in seconds of UFOs by Country and State.


# Display the DataFrame.


Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds)
country,state,Unnamed: 2_level_1
au,al,900.0
au,dc,300.0
au,nt,180.0
au,oh,180.0
au,sa,152.5
au,wa,225.0
au,yt,30.0
ca,ab,1869.697183
ca,bc,948.236071
ca,mb,1291.387097


In [7]:
# The agg() function can be used to pass more than one aggregation.


Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds),duration (seconds),duration (seconds)
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,sum
country,state,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
au,al,1,900.0,900.0
au,dc,1,300.0,300.0
au,nt,2,180.0,360.0
au,oh,1,180.0,180.0
au,sa,2,152.5,305.0
au,wa,2,225.0,450.0
au,yt,1,30.0,30.0
ca,ab,284,1869.697183,530994.0
ca,bc,677,948.236071,641955.82
ca,mb,124,1291.387097,160132.0


### Flattening Multi-Indexed Columns to Single Columns

In [8]:
# One method of flattening the MultiIndex columns to a single level: call to_flat_index() on the columns.



# Get the columns after applying to_flat_index()


# Display the columns


Index([('duration (seconds)', 'count'),  ('duration (seconds)', 'mean'),
         ('duration (seconds)', 'sum')],
      dtype='object')

In [9]:
# Use a list comprehension to join each column's tuple into a single name.


# Display the flattened DataFrame


Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds)_count,duration (seconds)_mean,duration (seconds)_sum
country,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
au,al,1,900.000000,900.00
au,dc,1,300.000000,300.00
au,nt,2,180.000000,360.00
au,oh,1,180.000000,180.00
au,sa,2,152.500000,305.00
...,...,...,...,...
us,vt,254,1042.462598,264785.50
us,wa,3707,15273.474357,56618769.44
us,wi,1205,1928.422656,2323749.30
us,wv,438,6791.901826,2974853.00


In [10]:
# The second method for flattening the MultiIndex to one column.
# Get the first level of the multi-index


# Get the second level of the multi-index


Index(['duration (seconds)', 'duration (seconds)', 'duration (seconds)'], dtype='object')
Index(['count', 'mean', 'sum'], dtype='object')


In [11]:
# Combine the levels and display the DataFrame. 


Unnamed: 0_level_0,Unnamed: 1_level_0,duration (seconds)_count,duration (seconds)_mean,duration (seconds)_sum
country,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
au,al,1,900.000000,900.00
au,dc,1,300.000000,300.00
au,nt,2,180.000000,360.00
au,oh,1,180.000000,180.00
au,sa,2,152.500000,305.00
...,...,...,...,...
us,vt,254,1042.462598,264785.50
us,wa,3707,15273.474357,56618769.44
us,wi,1205,1928.422656,2323749.30
us,wv,438,6791.901826,2974853.00


In [12]:
# Get the new column names to rename the columns


Index(['duration (seconds)_count', 'duration (seconds)_mean',
       'duration (seconds)_sum'],
      dtype='object')

In [13]:
# Rename the columns in a multi-index DataFrame



Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Sightings,Avg Duration(seconds),Total Duration(seconds)
country,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
au,al,1,900.0,900.0
au,dc,1,300.0,300.0
au,nt,2,180.0,360.0
au,oh,1,180.0,180.0
au,sa,2,152.5,305.0
au,wa,2,225.0,450.0
au,yt,1,30.0,30.0
ca,ab,284,1869.697183,530994.0
ca,bc,677,948.236071,641955.82
ca,mb,124,1291.387097,160132.0
