/
flights.py
83 lines (71 loc) · 2.91 KB
/
flights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
###############################################################################
# "Flights data" example
###############################################################################
# Objectives:
# To learn some of the most common operations used in pandas
# Sources:
# https://tomaugspurger.github.io/dplry-pandas.html
# https://github.com/hadley/nycflights13/blob/master/data-raw/airports.csv
# https://github.com/rich-iannone/so-many-pyspark-examples/blob/master/data-files/nycflights13.csv
###############################################################################
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Import required libraries
import pandas as pd
import numpy as np
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Read data
# Load the nycflights13 CSV into a DataFrame (path is relative to the working directory)
flights = pd.read_csv(filepath_or_buffer="../data/Extracted/Flights/nycflights13.csv")
# Inspect the table: (rows, columns) tuple, then the first five rows
flights.shape
flights.head(n=5)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Querying with specific constraints
# Three equivalent selections of the rows where month == 1 and day == 1.
# 1) Expression string evaluated by DataFrame.query
flights.query("month == 1 & day == 1")
# 2) Boolean mask built with attribute-style column access
flights.loc[(flights.month == 1) & (flights.day == 1)]
# 3) Boolean mask built with bracket-style column access
flights.loc[(flights["month"] == 1) & (flights["day"] == 1)]
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Querying by location
# Rows at integer positions 2 through 8 (the slice end is exclusive), all columns
flights.iloc[2:9, :]
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Sort by key
# Returns a sorted copy ordered by year, then month, then day; `flights` itself is unchanged
flights.sort_values(by=['year', 'month', 'day'])
# Column selection (not a sort): keep only the three date columns
flights.loc[:, ['year', 'month', 'day']]
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Getting the unique elements in a column (categories)
# Distinct tail numbers, in order of first appearance
flights["tailnum"].unique()
# Distinct (origin, dest) pairs; the first occurrence of each pair is kept
flights.loc[:, ['origin', 'dest']].drop_duplicates(keep='first')
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Adding derived columns
# gain: arrival delay minus departure delay
flights['gain'] = flights['arr_delay'] - flights['dep_delay']
# speed: distance per unit of air_time, scaled by 60
# (presumably miles per hour if air_time is in minutes -- TODO confirm units)
flights['speed'] = 60 * (flights['distance'] / flights['air_time'])
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Getting basic statistics
# Mean departure delay (Series.mean skips missing values by default)
flights["dep_delay"].mean()
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Random sampling
# Draw up to 10 distinct row labels. replace=False prevents the same flight
# from being picked twice (np.random.choice samples with replacement by
# default); the size is capped so a table with fewer than 10 rows still works.
selection = np.random.choice(flights.index, size=min(10, len(flights)), replace=False)
flights.loc[selection]
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Aggregated statistics per plane (grouped by tail number)
len(flights)
flights.head()
# Build the grouping once; every per-plane summary below reuses it.
by_plane = flights.groupby("tailnum")
# Per-plane means of the numeric columns. numeric_only=True keeps this working
# when the table also holds non-numeric columns, which recent pandas versions
# refuse to average.
planes = by_plane.mean(numeric_only=True)
len(planes)
planes.head()
# Summarise from the grouping, not from `planes`: aggregating the already
# averaged table collapses it to a single Series (which has no .query) and
# would count planes instead of flights per plane.
delay = by_plane.agg({
    "year": "count",  # non-null `year` entries per plane, i.e. its flight count
    "distance": "median",
    "arr_delay": "mean",
})
# Planes with more than 20 counted flights and a median distance below 2000
delay.query("year > 20 & distance < 2000")
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Aggregated statistics by calendar day
# Group the flights by date once and reuse the grouping below
daily = flights.groupby(by=['year', 'month', 'day'])
# Non-null `distance` entries per day
per_day = daily['distance'].count()
# Roll the daily counts up to (year, month) totals
per_month = per_day.groupby(level=['year', 'month']).sum()
# Days on which the mean arrival delay or the mean departure delay exceeded 30
daily[['arr_delay', 'dep_delay']].mean().query('arr_delay > 30 | dep_delay > 30')