In [1]:
import pandas as pd
import numpy as np
import psutil, os
from dask import delayed
import matplotlib.pyplot as plt

In [2]:
# Load the on-time flight report from the working directory into one DataFrame: df
df = pd.read_csv(filepath_or_buffer='960906061_T_ONTIME_REPORTING.csv')

In [3]:
# Summarise the loaded frame: index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607346 entries, 0 to 607345
Data columns (total 38 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   YEAR                   607346 non-null  int64  
 1   QUARTER                607346 non-null  int64  
 2   MONTH                  607346 non-null  int64  
 3   DAY_OF_MONTH           607346 non-null  int64  
 4   DAY_OF_WEEK            607346 non-null  int64  
 5   FL_DATE                607346 non-null  object 
 6   OP_UNIQUE_CARRIER      607346 non-null  object 
 7   OP_CARRIER_AIRLINE_ID  607346 non-null  int64  
 8   OP_CARRIER             607346 non-null  object 
 9   TAIL_NUM               606648 non-null  object 
 10  OP_CARRIER_FL_NUM      607346 non-null  int64  
 11  ORIGIN_AIRPORT_ID      607346 non-null  int64  
 12  ORIGIN_AIRPORT_SEQ_ID  607346 non-null  int64  
 13  ORIGIN_CITY_MARKET_ID  607346 non-null  int64  
 14  ORIGIN                 607346 non-nu

In [4]:
# Preview the first five rows of the frame
df.head(5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,...,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,Unnamed: 37
0,2020,1,1,1,3,2020-01-01,WN,19393,WN,N951WN,...,1810,1851.0,41.0,41.0,1.0,2.0,1800-1859,44.0,1935.0,
1,2020,1,1,1,3,2020-01-01,WN,19393,WN,N467WN,...,1150,1146.0,-4.0,0.0,0.0,-1.0,1100-1159,13.0,1159.0,
2,2020,1,1,1,3,2020-01-01,WN,19393,WN,N7885A,...,2020,2016.0,-4.0,0.0,0.0,-1.0,2000-2059,8.0,2024.0,
3,2020,1,1,1,3,2020-01-01,WN,19393,WN,N551WN,...,1340,1350.0,10.0,10.0,0.0,0.0,1300-1359,10.0,1400.0,
4,2020,1,1,1,3,2020-01-01,WN,19393,WN,N968WN,...,915,916.0,1.0,1.0,0.0,0.0,0900-0959,6.0,922.0,


In [5]:
# Shallow size of the DataFrame in MiB. With the default deep=False, object
# columns are measured by their references only, not by the Python strings
# they point to, so this understates the true footprint of string columns.
df.memory_usage().sum() / (2 ** 20)

176.08004760742188

In [6]:
# 1000 x 1000 array of uniform random floats in [0, 1): celsius
# NOTE(review): no seed is set in the visible cells, so values differ per run
celsius = np.random.random_sample(size=(1000, 1000))

In [7]:
# Display the array; NumPy abbreviates large arrays with "..." in the repr
celsius

array([[0.8218417 , 0.46873945, 0.5164466 , ..., 0.74618296, 0.34510967,
        0.65692463],
       [0.945834  , 0.48057701, 0.54548152, ..., 0.0689557 , 0.12938433,
        0.25761851],
       [0.46879176, 0.06588557, 0.89647648, ..., 0.84712116, 0.8743944 ,
        0.51592643],
       ...,
       [0.14269983, 0.60451097, 0.43414542, ..., 0.44518526, 0.46215587,
        0.33331249],
       [0.00459644, 0.85779986, 0.28066126, ..., 0.07300285, 0.1970744 ,
        0.8437099 ],
       [0.66897764, 0.96386313, 0.28092108, ..., 0.92076278, 0.14165136,
        0.556019  ]])

In [8]:
def memory_footprint():
    """Return the resident set size (RSS) of the current process in MiB."""
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    return rss_bytes / 1024**2

In [9]:
# Print the size of the celsius array's data buffer in MiB (bytes / 1024**2)
print(celsius.nbytes / (1024 ** 2))

# Record the process footprint (RSS, MiB) before the conversion: before
before = memory_footprint()

# Convert celsius by multiplying by 9/5 and adding 32: fahrenheit
# Evaluated left to right as ((celsius * 9) / 5) + 32, producing a new array
fahrenheit = celsius * 9/5 +32

# Record the process footprint again once fahrenheit exists: after
after = memory_footprint()

# Print the change in process footprint across the conversion, in MiB.
# NOTE(review): this measures RSS of the whole process, not the size of
# fahrenheit; it can be smaller than the new array (the recorded run shows 0.0),
# presumably because already-mapped memory was reused -- confirm before relying on it.
print(after - before)

7.62939453125
0.0


In [10]:
# Stream './WDI_csv/WDIData.csv' in 1000-row chunks so the whole file is
# never held in memory at once
wdi_chunks = pd.read_csv('./WDI_csv/WDIData.csv', chunksize=1000)

# One filtered frame per chunk is collected here (empty frames included): dfs
dfs = []

for chunk in wdi_chunks:
    # Rows belonging to Australia
    is_AUS = chunk['Country Code'].eq('AUS')
    # Rows whose indicator name mentions 'Urban'
    contains_urban = chunk['Indicator Name'].str.contains('Urban')

    # Keep only rows satisfying both conditions: filtered
    filtered = chunk.loc[contains_urban & is_AUS]
    dfs.append(filtered)

In [11]:
# Print the number of filtered chunks collected in dfs
print(len(dfs))

# Concatenate the filtered chunks into a single DataFrame: df
# NOTE: this rebinds df, which earlier in the notebook held the flights data
df = pd.concat(dfs)

# Print the combined DataFrame itself (not its length)
print(df)


378
      Country Name Country Code  \
82966    Australia          AUS   
82967    Australia          AUS   
82968    Australia          AUS   
82969    Australia          AUS   
82970    Australia          AUS   
82971    Australia          AUS   
82972    Australia          AUS   

                                          Indicator Name     Indicator Code  \
82966                           Urban land area (sq. km)  AG.LND.TOTL.UR.K2   
82967  Urban land area where elevation is below 5 met...  AG.LND.EL5M.UR.ZS   
82968  Urban land area where elevation is below 5 met...  AG.LND.EL5M.UR.K2   
82969                                   Urban population        SP.URB.TOTL   
82970           Urban population (% of total population)  SP.URB.TOTL.IN.ZS   
82971                 Urban population growth (annual %)        SP.URB.GROW   
82972  Urban population living in areas where elevati...  EN.POP.EL5M.UR.ZS   

              1960          1961          1962          1963          1964  \
8296

In [13]:
# Define function with single input called df: pct_delayed
def pct_delayed(df):
    """Return the percentage of rows in df whose DEP_DELAY is strictly positive.

    Rows with a missing DEP_DELAY compare as not delayed but still count
    toward the total number of rows.
    """
    late_mask = df['DEP_DELAY'] > 0
    total_rows = len(df)
    return 100 * late_mask.sum() / total_rows

In [14]:
# Lazily read each listed CSV only when it is consumed: dataframes
dataframes = (pd.read_csv(path) for path in ['960906061_T_ONTIME_REPORTING.csv'])

# Percentage of delayed flights for each file, in file order: monthly_delayed
# NOTE: relies on the eager single-DataFrame pct_delayed; a later cell rebinds
# that name to a two-argument dask task, so run this cell before that one.
monthly_delayed = list(map(pct_delayed, dataframes))

# Plot left disabled.
# NOTE(review): it assumes twelve monthly values (x = 1..12), while only one
# file is listed above -- presumably why it is commented out; confirm.
# x = range(1,13)
# plt.plot(x, monthly_delayed, marker='o', linewidth=0)
# plt.ylabel('% Delayed')
# plt.xlabel('Month - 2016')
# plt.xlim((1,12))
# plt.ylim((0,100))
# plt.show()

In [15]:
# Define count_flights
@delayed
def count_flights(df):
    """Lazily count the rows (flights) in df."""
    n_rows = len(df)
    return n_rows

# Define count_delayed
@delayed
def count_delayed(df):
    """Lazily count the rows of df whose DEP_DELAY is strictly positive."""
    late_mask = df['DEP_DELAY'] > 0
    return late_mask.sum()

# Define pct_delayed
# NOTE: this rebinds the name pct_delayed, replacing the eager
# single-DataFrame function defined earlier in this notebook.
@delayed
def pct_delayed(n_delayed, n_flights):
    """Lazily compute the overall percentage of delayed flights.

    n_delayed and n_flights are iterables of per-file counts. Each is summed
    before dividing, so every flight carries equal weight in the result.
    """
    total_delayed = sum(n_delayed)
    total_flights = sum(n_flights)
    return 100 * total_delayed / total_flights

In [16]:
@delayed
def read_one(filename):
    """Lazily read the CSV at filename into a DataFrame."""
    frame = pd.read_csv(filename)
    return frame

In [17]:
# Per-file lazy counts, one entry per input file
n_delayed, n_flights = [], []

# Build one read task per file and hang both count tasks off it
for file in ['960906061_T_ONTIME_REPORTING.csv']:
    # read_one is wrapped in dask's delayed, so df is a lazy placeholder here,
    # not a loaded DataFrame
    df = read_one(file)

    n_flights.append(count_flights(df))
    n_delayed.append(count_delayed(df))

# Combine the per-file counts into a single lazy percentage: result
result = pct_delayed(n_delayed, n_flights)

# Execute the task graph and print the resulting percentage
print(result.compute())

27.57801977785315
