# LinkedIn Learning
# Faster Pandas

## 1. Overview

### A. Measuring Performance 

In [2]:
def find_outliers(data):
    # Find Outliers in data, return indices of outliers
    out = data[(data - data.mean()).abs() > 2 * data.std()]
    return out.index

In [3]:
import numpy as np
import pandas as pd

In [5]:
data = pd.Series(np.random.randint(50,60,10_000))

In [6]:
data[7] = 3

In [7]:
data[1003] = 100

In [8]:
find_outliers(data)

Index([7, 1003], dtype='int64')

In [9]:
%timeit find_outliers(data)

458 μs ± 39.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%timeit?

[1;31mDocstring:[0m
Time execution of a Python statement or expression

Usage, in line mode:
  %timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] statement
or in cell mode:
  %%timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] setup_code
  code
  code...

Time execution of a Python statement or expression using the timeit
module.  This function can be used both as a line and cell magic:

- In line mode you can time a single-line statement (though multiple
  ones can be chained with using semicolons).

- In cell mode, the statement in the first line is used as setup code
  (executed but not timed) and the body of the cell is timed.  The cell
  body has access to any variables created in the setup code.

Options:
-n<N>: execute the given statement <N> times in a loop. If <N> is not
provided, <N> is determined so as to get sufficient accuracy.

-r<R>: number of repeats <R>, each consisting of <N> loops, and take the
average result.
Default: 7

-t: use time.time to measure the time, which is the default o

In [11]:
!pip install pytest-benchmark

Collecting pytest-benchmark
  Downloading pytest_benchmark-5.1.0-py3-none-any.whl.metadata (25 kB)
Collecting pytest>=8.1 (from pytest-benchmark)
  Downloading pytest-8.3.4-py3-none-any.whl.metadata (7.5 kB)
Collecting pluggy<2,>=1.5 (from pytest>=8.1->pytest-benchmark)
  Downloading pluggy-1.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading pytest_benchmark-5.1.0-py3-none-any.whl (44 kB)
Downloading pytest-8.3.4-py3-none-any.whl (343 kB)
Downloading pluggy-1.5.0-py3-none-any.whl (20 kB)
Installing collected packages: pluggy, pytest, pytest-benchmark
  Attempting uninstall: pluggy
    Found existing installation: pluggy 1.0.0
    Uninstalling pluggy-1.0.0:
      Successfully uninstalled pluggy-1.0.0
  Attempting uninstall: pytest
    Found existing installation: pytest 7.4.4
    Uninstalling pytest-7.4.4:
      Successfully uninstalled pytest-7.4.4
Successfully installed pluggy-1.5.0 pytest-8.3.4 pytest-benchmark-5.1.0



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


import pandas as pd
import numpy as np

In [37]:
def gen_data(size, num_outliers):
    # Generate data in with size element containing num_outliers outliers/.
    # returns the data and outliers
    regular = np.random.randint(50, 60, size - num_outliers)
    low = np.random.randint(1, 10, num_outliers//2)
    high = np.random.randint(100,110, num_outliers - len(low))
    data = np.concatenate([regular, low, high])
    np.random.shuffle(data)
    return pd.Series(data), pd.Series(np.concatenate([low, high]))

In [38]:
def test_bench_outlier(benchmark):
    size = 10_000 # Usual size of data
    num_outliers = 5 # Usual number of outiers
    data, expected = gen_data(size, num_outliers)
    indices = benchmark(OUTLIERS, data)
    outliers = data.loc[indices]
    assert set(expected) == set(outliers), 'bad result'

In [49]:
!python -m pytest

platform win32 -- Python 3.12.4, pytest-8.3.4, pluggy-1.5.0
benchmark: 5.1.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
rootdir: C:\Users\Advait\Desktop\LinkedIn Learning HSBC\Faster Pandas
plugins: anyio-4.2.0, benchmark-5.1.0
collected 1 item

test_outliers.py [32m.[0m[32m                                                       [100%][0m


[33m---------------------------------------------------- benchmark: 1 tests ----------------------------------------------------[0m
Name (time in us)           Min         Max      Mean   StdDev    Median     IQR  Outliers  OPS (Kops/s)  Rounds  Iterations
[33m----------------------------------------------------------------------------------------------------------------------------[0m
test_bench_outlier   [1m  259.4000[0m[1m  1,083.9000[0m[1m  283.1203[0m[1m  52.6713[0m[1m  270.6500[0m[1m  6.2000[0m    61;161[1m  

### B. Profiling

In [50]:
import re 

def stem(word):
    # Return stem of word
    # stem('working') --> work, stem('works') --> work
    return re.sub(r'(sling)$', '', word)

def tokenize(text):
    # Split text to words, ignoring stop words
    tokens = []
    for tok in re.findall('[a-zA-Z]+', text):
        tok = tok.lower()
        tok = stem(tok)
        if tok not in stop_words:
            tokens.append(tok)
    return tokens

stop_words = {
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been',
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your',
}

In [52]:
%run nlp.py

s = 'We will encourage you to develop the three great virtues of a programmer: laziness, impatience, and hubris'

tokenize(s)

['encourage',
 'develop',
 'three',
 'great',
 'virtues',
 'programmer',
 'laziness',
 'impatience',
 'hubris']

In [53]:
%prun tokenize(s)

 

         203 function calls (201 primitive calls) in 0.002 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001    0.001    0.001 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
        1    0.000    0.000    0.002    0.002 {built-in method select.select}
        1    0.000    0.000    0.002    0.002 windows_events.py:761(_poll)
        1    0.000    0.000    0.000    0.000 attrsettr.py:65(_get_attr_opt)
        1    0.000    0.000    0.000    0.000 nlp.py:8(tokenize)
        1    0.000    0.000    0.000    0.000 attrsettr.py:42(__getattr__)
    37/35    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
        1    0.000    0.000    0.002    0.002 base_events.py:1910(_run_once)
       18    0.000    0.000    0.000    0.000 __init__.py:280(_compile)
       17    0.000    0.000    0.000    0.000 __init__

In [55]:
%%prun 

for _ in range(10_000):
    tokenize(s)

 

         1160748 function calls (1160727 primitive calls) in 0.780 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    10000    0.168    0.000    0.760    0.000 nlp.py:8(tokenize)
   170000    0.114    0.000    0.367    0.000 __init__.py:179(sub)
   180000    0.108    0.000    0.169    0.000 __init__.py:280(_compile)
   170000    0.095    0.000    0.095    0.000 {method 'sub' of 're.Pattern' objects}
   170000    0.072    0.000    0.439    0.000 nlp.py:3(stem)
    10000    0.064    0.000    0.064    0.000 {method 'findall' of 're.Pattern' objects}
180145/180141    0.060    0.000    0.060    0.000 {built-in method builtins.isinstance}
   170000    0.048    0.000    0.048    0.000 {method 'lower' of 'str' objects}
    90005    0.024    0.000    0.024    0.000 {method 'append' of 'list' objects}
    10000    0.008    0.000    0.082    0.000 __init__.py:209(findall)
      6/3    0.007    0.001    0.235    0.078 {method 'run' of

### C. Challege 1 --> Identify Bottleneck 

In [56]:
import pandas as pd


def second(values):
    """Return second highest value

    >>> second([1, 7, 9, 3, 5])
    7
    """
    top, second = -1, -1
    for value in values:
        if value > top:
            top, second = value, top
        elif value > second:
            second = value
    return second


def median_diff(csv_file):
    df = pd.read_csv(csv_file)
    top1 = df.groupby('id')['price'].max()
    top2 = df.groupby('id')['price'].apply(second)
    diffs = top1 - top2
    return diffs.median()

In [58]:
median_diff('bids.csv')

0.013655841283122139

In [60]:
%prun -s cumulative median_diff('bids.csv')

 

         483288 function calls (483238 primitive calls) in 0.899 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.000    0.000    0.898    0.898 {built-in method builtins.exec}
        1    0.001    0.001    0.898    0.898 <string>:1(<module>)
        1    0.002    0.002    0.618    0.618 4240894245.py:19(median_diff)
        1    0.000    0.000    0.582    0.582 generic.py:224(apply)
        1    0.000    0.000    0.582    0.582 groupby.py:1780(apply)
        1    0.001    0.001    0.582    0.582 groupby.py:1850(_python_apply_general)
        1    0.036    0.036    0.580    0.580 ops.py:897(apply_groupwise)
    10000    0.013    0.000    0.319    0.000 ops.py:1149(__iter__)
    10000    0.026    0.000    0.297    0.000 ops.py:1171(_chop)
        1    0.000    0.000    0.279    0.279 readers.py:868(read_csv)
        1    0.002    0.002    0.183    0.183 readers.py:583(_read)
        1    0.000    0.000    0.

## 2. Vectorization

### A. What is Vectorization?

In [61]:
import pandas as pd

s = pd.Series(range(10_000))

In [63]:
%%timeit
total = 0
for val in s:
    total += val

1.52 ms ± 119 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [65]:
%timeit s.sum()

16 μs ± 1.98 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [66]:
1520 / 16

95.0

### B. Boolean Indexing

In [68]:
df = pd.read_csv('cart.csv')

In [69]:
df

Unnamed: 0,Customer,Item,Amount,Item Price
0,Rick,Wine,20,103.2
1,Morty,Almond Milk,1,10.04
2,Summer,Ice Cream,1,8.32
3,Beth,Comb,1,7.3
4,Jerry,Tequila,2,20.34


In [70]:
df['Item Price'] > 10

0     True
1     True
2    False
3    False
4     True
Name: Item Price, dtype: bool

In [71]:
mask = df['Item Price'] > 10

In [72]:
df[mask]

Unnamed: 0,Customer,Item,Amount,Item Price
0,Rick,Wine,20,103.2
1,Morty,Almond Milk,1,10.04
4,Jerry,Tequila,2,20.34


In [73]:
df[df['Item Price'] > 10]

Unnamed: 0,Customer,Item,Amount,Item Price
0,Rick,Wine,20,103.2
1,Morty,Almond Milk,1,10.04
4,Jerry,Tequila,2,20.34


In [74]:
import sqlite3

In [77]:
conn = sqlite3.connect('logs_1.db', detect_types = sqlite3.PARSE_DECLTYPES)

In [80]:
df = pd.read_sql('SELECT * FROM logs', conn)

In [81]:
len(df)

10000

In [82]:
%%timeit
total = 0
for _,row in df.iterrows():
    if row['status_code'] >= 400:
        total += 1

595 ms ± 60.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [83]:
%timeit len(df[df['status_code'] >= 400])

386 μs ± 63 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [84]:
5_950_000/386

15414.507772020725

### B. Understanding ufuncs (Universal Functions)

In [85]:
s = pd.Series(range(10_000))

In [87]:
%timeit max(s)

1.39 ms ± 117 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [88]:
%timeit s.max()

24.1 μs ± 4.13 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [89]:
1390 / 24.1

57.676348547717836

### C. Challenge 2 --> Selecting and Manipulating Data

In [90]:
"""Find last time we had error in logs"""

import sqlite3
from contextlib import closing

import pandas as pd


def last_error_time(df):
    """Find last time there's an error in df"""
    last_time = None
    for _, row in df.iterrows():
        if row['status_code'] < 400:
            continue
        if not last_time or row['time'] > last_time:
            last_time = row['time']
    return last_time


def load_df(db_file):
    """Load DataFrame from database"""
    conn = sqlite3.connect(db_file, detect_types=sqlite3.PARSE_DECLTYPES)
    with closing(conn):
        return pd.read_sql('SELECT * FROM logs', conn)

In [92]:
df = load_df('logs_2.db')

In [93]:
df

Unnamed: 0,time,origin,method,path,status_code,size
0,1995-08-01 00:00:01,in24.inetnebr.com,GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,200,1839
1,1995-08-01 00:00:07,uplherc.upl.com,GET,/,304,0
2,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/ksclogo-medium.gif,304,0
3,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/MOSAIC-logosmall.gif,304,0
4,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/USA-logosmall.gif,304,0
...,...,...,...,...,...,...
9995,1995-08-01 07:54:40,ppp-14.flashnet.it,GET,/images/USA-logosmall.gif,200,234
9996,1995-08-01 07:54:41,204.238.216.51,GET,/images/ksclogo-medium.gif,304,0
9997,1995-08-01 07:54:42,204.238.216.51,GET,/images/USA-logosmall.gif,304,0
9998,1995-08-01 07:54:42,204.238.216.51,GET,/images/MOSAIC-logosmall.gif,304,0


In [94]:
%timeit last_error_time(df)

577 ms ± 28.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [95]:
"""Find last time we had error in logs"""

import sqlite3
from contextlib import closing

import pandas as pd


def last_error_time(df):
    """Find last time there's an error in df"""
    return df[df['status_code'] >= 400]['time'].max()


def load_df(db_file):
    """Load DataFrame from database"""
    conn = sqlite3.connect(db_file, detect_types=sqlite3.PARSE_DECLTYPES)
    with closing(conn):
        return pd.read_sql('SELECT * FROM logs', conn)

In [96]:
df_2 = load_df('logs_2.db')

In [97]:
df_2

Unnamed: 0,time,origin,method,path,status_code,size
0,1995-08-01 00:00:01,in24.inetnebr.com,GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,200,1839
1,1995-08-01 00:00:07,uplherc.upl.com,GET,/,304,0
2,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/ksclogo-medium.gif,304,0
3,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/MOSAIC-logosmall.gif,304,0
4,1995-08-01 00:00:08,uplherc.upl.com,GET,/images/USA-logosmall.gif,304,0
...,...,...,...,...,...,...
9995,1995-08-01 07:54:40,ppp-14.flashnet.it,GET,/images/USA-logosmall.gif,200,234
9996,1995-08-01 07:54:41,204.238.216.51,GET,/images/ksclogo-medium.gif,304,0
9997,1995-08-01 07:54:42,204.238.216.51,GET,/images/USA-logosmall.gif,304,0
9998,1995-08-01 07:54:42,204.238.216.51,GET,/images/MOSAIC-logosmall.gif,304,0


In [98]:
%timeit last_error_time(df_2)

537 μs ± 32.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### 3. Common Mistakes

### A. The Limitations of Appending

In [99]:
"""Parse log lines"""

from datetime import datetime


def parse_time(ts):
    # [02/Jul/1995:16:30:08 -0400]
    time = datetime.strptime(ts, '[%d/%b/%Y:%H:%M:%S %z]')
    return time.replace(tzinfo=None)  # Remove time zone


def parse_line(line):
    fields = line.split()
    size = 0 if fields[-1] == '-' else int(fields[-1])
    return {
        'origin': fields[0],
        'time': parse_time(fields[3] + ' ' + fields[4]),
        'method': fields[5][1:],  # Remove leading "
        'path': fields[6],
        'status_code': int(fields[-2]),
        'size': size,
    }

In [100]:
import lzma

In [103]:
with lzma.open('log.txt.xz', 'rt') as fp:
    lines = fp.readlines()

In [104]:
len(lines)

1000

In [105]:
df = pd.DataFrame()

In [107]:
for line in lines:
    df = pd.concat([df, pd.DataFrame([parse_line(line)])], ignore_index=True)

In [109]:
%%timeit
df = pd.DataFrame()
for line in lines:
    df = pd.concat([df, pd.DataFrame([parse_line(line)])], ignore_index=True)

963 ms ± 89.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [110]:
df = pd.DataFrame.from_records(parse_line(line) for line in lines)

In [111]:
%%timeit
df = pd.DataFrame.from_records(parse_line(line) for line in lines)

26.9 ms ± 2.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [112]:
963 / 26.9

35.79925650557621

### B. The limitations of object type

In [113]:
df = pd.read_csv('logs_1.csv.xz')

In [114]:
df.dtypes

origin         object
date           object
time           object
method         object
path           object
status_code     int64
size            int64
dtype: object

In [115]:
df[['date', 'time']].head()

Unnamed: 0,date,time
0,1995-07-01,00:00:01
1,1995-07-01,00:00:06
2,1995-07-01,00:00:09
3,1995-07-01,00:00:11
4,1995-07-01,00:00:11


In [116]:
df['time'].nunique()

6533

In [117]:
%timeit df['time'].nunique()

1.44 ms ± 99.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [118]:
df['time_ts'] = pd.to_datetime(df['time'])

  df['time_ts'] = pd.to_datetime(df['time'])


In [120]:
%timeit df['time_ts'].nunique()

343 μs ± 42.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [121]:
1440 / 343

4.198250728862973

### C. The limitations of Row Iterations

In [122]:
size = 50_000

df = pd.DataFrame({
    'a' : np.random.randint(1, 1000, size),
    'b' : np.random.randint(1, 1000, size),
    'c' : np.random.randint(1, 1000, size)
})

In [123]:
total = 0

for _, row in df.iterrows():
    total += row.max()

In [124]:
total

37475182

In [126]:
%%time
total = 0

for _, row in df.iterrows():
    total += row.max()

CPU times: total: 2.64 s
Wall time: 2.77 s


In [127]:
df.apply(np.max, axis=1).sum()

37475182

In [131]:
%%time 
df.apply(np.max, axis=1).sum()

CPU times: total: 1.2 s
Wall time: 1.25 s


37475182

In [129]:
df.apply(np.max, axis=1, raw = True).sum()

37475182

In [130]:
%time df.apply(np.max, axis=1, raw = True).sum()

CPU times: total: 375 ms
Wall time: 398 ms


37475182

In [132]:
2640 / 398

6.633165829145729

### D. Understanding the isin function

In [133]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('logs_3.db', detect_types = sqlite3.PARSE_DECLTYPES)

df = pd.read_sql('SELECT * FROM logs', conn)

In [134]:
import lzma

In [135]:
with lzma.open('ips.txt.xz', 'rt') as fp:
    ips = [line.strip() for line in fp]

In [136]:
len(ips)

100000

In [137]:
%timeit df['origin'].isin(ips)

27.1 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [138]:
type(ips)

list

In [139]:
ip = df['origin'][3923]

In [140]:
%timeit ip in ips

1.86 ms ± 24.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [141]:
ips_set = set(ips)

In [142]:
%timeit ip in ips_set

43.1 ns ± 2.76 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [143]:
1860000 / 43.1

43155.45243619489

In [144]:
%timeit df['origin'].isin(ips_set)

51.2 ms ± 1.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [146]:
%timeit df['origin'].apply(lambda v : v in ips_set)

2.52 ms ± 356 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [147]:
ip in ips_set

False

In [148]:
ips_set.__contains__(ip)

False

In [150]:
is_bad = ips_set.__contains__

In [151]:
%timeit df['origin'].apply(is_bad)

1.93 ms ± 240 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [152]:
27.1 / 1.93

14.041450777202074

### E. Parsing time once

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('logs_2.csv.xz')

In [3]:
len(df)

50000

In [4]:
df.sample(5)

Unnamed: 0,origin,time,method,path,status_code,size
26626,kip-1-sn-364.dartmouth.edu,1995-08-18T12:57:22,GET,/icons/image.xbm,200,509
23425,ix-tam1-09.ix.netcom.com,1995-08-16T18:23:33,GET,/shuttle/technology/images/srb_mod_compare_6-s...,200,28219
27094,akiko.bms.com,1995-08-18T16:21:23,GET,/images/ksclogosmall.gif,200,3635
2830,129.126.75.123,1995-08-04T09:08:27,GET,/icons/menu.xbm,200,527
10571,n1028246.ksc.nasa.gov,1995-08-09T10:37:01,GET,/images/ksclogo-medium.gif,200,5866


In [5]:
df['time'][48]

'1995-08-01T00:49:20'

In [6]:
def is_morning(ts):
    t = pd.to_datetime(ts)
    return t.hour >= 6 and t.hour < 12

In [7]:
len(df[df['time'].apply(is_morning)])

13004

In [8]:
%timeit len(df[df['time'].apply(is_morning)])

32 s ± 2.6 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
df_1 = pd.read_csv('logs_2.csv.xz', parse_dates = ['time'])
len(df_1)
df_1.sample(5)

Unnamed: 0,origin,time,method,path,status_code,size
48681,silver.sms.fi,1995-08-31 13:02:14,GET,/shuttle/countdown/images/countclock.gif,304,0
26269,163.205.12.123,1995-08-18 10:11:00,GET,/images/USA-logosmall.gif,200,234
6276,dal04-14.ppp.iadfw.net,1995-08-07 00:57:58,GET,/images/NASA-logosmall.gif,200,786
33628,198.83.140.55,1995-08-23 07:37:27,GET,/software/winvn/bluemarb.gif,304,0
46003,163.206.89.4,1995-08-30 14:40:28,GET,/images/MOSAIC-logosmall.gif,200,363


In [10]:
len(df_1[(df_1['time'].dt.hour >= 6) & (df_1['time'].dt.hour < 12)])

13004

In [11]:
%timeit len(df_1[(df_1['time'].dt.hour >= 6) & (df_1['time'].dt.hour < 12)])

7.4 ms ± 466 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
32000 / 7.4

4324.324324324324

### F. Challenge - 3 --> Querying a DataFrame

In [29]:
"""Find how many rides in 2016 were in the afternoon of weekend or holiday.

- Afternoon: Between noon to 6pm
- Weekend: Saturday or Sunday
- Holiday: See holidays_2016 below
"""

import pandas as pd

# 2016 public holidays
holidays_2016 = [
    '2016-01-01',  # new year
    '2016-01-18',  # MLK
    '2016-05-30',  # memorial
    '2016-07-04',  # independence
    '2016-09-05',  # labor
    '2016-11-11',  # veterans
    '2016-11-24',  # thanksgiving
    '2016-12-26',  # christmas
]


def load_df(file_name):
    """Load data from CSV to DataFrame"""
    return pd.read_csv(file_name)


def is_2016(s):
    ts = pd.to_datetime(s)

    return ts.year == 2016


def is_weekend(s):
    """Check that s in a weekend day"""
    ts = pd.to_datetime(s)

    return ts.day_name() == 'Saturday' or ts.day_name() == 'Sunday'


def is_holiday(s):
    """Check that s (e.g. '10/26/2014') is a holiday"""
    ts = pd.to_datetime(s)

    day = ts.strftime('%Y-%m-%d')  # holidays_2016 format
    return day in holidays_2016


def is_afternoon(s):
    """Check that s (e.g. '13:12:00' is in the afternoon"""
    ts = pd.to_datetime(s)

    return ts.hour >= 12 and ts.hour < 18


def vacation_rides(df):
    """Return only rows that are in holiday afternoon"""
    result = pd.DataFrame()  # Initialize an empty DataFrame
    
    for _, row in df.iterrows():
        date, time = row['Checkout Date'], row['Checkout Time']
        
        if not is_2016(date):
            continue

        if (is_holiday(date) or is_weekend(date)) and is_afternoon(time):
            row_df = row.to_frame().T  # Convert row (Series) to DataFrame
            result = pd.concat([result, row_df], ignore_index=True)  # Append row
    
    return result

In [30]:
df = load_df('austin-bikes.csv.xz')

In [31]:
len(vacation_rides(df))

421

In [32]:
%time _ = vacation_rides(df)

CPU times: total: 8.97 s
Wall time: 9.19 s


In [33]:
"""Find how many rides in 2016 were in the afternoon of weekend or holiday.

- Afternoon: Between noon to 6pm
- Weekend: Friday or Saturday
- Holiday: See holidays_2016 below
"""

from calendar import SATURDAY, SUNDAY

import pandas as pd

# 2016 public holidays
holidays_2016 = pd.to_datetime([
    '2016-01-01',  # new year
    '2016-01-18',  # MLK
    '2016-05-30',  # memorial
    '2016-07-04',  # independence
    '2016-09-05',  # labor
    '2016-11-11',  # veterans
    '2016-11-24',  # thanksgiving
    '2016-12-26',  # christmas
])


def load_df(file_name):
    """Load data from CSV to DataFrame"""
    return pd.read_csv(
        file_name,
        parse_dates={'time': ['Checkout Date', 'Checkout Time']},
    )


def vacation_rides(df):
    """Return only rows that are in holiday afternoon"""
    mask_2016 = df['time'].dt.year == 2016

    holiday_mask = (
        (df['time'].dt.floor('d').isin(holidays_2016)) |
        (df['time'].dt.weekday.isin([SATURDAY, SUNDAY]))
    )

    afternoon_mask = (df['time'].dt.hour >= 12) & (df['time'].dt.hour < 18)

    return df[mask_2016 & holiday_mask & afternoon_mask]

In [34]:
df = load_df('austin-bikes.csv.xz')

  return pd.read_csv(


In [35]:
len(vacation_rides(df))

421

In [37]:
%timeit vacation_rides(df)

3.91 ms ± 679 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
9000 / 3.91

2301.7902813299233

## 4. Pandas Performance 

### A. Using Built In functions

In [39]:
df = pd.read_csv('austin-bikes.csv.xz', low_memory = False)

In [40]:
max(df['Trip Duration Minutes'])

11553

In [41]:
Out[40] / (24*60)

8.022916666666667

In [42]:
df['Trip Duration Minutes'].max()

11553

In [43]:
%timeit max(df['Trip Duration Minutes'])

1.13 ms ± 62.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [44]:
%timeit df['Trip Duration Minutes'].max()

26.4 μs ± 4.75 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
11300 / 26.4

428.03030303030306

In [46]:
%timeit df['Trip Duration Minutes'].values.max()

9.93 μs ± 1.22 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [47]:
11300 / 9.93

1137.9657603222558

### B. Understanding eval and query 

In [49]:
!pip install numexpr



In [52]:
import sqlite3
conn = sqlite3.connect('logs_4db.db', detect_types = sqlite3.PARSE_DECLTYPES)

In [53]:
df = pd.read_sql('SELECT * FROM logs', conn)

In [54]:
len(df)

10000

In [55]:
df = pd.concat([df] * 1000)

In [56]:
len(df)

10000000

In [59]:
f'{len(df) :,}'

'10,000,000'

In [60]:
%timeit df[(df['method'] == 'GET') & (df['status_code'] >= 400)]

867 ms ± 57.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
%timeit df.query('method == "GET" & status_code >= 400')

498 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
498 / 867

0.5743944636678201

### C. Understanding join function

In [63]:
df = pd.read_csv('taxi.csv.xz')

In [64]:
df.sample(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
10788,1,2018-05-01 03:43:36,2018-05-01 03:51:08,1,3.4,1,N,141,7,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3
120329,2,2018-05-01 14:00:03,2018-05-01 14:15:18,1,1.68,1,N,125,164,1,11.0,0.0,0.5,1.2,0.0,0.3,13.0
406064,1,2018-05-02 11:58:22,2018-05-02 12:09:04,1,1.1,1,N,186,161,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
332614,2,2018-05-02 06:32:46,2018-05-02 06:40:22,1,1.33,1,N,100,161,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8
376129,2,2018-05-02 09:47:41,2018-05-02 09:55:50,1,0.94,1,N,100,163,1,7.0,0.0,0.5,1.56,0.0,0.3,9.36


In [65]:
ndf = pd.DataFrame([
    (1, 'Creative'),
    (2, 'Verifone'),
    (3, 'BigApple')
], columns = ['id', 'Vendor'])

In [66]:
ndf

Unnamed: 0,id,Vendor
0,1,Creative
1,2,Verifone
2,3,BigApple


In [67]:
pd.merge(df, ndf, left_on  = 'VendorID', right_on = 'id', how = 'left')

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,id,Vendor
0,1,2018-05-01 00:13:56,2018-05-01 00:22:46,1,1.60,1,N,230,50,1,8.0,0.5,0.5,1.85,0.00,0.3,11.15,1.0,Creative
1,1,2018-05-01 00:23:26,2018-05-01 00:29:56,1,1.70,1,N,263,239,1,7.5,0.5,0.5,2.00,0.00,0.3,10.80,1.0,Creative
2,1,2018-05-01 00:36:23,2018-05-01 00:48:26,2,2.60,1,N,239,152,1,12.0,0.5,0.5,1.00,0.00,0.3,14.30,1.0,Creative
3,1,2018-05-01 00:26:12,2018-05-01 00:27:05,1,0.00,1,N,145,145,1,2.5,0.5,0.5,9.63,0.00,0.3,13.43,1.0,Creative
4,1,2018-05-01 00:29:51,2018-05-01 00:30:02,1,0.00,1,N,145,145,2,2.5,0.5,0.5,0.00,0.00,0.3,3.80,1.0,Creative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499994,2,2018-05-02 17:27:40,2018-05-02 17:33:37,1,0.65,1,N,233,229,1,5.5,1.0,0.5,1.46,0.00,0.3,8.76,2.0,Verifone
499995,2,2018-05-02 17:36:29,2018-05-02 17:48:23,1,2.00,1,N,229,224,1,9.5,1.0,0.5,2.26,0.00,0.3,13.56,2.0,Verifone
499996,2,2018-05-02 17:56:43,2018-05-02 18:07:26,1,1.62,1,N,137,4,2,9.0,1.0,0.5,0.00,0.00,0.3,10.80,2.0,Verifone
499997,2,2018-05-02 17:33:05,2018-05-02 18:38:26,1,18.70,2,N,132,50,1,52.0,4.5,0.5,12.61,5.76,0.3,75.67,2.0,Verifone


In [68]:
%timeit pd.merge(df, ndf, left_on  = 'VendorID', right_on = 'id', how = 'left')

94.1 ms ± 3.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [69]:
ndfi = ndf.set_index('id')

In [71]:
%timeit pd.merge(df, ndfi, left_on  = 'VendorID', right_index = True, how = 'left')

78.8 ms ± 3.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [72]:
94.1 / 78.8

1.1941624365482233

### Challenge 4 --> Join and Query

In [73]:
"""What is the median trip duration in 2017, only in active kiosks?

- Trip data in austin-bikes.csv.xz
- Kiosk status data in austin-kiosk.csv
"""

import pandas as pd

bike_df = pd.read_csv('austin-bikes.csv.xz', low_memory=False)

# Set index to 'Kiosk ID' for faster merge
kiosk_df = pd.read_csv('austin-kiosk.csv', index_col='Kiosk ID')
df = pd.merge(
    bike_df, kiosk_df, left_on='Checkout Kiosk ID', right_index=True)

# Use query for selecting data
active_2017 = df.query(
    '`Kiosk Status` == "active" & Year == 2017 & `Trip Duration Minutes` > 0')

# Use built-in median
print(active_2017['Trip Duration Minutes'].median())

16.0
