# Setup

In [2]:
import os
import pandas as pd
import csv
import sys
import numpy as np
from PIL import Image, ImageColor
import asyncio
from matplotlib.pyplot import imshow
# Raise the csv module's per-field size limit so very large text fields can be
# parsed. The call returns the previous limit, which is the value this cell displays.
csv.field_size_limit(sys.maxsize)

131072

In [6]:
!pip install memory_profiler

Collecting memory_profiler
  Downloading https://files.pythonhosted.org/packages/8f/fd/d92b3295657f8837e0177e7b48b32d6651436f0293af42b76d134c3bb489/memory_profiler-0.58.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-cp37-none-any.whl size=30180 sha256=f7b117d2e4cee67fa0c6237b963df9326763accf2e7487bf8b1f795a89ffa76f
  Stored in directory: /root/.cache/pip/wheels/02/e4/0b/aaab481fc5dd2a4ea59e78bc7231bb6aae7635ca7ee79f8ae5
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0


In [7]:
import memory_profiler
import time

def time_mem_decorator(func):
    """Decorate ``func`` so every call reports its duration and memory change.

    The wrapper samples ``memory_profiler.memory_usage()`` and ``time.time()``
    immediately before and after calling ``func``, prints the two differences,
    and returns whatever ``func`` returned. If ``func`` raises, the exception
    propagates and nothing is printed.

    Args:
        func: The callable to instrument.

    Returns:
        A wrapper that accepts the same positional and keyword arguments as
        ``func`` and keeps its ``__name__`` and ``__doc__``.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import functools

    @functools.wraps(func)  # without this the wrapper would present itself as "out"
    def out(*args, **kwargs):
        m1 = memory_profiler.memory_usage()
        t1 = time.time()

        result = func(*args, **kwargs)

        t2 = time.time()
        m2 = memory_profiler.memory_usage()
        time_diff = t2 - t1
        # memory_usage() returns a list of samples; compare the first of each.
        mem_diff = m2[0] - m1[0]
        print(f"It took {time_diff} Secs and {mem_diff} Mb to execute this function.")
        return result

    return out

# Vectorisatie

In [None]:
@time_mem_decorator
def get_results_fast(x, y):
    """Apply ``complicated_calculation`` element-wise to ``x`` and ``y``.

    The callable is wrapped with ``np.vectorize`` and then invoked on both
    inputs; the result of that call is returned unchanged.

    NOTE(review): ``complicated_calculation`` is not defined in the visible
    part of this notebook - confirm it is defined before this cell runs.
    """
    elementwise = np.vectorize(complicated_calculation)
    return elementwise(x, y)

In [None]:
# Seeded generator so the benchmark inputs are identical on every
# Restart & Run All; both arrays hold one million standard-normal samples.
rng = np.random.default_rng(42)
x = rng.standard_normal(int(1e6))
y = rng.standard_normal(int(1e6))

In [None]:
# Run the decorated function on the two input arrays; the decorator prints the
# elapsed time and memory difference for this call.
res_fast = get_results_fast(x, y)

It took 7.43865966796875e-05 Secs and 0.0 Mb to execute this function.


# Generators

In [None]:
# Download the books dataset from Google Drive in two steps:
#   1) request the file, store the cookies in cookies.txt, and extract the
#      `confirm=` token from the response into confirm.txt;
#   2) repeat the request with those cookies plus the token and save the
#      response as en-books-dataset.zip.
# Then unpack the archive and delete the temporary files (including the zip).
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=1DhyJdebnB6zwV5Jce1TgTO8PwfNtwn7P' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt
!curl -L -b cookies.txt -o 'en-books-dataset.zip' 'https://docs.google.com/uc?export=download&id=1DhyJdebnB6zwV5Jce1TgTO8PwfNtwn7P&confirm='$(<confirm.txt)
!unzip en-books-dataset.zip
!rm -f confirm.txt cookies.txt en-books-dataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3284    0  3284    0     0  11728      0 --:--:-- --:--:-- --:--:-- 11728
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0   2385      0 --:--:-- --:--:-- --:--:--  2372
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  481M    0  481M    0     0  35.5M      0 --:--:--  0:00:13 --:--:-- 51.2M
Archive:  en-books-dataset.zip
  inflating: en-books-dataset.csv    


In [None]:
def fast_csv_reader(filename, encoding=None):
    """Lazily yield the rows of a CSV file, one parsed row at a time.

    Because this is a generator, only the row currently being parsed is held
    by the reader; the file is closed once the generator is exhausted or
    closed.

    Args:
        filename: Path of the CSV file to read.
        encoding: Optional text encoding passed to ``open``; ``None`` keeps
            the platform default.

    Yields:
        Each row as a list of strings, starting with the first line of the
        file.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted fields are preserved exactly instead of being
    # translated by the text layer.
    with open(filename, 'r', newline='', encoding=encoding) as csv_file:
        yield from csv.reader(csv_file)

In [None]:
# Pull only the first row (the header) from the generator and print it;
# a generator is its own iterator, so next() can be applied to it directly.
print(next(fast_csv_reader('en-books-dataset.csv')))

['title', 'url', 'abstract', 'body_text', 'body_html']


In [None]:
@time_mem_decorator
def fast_row_count(filename):
    """Count the data rows of a CSV file by streaming it through the generator.

    Args:
        filename: Path of the CSV file to count.

    Returns:
        A sentence stating the number of rows, excluding the header row.
    """
    total_rows = sum(1 for _ in fast_csv_reader(filename))
    # The reader also yields the header row, so subtract it; clamp at zero so
    # a completely empty file reports 0 rows instead of -1.
    data_rows = max(total_rows - 1, 0)
    return f"There are {data_rows} rows in the csv file."

In [None]:
# The decorator prints the time and memory figures; the returned summary
# string is shown as the cell's output.
fast_row_count('en-books-dataset.csv')

It took 47.35005974769592 Secs and 39.2421875 Mb to execute this function.


'There are 82258 rows in the csv file.'

# Slimme queries

In [1]:
# Download the UK house sales archive from Google Drive in two steps:
#   1) request the file, store the cookies in cookies.txt, and extract the
#      `confirm=` token from the response into confirm.txt;
#   2) repeat the request with those cookies plus the token and save the
#      response as house_sales_UK.zip.
# Then unpack the archive and delete the temporary files (including the zip).
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=1ZYsnbhj523vgEiwGNNrqhMHQ-YoHhcq6' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt    
!curl -L -b cookies.txt -o 'house_sales_UK.zip' 'https://docs.google.com/uc?export=download&id=1ZYsnbhj523vgEiwGNNrqhMHQ-YoHhcq6&confirm='$(<confirm.txt)
!unzip house_sales_UK.zip
!rm -f confirm.txt cookies.txt house_sales_UK.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3278    0  3278    0     0  15535      0 --:--:-- --:--:-- --:--:-- 15535
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0   2266      0 --:--:-- --:--:-- --:--:--  2266
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  213M    0  213M    0     0  61.8M      0 --:--:--  0:00:03 --:--:-- 85.1M
Archive:  house_sales_UK.zip
  inflating: sales_locations.csv     
  inflating: sales_records.csv       


In [3]:
# Load both tables with their first CSV column as the index.
# NOTE(review): presumably the two files share the same sale identifier in that
# column, since quick_query joins them on the index - confirm.
locations = pd.read_csv('sales_locations.csv',index_col=0)
records = pd.read_csv('sales_records.csv',index_col=0)

In [8]:
@time_mem_decorator
def quick_query(locations, records, city='YORK'):
    """Return the mean of the 'Price' column for sales located in ``city``.

    The locations table is filtered on its 'Town/City' column first, so only
    the matching rows are joined (on the index) with the records table.

    Args:
        locations: DataFrame containing a 'Town/City' column.
        records: DataFrame joined onto the filtered locations; the 'Price'
            column is read from the joined result.
        city: Value to match in 'Town/City'.

    Returns:
        The mean of 'Price' over the joined rows.
    """
    in_city = locations['Town/City'] == city
    city_sales = locations.loc[in_city].join(records)
    return city_sales['Price'].mean()

In [9]:
# The decorator prints the time and memory figures; the mean price returned by
# the query is shown as the cell's output.
quick_query(locations,records, city='YORK')

It took 4.992720127105713 Secs and 0.3046875 Mb to execute this function.


253093.69580430503