In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from pathlib import Path
from sqlalchemy import create_engine
from ds100_utils import fetch_and_cache

# Plot styling: seaborn defaults with the larger 'talk' context.
sns.set()
sns.set_context('talk')

# Keep printed arrays and displayed frames compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name. The abbreviated 'precision' pattern
# matches more than one option on newer pandas releases and raises OptionError,
# while 'display.precision' is accepted by old and new versions alike.
pd.set_option('display.precision', 2)

## SQL Joins

In [None]:
import sqlite3
# Notebook-level connection to the local file test.db. The cells below and
# print_sql read `conn` at call time, so rebinding it later changes which
# database they query.
conn = sqlite3.connect('test.db')

In [None]:
# Rebuild the two toy tables from scratch: both are dropped (if present)
# before being recreated and populated, so this cell can be re-run.
# The tables only partially match on `name`: 'steph' is a user with no order
# and 'caleb' has an order but no row in users.
conn.executescript("""
DROP TABLE IF EXISTS users;
DROP TABLE IF EXISTS orders;

CREATE TABLE users(
    id INTEGER PRIMARY KEY, 
    name TEXT
);

INSERT INTO users VALUES 
    (1, 'sam'),
    (2, 'manana'),
    (3, 'leo'),
    (4, 'steph');

CREATE TABLE orders(
    item TEXT PRIMARY KEY,
    price NUMERIC,
    name TEXT
);

INSERT INTO orders VALUES 
    ('water', 2, 'caleb'),
    ('tea', 10.5, 'sam'),
    ('latte', 4, 'manana'),
    ('boba', 4.5, 'leo');
""");

In [None]:
def print_sql(s, connection=None):
    """Echo a SQL statement like a sqlite shell session, then print its rows.

    Parameters
    ----------
    s : str
        SQL text to run. After stripping surrounding whitespace, the first
        line is echoed after ``sql> `` and each following line after ``...> ``.
    connection : optional
        Object whose ``execute(s)`` returns an iterable of result rows.
        When omitted, the notebook-level ``conn`` is looked up at call time,
        so rebinding ``conn`` in a later cell changes the database queried.

    Returns
    -------
    None
        Everything is written to standard output, one result row per line.
    """
    if connection is None:
        # Fall back to the shared notebook connection (original behaviour).
        connection = conn
    first, *rest = s.strip().split('\n')
    print(f'sql> {first}')
    for line in rest:
        print(f'...> {line}')
    # The unstripped text is what gets executed, exactly as before.
    for result in connection.execute(s):
        print(result)

# Dump both toy tables, users first, so their rows can be compared by eye.
for table in ('users', 'orders'):
    print_sql(f'SELECT * FROM {table};')

## IMDB Data

In [None]:
# From https://www.imdb.com/interfaces/
# NOTE(review): fetch_and_cache comes from ds100_utils and is not shown here.
# The gunzip commands below read data/*.tsv.gz, so it presumably saves each
# download under data/ with the given file name — confirm.
fetch_and_cache('https://datasets.imdbws.com/title.basics.tsv.gz', 'titles.tsv.gz')
fetch_and_cache('https://datasets.imdbws.com/name.basics.tsv.gz', 'names.tsv.gz')
# gunzip flags: -k keeps the .gz archive, -f overwrites an existing output file.
!gunzip -kf data/titles.tsv.gz
!gunzip -kf data/names.tsv.gz
# List the data directory with human-readable sizes.
!ls -lh data

In [None]:
# How to crash your kernel:
# pd.read_csv('data/names.tsv', sep='\t')

In [None]:
# Instead, use sqlite3 (must be run from the terminal)
# The string below is never executed by this notebook: it only records the
# sqlite3 shell commands that import the two TSV files into imdb.db as the
# tables `titles` and `names`. The trailing `;` suppresses the cell output.
"""
$ sqlite3 imdb.db
sqlite> .mode tabs
sqlite> .import data/titles.tsv titles
sqlite> .import data/names.tsv names
""";

In [None]:
# Rebind the notebook-level `conn` to imdb.db. print_sql reads `conn` when it
# is called, so from here on it queries imdb.db instead of test.db.
# NOTE(review): the earlier test.db connection is not closed before rebinding.
conn = sqlite3.connect('imdb.db')
print_sql('SELECT * FROM titles LIMIT 10;')

In [None]:
# The query selects a single column (`sql`) from sqlite_master, so every row
# is a 1-tuple that can be unpacked directly.
for (schema_sql,) in conn.execute('SELECT sql FROM sqlite_master;'):
    print(schema_sql)

In [None]:
# Short aliases for column names, meant to be interpolated into the f-string
# queries below — presumably columns of the `titles` table; confirm against
# the schema printed above.
key = 'tconst'
title = 'primaryTitle'
time = 'runtimeMinutes'
year = 'startYear'
adult = 'isAdult'
kind = 'titleType'

# NOTE(review): the query bodies here are placeholders (`...`); as written
# these strings contain no SQL and must be filled in before being executed.
select_action = f'''
...
'''

create_action_table = f'''
...
'''

...

We can check to see whether this table is small enough to read into pandas:

## Sampling in SQL

In [None]:
# NOTE(review): all three query bodies are placeholders (`...`). As written,
# pd.read_sql below is handed the literal text '...' rather than a real query.
three_years = '''
...
'''

cluster_sample = f'''
...
'''

pd.read_sql(f'''
...
''', conn)

## Ages of Fame

In [None]:
# Peek at the first 10 rows of the `names` table as a DataFrame.
pd.read_sql('SELECT * FROM names LIMIT 10;', conn)

What is this SQL query doing?

In [None]:
# Column names read from the `names` table, interpolated into the query below.
name = 'primaryName'
known = 'knownForTitles'
profession = 'primaryProfession'
born = 'birthYear'

# One row per person whose profession text contains 'act' and whose `born`
# value is greater than 1800, with these output columns:
#   name       - the person's primary name
#   born       - birth year cast to an integer
#   movie_id   - first id of the comma-separated known-for list. SUBSTR is
#                1-indexed and INSTR returns 0 when there is no comma, so the
#                no-comma case is handled explicitly: a list holding a single
#                title is kept whole instead of collapsing to an empty string.
#   profession - 'actor' is tested before 'actress'; NULL when neither matches.
select_actors = f'''
SELECT {name} AS name, 
  CAST({born} AS int) AS born,
  CASE WHEN INSTR({known}, ',') > 0
    THEN SUBSTR({known}, 1, INSTR({known}, ',') - 1)
    ELSE {known}
    END AS movie_id,
  CASE WHEN {profession} LIKE '%actor%' THEN 'actor' 
    WHEN {profession} LIKE '%actress%' THEN 'actress'   
    END AS profession
FROM names
WHERE {profession} LIKE '%act%' AND
  born > 1800
'''

# Preview only the first 10 matching rows.
pd.read_sql(f'{select_actors} LIMIT 10', conn)

In [None]:
# NOTE(review): the SELECT that defines action_ppl is a placeholder (`...`)
# and must be filled in before this cell can run.
select = f'''
...
'''

# Materialize the query as the table action_ppl, dropping any previous copy
# first so the cell can be re-run.
conn.executescript(f'''
DROP TABLE IF EXISTS action_ppl;
CREATE TABLE action_ppl AS {select};
''')

# Preview the first 10 rows of the new table.
pd.read_sql('SELECT * FROM action_ppl LIMIT 10', conn)

In [None]:
# NOTE(review): the expression that computes `age` is elided (`...`) in the SQL.
# .sample(10) draws 10 random rows with no random_state, so the rows shown
# differ from run to run.
pd.read_sql('''
SELECT name, born, year,
  ...
    AS age
FROM action_ppl
''', conn).sample(10)

In [None]:
# Read the whole action_ppl table into a DataFrame; the bare `df.shape`
# displays its (rows, columns) as the cell output.
df = pd.read_sql('SELECT * FROM action_ppl', conn)
df.shape

In [None]:
# Adds an `age` column (year minus born) to the frame loaded above.
# NOTE(review): assumes `year` and `born` are numeric columns — confirm.
df['age'] = df['year'] - df['born']

# 21 evenly spaced edges from 0 to 100, shared by both histograms and reused
# as the x tick positions.
bins = np.linspace(0, 100, 21)

# One overlaid distribution per profession, drawn in the same order as the
# legend labels below.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept
# here so the figure is unchanged.
for role in ('actor', 'actress'):
    sns.distplot(df.loc[df['profession'] == role, 'age'], bins=bins)

plt.legend(labels=['Actor', 'Actress'])
plt.xticks(bins, rotation=90)
plt.xlim(0, 100)
plt.ylabel('density');
plt.title('Ages of actors in the Action movies they are known for');