In [None]:
-- Crunchbase Dataset
-- Write a query that appends the two crunchbase_investments datasets above (including duplicate values). 
-- Filter the first dataset to only companies with names that start with the letter "T", and filter the second 
-- to companies with names starting with "M" (both not case-sensitive). Only include the company_permalink, company_name, and investor_name columns.
--
-- UNION ALL keeps duplicate rows; ILIKE makes the prefix match case-insensitive.
-- The first branch reads part1 and the second reads part2.

SELECT company_permalink,
       company_name,
       investor_name
FROM tutorial.crunchbase_investments_part1
WHERE company_name ILIKE 'T%'
UNION ALL
SELECT company_permalink,
       company_name,
       investor_name
FROM tutorial.crunchbase_investments_part2
WHERE company_name ILIKE 'M%'

In [None]:
-- Alternatively: emulate a case-insensitive prefix match with two LIKE
-- patterns per branch. The first branch reads part1 ("T"/"t" names) and the
-- second reads part2 ("M"/"m" names); UNION ALL keeps duplicate rows.
SELECT company_permalink,
       company_name,
       investor_name
FROM tutorial.crunchbase_investments_part1
WHERE   company_name LIKE 'T%'
        OR
        company_name LIKE 't%'
UNION ALL
SELECT company_permalink,
       company_name,
       investor_name
FROM tutorial.crunchbase_investments_part2
WHERE   company_name LIKE 'M%'
        OR
        company_name LIKE 'm%'

Write a query that shows 3 columns. The first indicates which dataset (part 1 or 2) the data comes from, the second shows company status, and the third is a count of the number of investors.

Hint: you will have to use the tutorial.crunchbase_companies table as well as the investments tables. And you'll want to group by status and dataset.

In [None]:
-- One row per (dataset, company status) with the number of distinct investors.
-- Each branch LEFT JOINs companies to one investments table, so a status with
-- no matching investments still appears (with a count of 0).
SELECT
  'Set_1' AS dataset_name,
  companies.status,
  COUNT(DISTINCT investments.investor_permalink) AS investors
FROM tutorial.crunchbase_companies AS companies
LEFT JOIN tutorial.crunchbase_investments_part1 AS investments
  ON companies.permalink = investments.company_permalink
GROUP BY 1, 2
UNION ALL
SELECT
  'Set_2' AS dataset_name,
  companies.status,
  COUNT(DISTINCT investments.investor_permalink) AS investors
FROM tutorial.crunchbase_companies AS companies
LEFT JOIN tutorial.crunchbase_investments_part2 AS investments
  ON companies.permalink = investments.company_permalink
GROUP BY 1, 2

Write a query that includes a column that is flagged "yes" when a player is from California, and sort the results with those players first.

In [None]:
-- Flag each player with 'yes' when state is 'CA' and 'no' otherwise, then
-- list the California players first.
SELECT player_name,
       CASE
           WHEN state = 'CA' THEN 'yes'
           ELSE 'no'
       END AS from_cali
FROM benn.college_football_players
-- 'yes' sorts after 'no', so descending order puts the flagged rows on top.
ORDER BY from_cali DESC

Write a query that selects all columns from benn.college_football_players and adds an additional column that displays the player's name if that player is a junior or senior.

In [None]:
-- Every player column plus upper_classmen, which holds player_name for
-- juniors (JR) and seniors (SR). A CASE with no ELSE branch yields NULL for
-- everyone else.
SELECT
  *,
  CASE
    WHEN year = 'JR' OR year = 'SR' THEN player_name
  END AS upper_classmen
FROM benn.college_football_players

In [None]:
-- Alternatively: test both class years with a single IN list.
-- Rows that match neither value get NULL (no ELSE branch).
SELECT
  *,
  CASE WHEN year IN ('JR', 'SR') THEN player_name END AS upperclassmen
FROM benn.college_football_players

Write a query that counts the number of 300lb+ players for each of the following regions: West Coast (CA, OR, WA), Texas, and Other (everywhere else).

In [None]:
-- Count players weighing 300 lb or more, bucketed by region:
-- West Coast (CA, OR, WA), Texas (TX), and Other (every remaining state).
SELECT CASE
           WHEN state IN ('CA',
                          'OR',
                          'WA') THEN 'West Coast'
           WHEN state = 'TX' THEN 'Texas'
           ELSE 'Other'
       END AS heavyweights,
       COUNT(1) AS count
FROM benn.college_football_players
-- ">= 300" states the 300lb+ cutoff directly and stays correct even if weight
-- holds fractional values (where "> 299" would admit e.g. 299.5).
WHERE weight >= 300
GROUP BY 1

Write a query that calculates the combined weight of all underclass players (FR/SO) in California as well as the combined weight of all upperclass players (JR/SR) in California.

In [None]:
-- Total weight of California players, split into underclass (FR/SO) and
-- upperclass (JR/SR). Any other year value falls into a NULL group because
-- the CASE has no ELSE branch.
SELECT
  CASE
    WHEN year IN ('FR', 'SO') THEN 'underclass'
    WHEN year IN ('JR', 'SR') THEN 'upperclass'
  END AS class,
  SUM(weight) AS combined_weight
FROM benn.college_football_players
WHERE state = 'CA'
GROUP BY 1

Write a query that displays the number of players in each state, with FR, SO, JR, and SR players in separate columns and another column for the total number of players. Order results such that states with the most players come first.

In [None]:
-- Pivot player counts by class year within each state.
-- COUNT ignores NULLs, and a CASE without ELSE returns NULL, so each
-- expression counts only the rows for its own year. state_count is the
-- per-state total and drives the sort (largest states first).
SELECT
  state,
  COUNT(CASE WHEN year = 'FR' THEN 1 END) AS fr_count,
  COUNT(CASE WHEN year = 'SO' THEN 1 END) AS so_count,
  COUNT(CASE WHEN year = 'JR' THEN 1 END) AS jr_count,
  COUNT(CASE WHEN year = 'SR' THEN 1 END) AS sr_count,
  COUNT(*) AS state_count
FROM benn.college_football_players
GROUP BY state
ORDER BY state_count DESC;

Write a query that shows the number of players at schools with names that start with A through M, and the number at schools with names starting with N - Z.

In [None]:
-- Count players at schools whose names start with A-M versus N-Z.
-- The comparison uses the lower-cased first character, so the result does not
-- depend on how the database collation orders upper- vs lower-case letters,
-- and the two ranges cannot overlap. Names that do not start with a letter
-- (or are NULL) fall into a NULL group.
SELECT CASE
           WHEN LOWER(LEFT(school_name, 1)) BETWEEN 'a' AND 'm' THEN 'A-M'
           WHEN LOWER(LEFT(school_name, 1)) BETWEEN 'n' AND 'z' THEN 'N-Z'
           ELSE NULL
       END AS school_group,
       COUNT(1) AS players
FROM benn.college_football_players
GROUP BY school_group

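Instead of a WHERE clause, this puts the `>` comparison in the JOIN condition, so only investments made more than 5 years after the company's founding year are joined. A WHERE clause would also work, but it adds a filtering step after the join, and with a LEFT JOIN it would additionally drop companies that have no qualifying investments.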

In [None]:
-- Per company: number of part1 investments made more than 5 years after the
-- founding year. The year filter sits in the ON clause of the LEFT JOIN, so
-- companies with no such investments are kept with a count of 0.
SELECT
  companies.permalink,
  companies.name,
  companies.status,
  COUNT(investments.investor_permalink) AS investors
FROM tutorial.crunchbase_companies AS companies
LEFT JOIN tutorial.crunchbase_investments_part1 AS investments
  ON companies.permalink = investments.company_permalink
  AND investments.funded_year > companies.founded_year + 5
GROUP BY
  companies.permalink,
  companies.name,
  companies.status

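Changing data types can be done with CAST or the PostgreSQL `::` shorthand. (CONVERT is the equivalent in some other databases such as SQL Server and MySQL; in PostgreSQL, `convert` changes text encodings rather than data types.)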

This changes a date and num to varchar

In [None]:
-- Both columns are converted to varchar; one uses the :: shorthand and the
-- other the CAST(... AS ...) form to show that the two are interchangeable.
SELECT
  funding_total_usd::varchar AS funding_str,
  CAST(founded_at_clean AS varchar) AS founded_str
FROM tutorial.crunchbase_companies_clean_date

Write a query that counts the number of companies acquired within 3 years, 5 years, and 10 years of being founded (in 3 separate columns). Include a column for total companies acquired as well. Group by category and limit to only rows with a founding date.


In [None]:
-- Per category: acquisitions that happened no later than 3, 5, and 10 years
-- after the founding date, plus the total number of joined acquisition rows.
-- The three buckets are cumulative (a 3-year acquisition also counts toward
-- 5 and 10). Companies without a founding date are excluded.
SELECT
  companies.category_code,
  COUNT(CASE
          WHEN acquisitions.acquired_at_cleaned
               <= companies.founded_at_clean::timestamp + INTERVAL '3 years'
          THEN 1
        END) AS acquired_3_yrs,
  COUNT(CASE
          WHEN acquisitions.acquired_at_cleaned
               <= companies.founded_at_clean::timestamp + INTERVAL '5 years'
          THEN 1
        END) AS acquired_5_yrs,
  COUNT(CASE
          WHEN acquisitions.acquired_at_cleaned
               <= companies.founded_at_clean::timestamp + INTERVAL '10 years'
          THEN 1
        END) AS acquired_10_yrs,
  COUNT(1) AS total
FROM tutorial.crunchbase_companies_clean_date AS companies
JOIN tutorial.crunchbase_acquisitions_clean_date AS acquisitions
  ON acquisitions.company_permalink = companies.permalink
WHERE founded_at_clean IS NOT NULL
GROUP BY companies.category_code
ORDER BY total DESC

# SQL String Functions


RIGHT and LEFT pull the number of characters from that side of the input

In [None]:
-- LEFT keeps the first 10 characters of date; RIGHT keeps the last 17.
SELECT
  incidnt_num,
  date,
  LEFT(date, 10) AS cleaned_date,
  RIGHT(date, 17) AS cleaned_time
FROM tutorial.sf_crime_incidents_2014_01

LENGTH or LEN can also be used as part of a function if the full value is variable, but the relative location is the same

In [None]:
-- Same split, but the RIGHT width is computed: everything except the first
-- 11 characters, whatever the total length of date is.
SELECT
  incidnt_num,
  date,
  LEFT(date, 10) AS cleaned_date,
  RIGHT(date, LENGTH(date) - 11) AS cleaned_time
FROM tutorial.sf_crime_incidents_2014_01

Trim can remove characters from the beginning and end of a string, by specifying 'leading', 'trailing' or 'both'

In [None]:
-- Strip any '(' or ')' characters from both ends of location.
SELECT
  location,
  TRIM(BOTH '()' FROM location)
FROM tutorial.sf_crime_incidents_2014_01

POSITION and STRPOS
Position gives the location of the quoted values in the string - it IS case-sensitive

STRPOS basically does the same thing with slightly different syntax

In [None]:
-- POSITION form: substring first, then the string to search.
SELECT
  incidnt_num,
  descript,
  POSITION('A' IN descript) AS a_position
FROM tutorial.sf_crime_incidents_2014_01;


-- STRPOS form: string to search first, then the substring.
SELECT
  incidnt_num,
  descript,
  STRPOS(descript, 'A') AS a_position
FROM tutorial.sf_crime_incidents_2014_01

SUBSTR  This does the same thing as LEFT and RIGHT, but you can start anywhere within the string.

Syntax:  SUBSTR(string, start position, # of characters)

In [None]:
-- Take 2 characters of date starting at position 4 and label them as the day.
SELECT
  incidnt_num,
  date,
  SUBSTR(date, 4, 2) AS day
FROM tutorial.sf_crime_incidents_2014_01

CONCAT joins two or more strings (columns or literals) into a single string.

|| can also be used to concatenate

In [None]:
-- Build "<day_of_week>, <first 10 characters of date>" with CONCAT.
SELECT
  incidnt_num,
  day_of_week,
  LEFT(date, 10) AS cleaned_date,
  CONCAT(day_of_week, ', ', LEFT(date, 10)) AS day_and_date
FROM tutorial.sf_crime_incidents_2014_01

In [None]:
-- Alternatively: the same day_and_date string built with the || operator.
SELECT
  incidnt_num,
  day_of_week,
  LEFT(date, 10) AS cleaned_date,
  day_of_week || ', ' || LEFT(date, 10) AS day_and_date
FROM tutorial.sf_crime_incidents_2014_01

UPPER and LOWER do exactly what you think they do


## Dates 
Dealing with dates sucks. This code casts the newly formatted and concatenated date to date format. Note, it uses || for concatenation and :: for casting

In [None]:
-- Rearrange pieces of the date string into '<chars 7-10>-<chars 1-2>-<chars 4-5>'
-- and cast the result to a date.
-- NOTE(review): presumably the source text is MM/DD/YYYY, giving YYYY-MM-DD -
-- confirm against the data.
SELECT
  incidnt_num,
  date,
  (
    SUBSTR(date, 7, 4)
    || '-' || LEFT(date, 2)
    || '-' || SUBSTR(date, 4, 2)
  )::date AS cleaned_date
FROM tutorial.sf_crime_incidents_2014_01

EXTRACT can pull different parts of dates

In [None]:
-- Pull individual fields out of cleaned_date, one EXTRACT per column.
SELECT
  cleaned_date,
  EXTRACT('year'   FROM cleaned_date) AS year,
  EXTRACT('month'  FROM cleaned_date) AS month,
  EXTRACT('day'    FROM cleaned_date) AS day,
  EXTRACT('hour'   FROM cleaned_date) AS hour,
  EXTRACT('minute' FROM cleaned_date) AS minute,
  EXTRACT('second' FROM cleaned_date) AS second,
  EXTRACT('decade' FROM cleaned_date) AS decade,
  EXTRACT('dow'    FROM cleaned_date) AS day_of_week
FROM tutorial.sf_crime_incidents_cleandate

DATE_TRUNC truncates a date/timestamp down to the start of the given unit (it always rounds down, never to the nearest) - good for bucketing values by week, day, and so on.

In [None]:
-- Truncate cleaned_date to each listed unit, one column per unit.
SELECT
  cleaned_date,
  DATE_TRUNC('year',   cleaned_date) AS year,
  DATE_TRUNC('month',  cleaned_date) AS month,
  DATE_TRUNC('week',   cleaned_date) AS week,
  DATE_TRUNC('day',    cleaned_date) AS day,
  DATE_TRUNC('hour',   cleaned_date) AS hour,
  DATE_TRUNC('minute', cleaned_date) AS minute,
  DATE_TRUNC('second', cleaned_date) AS second,
  DATE_TRUNC('decade', cleaned_date) AS decade
FROM tutorial.sf_crime_incidents_cleandate

Including today's date or time...

In [None]:
-- Show each built-in current date/time value side by side.
SELECT
  CURRENT_DATE      AS date,
  CURRENT_TIME      AS time,
  CURRENT_TIMESTAMP AS timestamp,
  LOCALTIME         AS localtime,
  LOCALTIMESTAMP    AS localtimestamp,
  NOW()             AS now

In [None]:
-- Current time as returned by the session, next to the same value shifted
-- to the 'PST' time zone.
SELECT
  CURRENT_TIME AS time,
  CURRENT_TIME AT TIME ZONE 'PST' AS time_pst

COALESCE

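COALESCE returns its first non-NULL argument, so it is handy for replacing NULLs with a default value. It only handles NULLs - other oddities, such as 0 written out as "zero", need a CASE expression or a string replacement instead.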

In [None]:
-- Show descript next to a version where NULL is replaced by 'No Description'.
-- In PostgreSQL, NULLs sort first under DESC, so any rows with a missing
-- description appear at the top.
SELECT
  incidnt_num,
  descript,
  COALESCE(descript, 'No Description')
FROM tutorial.sf_crime_incidents_2014_01
ORDER BY descript DESC

## SUBQUERIES

Write a query that selects all Warrant Arrests from the tutorial.sf_crime_incidents_2014_01 dataset, then wrap it in an outer query that only displays unresolved incidents.

In [None]:
-- Inner query: every incident described as a warrant arrest.
-- Outer query: keep only those whose resolution is 'NONE' (unresolved).
SELECT warrants.*
FROM (
  SELECT *
  FROM tutorial.sf_crime_incidents_2014_01
  WHERE descript = 'WARRANT ARREST'
) AS warrants
WHERE warrants.resolution = 'NONE'


Write a query that displays the average number of monthly incidents for each category. Hint: use tutorial.sf_crime_incidents_cleandate to make your life a little easier.

In [None]:
-- Average number of incidents per month for each category.
-- The inner query counts incidents per (calendar month, category); the outer
-- query averages those monthly counts and casts the average to an integer.
SELECT sub.category,
       CAST(AVG(sub.incidents) AS int) AS avg_monthly_incidents
FROM
    ( -- DATE_TRUNC keeps the year in the bucket, so e.g. January 2013 and
      -- January 2014 stay separate months. EXTRACT('month') would merge them.
     SELECT DATE_TRUNC('month', cleaned_date) AS month,
             category,
             COUNT(1) AS incidents
     FROM tutorial.sf_crime_incidents_cleandate
     GROUP BY 1,
              2) sub
GROUP BY sub.category

Write a query that displays all rows from the three categories with the fewest incidents reported.

In [None]:
-- The subquery finds the three categories with the fewest incidents; joining
-- back to the incidents table returns every row from those categories along
-- with the category's incident count.
SELECT
  incidents.*,
  sub.count AS num_incidents
FROM tutorial.sf_crime_incidents_2014_01 AS incidents
JOIN (
  SELECT
    category,
    COUNT(*) AS count
  FROM tutorial.sf_crime_incidents_2014_01
  GROUP BY category
  ORDER BY 2
  LIMIT 3
) AS sub
  ON sub.category = incidents.category

Write a query that counts the number of companies founded and acquired by quarter starting in Q1 2012. Create the aggregations in two separate queries, then join them.

In [None]:

-- Companies founded and companies acquired per quarter, from 2012 onward.
-- Each side is aggregated in its own subquery and the two are joined on quarter.
-- FULL JOIN keeps quarters that appear on only one side (a LEFT JOIN would
-- silently drop quarters that have acquisitions but no foundings); COALESCE
-- then takes the quarter label from whichever side has it.
SELECT COALESCE(companies.quarter, acquisitions.quarter) AS quarter,
       companies.companies_founded,
       acquisitions.companies_acquired
FROM
    ( SELECT founded_quarter AS quarter,
             COUNT(permalink) AS companies_founded
     FROM tutorial.crunchbase_companies
     WHERE founded_year >= 2012
     GROUP BY 1 ) companies
FULL JOIN
    ( SELECT acquired_quarter AS quarter,
             -- DISTINCT: count each acquired company once per quarter
             COUNT(DISTINCT company_permalink) AS companies_acquired
     FROM tutorial.crunchbase_acquisitions
     WHERE acquired_year >= 2012
     GROUP BY 1 ) acquisitions ON companies.quarter = acquisitions.quarter
ORDER BY 1

Write a query that ranks investors from the combined dataset above by the total number of investments they have made.

In [None]:
-- Stack both investment tables (UNION ALL keeps every row), then count rows
-- per investor and list the most active investors first. Only investor_name
-- is needed from each table for this count.
SELECT
  sub.investor_name,
  COUNT(*) AS investments
FROM (
  SELECT investor_name
  FROM tutorial.crunchbase_investments_part1
  UNION ALL
  SELECT investor_name
  FROM tutorial.crunchbase_investments_part2
) AS sub
GROUP BY 1
ORDER BY 2 DESC

Write a query that does the same thing as in the previous problem, except only for companies that are still operating. Hint: operating status is in tutorial.crunchbase_companies.

In [None]:
-- Same ranking, restricted to investments in companies whose status is
-- 'operating'. The combined investments are inner-joined to companies, so
-- investments without a matching company row are not counted.
SELECT
  investments.investor_name,
  COUNT(investments.*) AS investments
FROM tutorial.crunchbase_companies AS companies
JOIN (
  SELECT *
  FROM tutorial.crunchbase_investments_part1
  UNION ALL
  SELECT *
  FROM tutorial.crunchbase_investments_part2
) AS investments
  ON investments.company_permalink = companies.permalink
WHERE companies.status = 'operating'
GROUP BY investments.investor_name
ORDER BY 2 DESC

# WINDOW FUNCTIONS
## OVER

OVER (ORDER BY )
OVER (PARTITION BY )

The order and the partition define what is the window. 

Write a query modification of the above example query that shows the duration of each ride as a percentage of the total time accrued by riders from each start_terminal

In [None]:
-- Each ride's duration, the total duration for its start terminal, and the
-- ride's share of that total as a percentage. Sorted by terminal, then by
-- largest share first.
SELECT start_terminal,
       duration_seconds,
       SUM(duration_seconds) OVER (PARTITION BY start_terminal) AS start_terminal_sum,
       -- Multiply by 100.0 before dividing so the ratio is not truncated to 0
       -- if duration_seconds is an integer column; NULLIF turns a zero total
       -- into NULL instead of raising a division-by-zero error.
       duration_seconds * 100.0
         / NULLIF(SUM(duration_seconds) OVER (PARTITION BY start_terminal), 0) AS pct_of_total_time
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
ORDER BY 1,
         4 DESC

When using window functions, you can apply the same aggregates that you would under normal circumstances—SUM, COUNT, and AVG. The easiest way to understand these is to re-run the previous example with some additional functions

In [None]:
-- Running SUM, COUNT, and AVG of ride duration within each start terminal,
-- accumulated in start_time order.
SELECT
  start_terminal,
  duration_seconds,
  SUM(duration_seconds)   OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_total,
  COUNT(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_count,
  AVG(duration_seconds)   OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_avg
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'

Write a query that shows a running total of the duration of bike rides (similar to the last example), but grouped by end_terminal, and with ride duration sorted in descending order.


In [None]:
-- Running total of ride duration per end terminal, accumulating from the
-- longest ride to the shortest.
SELECT
  end_terminal,
  duration_seconds,
  SUM(duration_seconds) OVER (
    PARTITION BY end_terminal
    ORDER BY duration_seconds DESC
  ) AS running_total
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'

## ROW_NUMBER()

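ROW_NUMBER() numbers rows starting at 1, following the window's ORDER BY. With PARTITION BY the numbering restarts at 1 in each partition; without it, the whole result set is numbered as one sequence.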

In [None]:
-- Number every ride across the whole result set in start_time order
-- (no PARTITION BY, so there is a single sequence).
SELECT
  start_terminal,
  start_time,
  duration_seconds,
  ROW_NUMBER() OVER (ORDER BY start_time) AS row_number
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'

In [None]:
-- Number rides in start_time order, restarting at 1 for each start terminal.
SELECT
  start_terminal,
  start_time,
  duration_seconds,
  ROW_NUMBER() OVER (
    PARTITION BY start_terminal
    ORDER BY start_time
  ) AS row_number
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'

## RANK() and DENSE_RANK()

Rank:   1, 2, 2, 4, 5, 5, 5, 8

Dense_Rank:  1, 2, 2, 3, 4, 4, 4, 5

Write a query that shows the 5 longest rides from each starting terminal, ordered by terminal, and longest to shortest rides within each terminal. Limit to rides that occurred before Jan. 8, 2012.


In [None]:
-- The 5 longest rides from each start terminal (rides before 2012-01-08).
-- The subquery ranks rides within each terminal by duration, longest first;
-- the outer query keeps ranks 1-5. Because RANK gives tied durations the same
-- rank, a terminal can return more than 5 rows when there are ties.
SELECT *
FROM
    (SELECT start_terminal,
            duration_seconds,
            RANK() OVER (PARTITION BY start_terminal
                         ORDER BY duration_seconds DESC) AS rank
     FROM tutorial.dc_bikeshare_q1_2012
     WHERE start_time < '2012-01-08' ) sub
WHERE sub.rank <=5
-- Row order is not guaranteed without an explicit ORDER BY: sort by terminal,
-- then longest to shortest within each terminal.
ORDER BY sub.start_terminal,
         sub.duration_seconds DESC

Write a query that shows only the duration of the trip and the percentile into which that duration falls (across the entire dataset—not partitioned by terminal).

In [None]:
-- Trip duration and the percentile (1-100) it falls into across all rides
-- before 2012-01-08, longest trips first.
SELECT duration_seconds,
       NTILE(100) OVER (
                        ORDER BY duration_seconds) AS percentile
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
-- Sort on the selected numeric column; "duration" is not part of this result.
ORDER BY duration_seconds DESC

## LAG and LEAD 

LAG pulls a value from a preceding row and LEAD pulls one from a following row; the second argument says how many rows back (or ahead) to look.

In [None]:
-- For each ride, show the previous (lag) and next (lead) duration within the
-- same start terminal when rides are ordered by duration.
SELECT
  start_terminal,
  duration_seconds,
  LAG(duration_seconds, 1) OVER (
    PARTITION BY start_terminal
    ORDER BY duration_seconds
  ) AS lag,
  LEAD(duration_seconds, 1) OVER (
    PARTITION BY start_terminal
    ORDER BY duration_seconds
  ) AS lead
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
ORDER BY
  start_terminal,
  duration_seconds

Good for finding differences between rows

In [None]:
-- Difference between each ride's duration and the next-shorter ride from the
-- same start terminal. The first row of each terminal has no previous row,
-- so its difference is NULL.
SELECT
  start_terminal,
  duration_seconds,
  duration_seconds - LAG(duration_seconds, 1) OVER (
    PARTITION BY start_terminal
    ORDER BY duration_seconds
  ) AS difference
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
ORDER BY
  start_terminal,
  duration_seconds

## Creating a Window Alias

If several window functions in the same query use the same window, define the window once with an alias and reuse it - as in the NTILE example below.

In [None]:
-- Quartile, quintile, and percentile of each ride's duration within its
-- start terminal. The same window definition is written out three times.
SELECT
  start_terminal,
  duration_seconds,
  NTILE(4)   OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS quartile,
  NTILE(5)   OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS quintile,
  NTILE(100) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS percentile
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
ORDER BY
  start_terminal,
  duration_seconds

The WINDOW clause must always come after the WHERE clause (and before ORDER BY).


In [None]:
-- Rewritten with the WINDOW alias: the shared window is declared once as
-- ntile_window and referenced by each NTILE call.

SELECT
  start_terminal,
  duration_seconds,
  NTILE(4)   OVER ntile_window AS quartile,
  NTILE(5)   OVER ntile_window AS quintile,
  NTILE(100) OVER ntile_window AS percentile
FROM tutorial.dc_bikeshare_q1_2012
WHERE start_time < '2012-01-08'
WINDOW ntile_window AS (
  PARTITION BY start_terminal
  ORDER BY duration_seconds
)
ORDER BY
  start_terminal,
  duration_seconds