## Create a New Database [(tutorial)](https://www.sqlitetutorial.net/sqlite-python/creating-database/)
There are CSV files in the current directory, transform them into SQLite3 databases

In [19]:
import sqlite3
import pandas as pd

con = sqlite3.connect("tutorial.db")
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS films")
names = ['id', 'title', 'release_year', 'country', 'duration', 'language', 'certification', 'gross', 'budget']
films = pd.read_csv("films.csv", names=names)
films.to_sql('films', con, if_exists='append', index = False, index_label='id', chunksize = 10000)

cur.execute("DROP TABLE IF EXISTS people")
names = ['id', 'name', 'birthdate', 'deathdate']
people = pd.read_csv("people.csv", names=names)
people.to_sql('people', con, if_exists='append', index = False, index_label='id', chunksize = 10000)


cur.execute("DROP TABLE IF EXISTS reviews")
names = ['film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes', 'facebook_likes']
reviews = pd.read_csv("reviews.csv", names=names)
reviews.to_sql('reviews', con, if_exists='append', index = False, index_label='id', chunksize = 10000)

con.close()


In [61]:
# connect to database
con = sqlite3.connect("tutorial.db")
cur = con.cursor()

# Queries to table "people"

### COUNT(*) tells you how many records are in a table.   However, if you want to count the number of non-missing values in a particular field, you can call COUNT() on just that field.

### Looking at the differences between the count of separate fields values and the count of all records can provide useful insights into your data.

In [10]:
# Count the number of records in the people table
pd.read_sql("""SELECT COUNT(id) AS count_records 
               FROM people;""",
            con)


Unnamed: 0,count_records
0,8397


In [11]:
# Count the number of birthdates in the people table
pd.read_sql("""SELECT COUNT(birthdate) AS count_birthdate
               FROM people;""",
            con)


Unnamed: 0,count_birthdate
0,6152


In [12]:
# Calculate the percentage of people who are no longer alive

pd.read_sql("""SELECT ROUND(COUNT(deathdate) * 100.0 / COUNT(*), 2) AS percentage_dead
               FROM people;""",
            con)

Unnamed: 0,percentage_dead
0,9.37


### The LIKE and NOT LIKE operators can be used to find records that either match or do not match a specified pattern, respectively. 

They can be coupled with the wildcards % and _. 

The % will match zero or many characters, and _ will match a single character.

This is useful when you want to filter text, but not to an exact word.

In [13]:
# Select the names that start with B

pd.read_sql("""SELECT name
               FROM people
               WHERE name LIKE 'B%';""",
            con)

Unnamed: 0,name
0,B.J. Novak
1,Babak Najafi
2,Babar Ahmed
3,Bahare Seddiqi
4,Bai Ling
...,...
440,Buster Keaton
441,Busy Philipps
442,Buzz Aldrin
443,Byron Howard


In [14]:
# Select the names that have r as the second letter

pd.read_sql("""SELECT name
               FROM people
               WHERE name LIKE '_r%'""",
            con)

Unnamed: 0,name
0,Ara Celi
1,Aramis Knight
2,Arben Bajraktaraj
3,Arcelia RamÃ­rez
4,Archie Kao
...,...
526,Troy Garity
527,Troy Miller
528,Troy Nixey
529,Ursula Andress


In [15]:
# Select names that don't start with A

pd.read_sql("""SELECT name
               FROM people
               WHERE name NOT LIKE 'A%'""",
            con)

# Our results still had names that started with Á with an accent, 
# showing that we need to be specific with our filtering criteria.

Unnamed: 0,name
0,50 Cent
1,Ãlex Angulo
2,Ãlex de la Iglesia
3,Ãngela Molina
4,B.J. Novak
...,...
7763,Zohra Segal
7764,Zooey Deschanel
7765,Zoran Lisinac
7766,Zubaida Sahar


In [16]:
# Select name from people and sort alphabetically

pd.read_sql("""SELECT name
               FROM people
               ORDER BY name ASC;""",
            con)

Unnamed: 0,name
0,50 Cent
1,A. Michael Baldwin
2,A. Raven Cruz
3,A.J. Buckley
4,A.J. DeLucia
...,...
8392,Ã“scar Jaenada
8393,Ã‰mile Gaudreault
8394,Ã‰milie Dequenne
8395,Ã‰ric Tessier


# Queries to table "films"

### COUNT(*) tells you how many records are in a table. However, if you want to count the number of non-missing values in a particular field, you can call COUNT() on just that field.

In [21]:
# Count the languages and countries represented in the films table

"""Looking at the differences between the count of separate fields values 
and the count of all records can provide useful insights into your data."""

pd.read_sql("""SELECT COUNT(*) AS count_all_records, 
               COUNT(language) AS count_languages, 
               COUNT(country) AS count_countries
               FROM films;""",
            con)

Unnamed: 0,count_all_records,count_languages,count_countries
0,4968,4957,4966


### Often query results will include many duplicate values. You can use the DISTINCT keyword to select the unique values from a field.

In [22]:
# Return the unique countries from the films table

pd.read_sql("""SELECT DISTINCT country 
               FROM films;""",
            con)

Unnamed: 0,country
0,USA
1,Germany
2,Japan
3,Denmark
4,UK
...,...
60,Kenya
61,Slovenia
62,Pakistan
63,Chile


In [23]:
# Count the distinct countries from the films table

pd.read_sql("""SELECT COUNT(DISTINCT country) AS count_distinct_countries
               FROM films;""",
            con)

Unnamed: 0,count_distinct_countries
0,64


### Filtering with WHERE allows you to analyze your data better.  WHERE can also filter string values.

In [24]:
# Count the Spanish-language films

pd.read_sql("""SELECT COUNT(*) AS count_spanish
               FROM films
               WHERE language = 'Spanish';""",
            con)

Unnamed: 0,count_spanish
0,40


In [25]:
# Select the title and release_year for all German-language films released before 2000

pd.read_sql("""SELECT title, release_year
               FROM films
               WHERE language = 'German' 
                   AND release_year < 2000;""",
            con)


Unnamed: 0,title,release_year
0,Metropolis,1927.0
1,Pandora's Box,1929.0
2,The Torture Chamber of Dr. Sadism,1967.0
3,Das Boot,1981.0
4,Run Lola Run,1998.0
5,Aimee & Jaguar,1999.0


In [26]:
# Select all records for German-language films released after 2000 and before 2010

pd.read_sql("""SELECT *
               FROM films
               WHERE release_year > 2000
                   AND release_year < 2010
                   AND language = 'German';""",
            con)

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1952,Good Bye Lenin!,2003.0,Germany,121.0,German,R,4063859.0,4800000.0
1,2130,Downfall,2004.0,Germany,178.0,German,R,5501940.0,13500000.0
2,2224,Summer Storm,2004.0,Germany,98.0,German,R,95016.0,2700000.0
3,2709,The Lives of Others,2006.0,Germany,137.0,German,R,11284657.0,2000000.0
4,3100,The Baader Meinhof Complex,2008.0,Germany,184.0,German,R,476270.0,20000000.0
5,3143,The Wave,2008.0,Germany,107.0,German,,,5000000.0
6,3220,Cargo,2009.0,Switzerland,112.0,German,,,4500000.0
7,3346,Soul Kitchen,2009.0,Germany,99.0,German,,274385.0,4000000.0
8,3412,The White Ribbon,2009.0,Germany,144.0,German,R,2222647.0,12000000.0


In [27]:
# write a query to get the title and release_year of films released in 1990 or 1999, 
# which were in English or Spanish and took in more than $2,000,000 gross

pd.read_sql("""SELECT title, release_year
               FROM films
               WHERE (release_year = 1990 OR release_year = 1999)
                   AND (language = 'English' OR language = 'Spanish')
                   AND gross > 2000000;""",
            con)

Unnamed: 0,title,release_year
0,Arachnophobia,1990.0
1,Back to the Future Part III,1990.0
2,Child's Play 2,1990.0
3,Dances with Wolves,1990.0
4,Days of Thunder,1990.0
...,...,...
163,Trippin',1999.0
164,Universal Soldier: The Return,1999.0
165,Varsity Blues,1999.0
166,Wild Wild West,1999.0


### Using WHERE with a combination of AND, OR, and BETWEEN is an efficient way to query a desired range of values.

In [28]:
# get the title and release_year of all Spanish-language films 
# released between 1990 and 2000 (inclusive) with budgets over $100 million.


pd.read_sql("""SELECT title, release_year
               FROM films
               WHERE release_year BETWEEN 1990 AND 2000
                   AND budget > 100000000
                   AND (language = 'Spanish' OR language = 'French');""",
            con)

Unnamed: 0,title,release_year
0,Les couloirs du temps: Les visiteurs II,1998.0
1,Tango,1998.0


### You can query multiple conditions using the IN operator and a set of parentheses

In [29]:
# Find the title and release_year for all films over two hours in length released in 1990 and 2000


pd.read_sql("""SELECT title, release_year
               FROM films
               WHERE release_year IN (1990, 2000)
               AND duration > 120;""",
            con)

Unnamed: 0,title,release_year
0,Dances with Wolves,1990.0
1,Die Hard 2,1990.0
2,Ghost,1990.0
3,Goodfellas,1990.0
4,Mo' Better Blues,1990.0
5,Pretty Woman,1990.0
6,The Godfather: Part III,1990.0
7,The Hunt for Red October,1990.0
8,All the Pretty Horses,2000.0
9,Almost Famous,2000.0


In [30]:
# Find the title and language of all films in English, Spanish, and French
pd.read_sql("""SELECT title, language
               FROM films
               WHERE language IN ('English', 'Spanish', 'French');""",
            con)

Unnamed: 0,title,language
0,The Broadway Melody,English
1,Hell's Angels,English
2,A Farewell to Arms,English
3,42nd Street,English
4,She Done Him Wrong,English
...,...,...
4742,Twisted,English
4743,Unforgotten,English
4744,Wings,English
4745,Wolf Creek,English


In [31]:
# Find the title, certification, and language all films certified NC-17 or R that are in English, Italian, or Greek

pd.read_sql("""SELECT title, certification, language
               FROM films
               WHERE certification IN ('NC-17', 'R')
               AND language IN ('English', 'Italian', 'Greek');""",
            con)

Unnamed: 0,title,certification,language
0,Psycho,R,English
1,A Fistful of Dollars,R,Italian
2,Rosemary's Baby,R,English
3,The Wild Bunch,R,English
4,Catch-22,R,English
...,...,...,...
2001,The Neon Demon,R,English
2002,The Perfect Match,R,English
2003,The Purge: Election Year,R,English
2004,The Veil,R,English


### Find out how many 90's films we have in our dataset that would be suitable for English-speaking teens

In [32]:
pd.read_sql("""SELECT COUNT(DISTINCT title) AS nineties_english_films_for_teens
               FROM films
               WHERE release_year BETWEEN 1990 AND 1999
               AND language = 'English'
               AND certification IN ('G', 'PG', 'PG-13');""",
            con)

Unnamed: 0,nineties_english_films_for_teens
0,310


### Extract summary information from a table using aggregate functions

In [33]:
# Query the sum of film durations
pd.read_sql("""SELECT SUM(duration) AS total_duration
               FROM films;""",
            con)

Unnamed: 0,total_duration
0,534882.0


In [34]:
#  Calculate the average duration of all films
pd.read_sql("""SELECT AVG(duration) AS average_duration
               FROM films;""",
            con)

Unnamed: 0,average_duration
0,107.947931


In [35]:
# Find the latest release_year
pd.read_sql("""SELECT MAX(release_year) AS latest_year
               FROM films;""",
            con)

Unnamed: 0,latest_year
0,2016.0


In [36]:
# Find the duration of the shortest film
pd.read_sql("""SELECT MIN(duration) AS shortest_film
               FROM films;""",
            con)

Unnamed: 0,shortest_film
0,7.0


### When combining aggregate functions with WHERE, you get a powerful tool that allows you to get more granular with your insights.
### This combination is useful when you only want to summarize a subset of your data.

In [37]:
# Calculate the sum of gross from the year 2000 or later
pd.read_sql("""SELECT SUM(gross) AS total_gross
               FROM films
               WHERE release_year >= 2000;""",
            con)

Unnamed: 0,total_gross
0,150900900000.0


In [38]:
# Calculate the average gross of films that start with A
pd.read_sql("""SELECT AVG(gross) AS avg_gross_A
               FROM films
               WHERE title LIKE 'A%';""",
            con)

Unnamed: 0,avg_gross_A
0,47893240.0


In [39]:
# Calculate the lowest gross film in 1994
pd.read_sql("""SELECT MIN(gross) AS lowest_gross
               FROM films
               WHERE release_year = 1994;""",
            con)

Unnamed: 0,lowest_gross
0,125169.0


In [40]:
# Calculate the highest gross film released between 2000-2012
pd.read_sql("""SELECT MAX(gross) AS highest_gross
               FROM films
               WHERE release_year BETWEEN 2000 AND 2012;""",
            con)

Unnamed: 0,highest_gross
0,760505847.0


In [41]:
# Calculate the average budget rounded to the thousands
pd.read_sql("""SELECT ROUND(AVG(budget), -3) AS avg_budget_thousands
               FROM films;""",
            con)

Unnamed: 0,avg_budget_thousands
0,39902826.0


In [42]:
# Calculate duration in hours for all films
pd.read_sql("""SELECT title, ROUND((duration / 60.0), 2) AS duration_hours
               FROM films;""",
            con)

Unnamed: 0,title,duration_hours
0,Intolerance: Love's Struggle Throughout the Ages,2.05
1,Over the Hill to the Poorhouse,1.83
2,The Big Parade,2.52
3,Metropolis,2.42
4,Pandora's Box,1.83
...,...,...
4963,Unforgotten,0.75
4964,Wings,0.50
4965,Wolf Creek,
4966,Wuthering Heights,2.37


In [43]:
# Find the number of decades in the films table
pd.read_sql("""SELECT ROUND((MAX(release_year) - MIN(release_year)) / 10.0, 1) AS number_of_decades
               FROM films;""",
            con)

Unnamed: 0,number_of_decades
0,10.0


In [44]:
# Select the title and duration from longest to shortest film
pd.read_sql("""SELECT title, ROUND(duration / 60.0, 2) AS duration_hours
               FROM films
               WHERE duration IS NOT NULL
               ORDER BY duration DESC;""",
            con)

Unnamed: 0,title,duration_hours
0,Carlos,5.57
1,"Blood In, Blood Out",5.50
2,Heaven's Gate,5.42
3,The Legend of Suriyothai,5.00
4,Das Boot,4.88
...,...,...
4950,"10,000 B.C.",0.37
4951,Anger Management,0.37
4952,Wal-Mart: The High Cost of Low Price,0.33
4953,Vessel,0.23


In [45]:
# Select the release year, duration, and title sorted by release year and duration
pd.read_sql("""SELECT release_year, duration, title
               FROM films
               WHERE release_year IS NOT NULL
               ORDER BY release_year, duration;""",
            con)

Unnamed: 0,release_year,duration,title
0,1916.0,123.0,Intolerance: Love's Struggle Throughout the Ages
1,1920.0,110.0,Over the Hill to the Poorhouse
2,1925.0,151.0,The Big Parade
3,1927.0,145.0,Metropolis
4,1929.0,100.0,The Broadway Melody
...,...,...,...
4921,2016.0,144.0,13 Hours
4922,2016.0,144.0,X-Men: Apocalypse
4923,2016.0,147.0,Captain America: Civil War
4924,2016.0,156.0,The Wailing


In [46]:
# Select the certification, release year, and title sorted by certification and release year
pd.read_sql("""SELECT certification, release_year, title
               FROM films
               WHERE certification IS NOT NULL AND release_year IS NOT NULL
               ORDER BY certification, release_year;""",
            con)

Unnamed: 0,certification,release_year,title
0,Approved,1933.0,She Done Him Wrong
1,Approved,1935.0,Top Hat
2,Approved,1936.0,The Charge of the Light Brigade
3,Approved,1937.0,Snow White and the Seven Dwarfs
4,Approved,1937.0,The Prisoner of Zenda
...,...,...,...
4660,X,1986.0,The Texas Chainsaw Massacre 2
4661,X,1987.0,A Nightmare on Elm Street 3: Dream Warriors
4662,X,1987.0,Evil Dead II
4663,X,1989.0,A Nightmare on Elm Street 5: The Dream Child


### GROUP BY is a SQL keyword that allows to group and summarize results with the additional use of aggregate functions.

In [47]:
# Find the release_year and film_count of each year
pd.read_sql("""SELECT release_year, COUNT(title) AS film_count
               FROM films
               GROUP BY release_year;""",
            con)

Unnamed: 0,release_year,film_count
0,,42
1,1916.0,1
2,1920.0,1
3,1925.0,1
4,1927.0,1
...,...,...
87,2012.0,220
88,2013.0,236
89,2014.0,252
90,2015.0,226


In [48]:
# Find the release_year and average duration of films for each year
pd.read_sql("""SELECT release_year, ROUND(AVG(duration)) AS avg_duration
               FROM films
               GROUP BY release_year;""",
            con)

# Using GROUP BY with a time or date field 
# such as release_year can help us identify trends such as a period of time where movies were really short

Unnamed: 0,release_year,avg_duration
0,,77.0
1,1916.0,123.0
2,1920.0,110.0
3,1925.0,151.0
4,1927.0,145.0
...,...,...
87,2012.0,106.0
88,2013.0,108.0
89,2014.0,105.0
90,2015.0,106.0


In [49]:
# Find the release_year, country, and max_budget, then group and order by release_year and country
pd.read_sql("""SELECT release_year, country, MAX(budget) AS max_budget
               FROM films
               WHERE release_year IS NOT NULL AND country IS NOT NULL
               GROUP BY release_year, country
               ORDER BY release_year, country;""",
            con)

Unnamed: 0,release_year,country,max_budget
0,1916.0,USA,385907.0
1,1920.0,USA,100000.0
2,1925.0,USA,245000.0
3,1927.0,Germany,6000000.0
4,1929.0,Germany,
...,...,...,...
489,2016.0,Mexico,3000000.0
490,2016.0,Panama,20000000.0
491,2016.0,South Korea,12620000.0
492,2016.0,UK,175000000.0


In [50]:
# Quiz: which year had the greatest language diversity?

pd.read_sql("""SELECT release_year, COUNT(DISTINCT language) AS lang_count
               FROM films
               GROUP BY release_year
               ORDER BY lang_count DESC
               LIMIT 1;""",
            con)

Unnamed: 0,release_year,lang_count
0,2006.0,16


In [51]:
# find out which countries (or country) have the most varied film certifications (more than 10)

pd.read_sql("""SELECT country, COUNT(DISTINCT certification) AS certification_count
               FROM films
               GROUP BY country
               HAVING COUNT(DISTINCT certification) > 10;""",
            con)

Unnamed: 0,country,certification_count
0,USA,12


In [52]:
# what countries have the highest average film budgets (more than one billion)
pd.read_sql("""SELECT country, AVG(budget) AS average_budget
               FROM films
               GROUP BY country
               HAVING AVG(budget) > 1000000000
               ORDER BY average_budget DESC;""",
            con)

Unnamed: 0,country,average_budget
0,South Korea,1383960000.0
1,Hungary,1260000000.0


In [53]:
# write a query that returns the average budget and gross earnings for films each year after 1990 
# if the average budget is greater than 60 million

pd.read_sql("""SELECT release_year, ROUND(AVG(budget)) AS avg_budget, ROUND(AVG(gross)) AS avg_gross
               FROM films
               WHERE release_year > 1990
               GROUP BY release_year
               HAVING AVG(budget) > 60000000
               ORDER BY avg_gross DESC;""",
            con)

Unnamed: 0,release_year,avg_budget,avg_gross
0,2005.0,70323938.0,41159143.0
1,2006.0,93968930.0,39237856.0


# Queries to table "reviews"

### Filtering with WHERE allows you to analyze your data better.  WHERE can also filter string values.

In [54]:
# Select film_ids and imdb_score with an imdb_score over 7.0
pd.read_sql("""SELECT film_id, imdb_score
               FROM reviews
               WHERE imdb_score > 7.0;""",
            con)

Unnamed: 0,film_id,imdb_score
0,3934,7.1
1,74,7.6
2,1254,8.0
3,4841,8.1
4,3252,7.2
...,...,...
1531,199,8.0
1532,1814,7.2
1533,4158,8.0
1534,4086,7.1


In [55]:
# Select film_ids and facebook_likes for ten records with less than 1000 likes

pd.read_sql("""SELECT film_id, facebook_likes
               FROM reviews
               WHERE facebook_likes < 1000
               LIMIT 10;""",
            con)

Unnamed: 0,film_id,facebook_likes
0,3405,0
1,478,491
2,74,930
3,740,0
4,2869,689
5,1181,0
6,2020,0
7,2312,912
8,1820,872
9,831,975


In [56]:
# Count the records with at least 100,000 votes

pd.read_sql("""SELECT COUNT(num_votes) AS films_over_100K_votes
               FROM reviews
               WHERE num_votes > 100000;""",
            con)

Unnamed: 0,films_over_100K_votes
0,1211


### Aggregate functions work great with numerical values; however, these results can sometimes get unwieldy when dealing with long decimal values. Luckily, SQL provides us with the ROUND() function to tame these long decimals.

In [62]:
# Round the average number of facebook_likes to one decimal place

pd.read_sql("""SELECT ROUND(AVG(facebook_likes), 1) AS avg_facebook_likes
               FROM reviews;""",
            con)

# This insight can be used as a benchmark to measure film reviews; 
# any film with over 7802.9 likes can be considered popular.

Unnamed: 0,avg_facebook_likes
0,7802.9


In [59]:
con.close()