## SQL 2: Basic SQL clauses

In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
import os
import gc

In [2]:
# Read the connection URL from the environment so real credentials never need to be
# committed with the notebook; the fallback is the URL this notebook originally hardcoded.
db_url = os.environ.get(
    "CS639_DB_URL",
    "mysql+mysqlconnector://root:abc@127.0.0.1:3306/cs639",
)
engine = create_engine(db_url)
# A single connection is shared by every query cell below
conn = engine.connect()

In [3]:
# List the tables that currently exist in the connected database
conn.execute(text("show tables;")).fetchall()

[('name_basics',),
 ('title_akas',),
 ('title_basics',),
 ('title_crew',),
 ('title_episode',),
 ('title_principals',),
 ('title_ratings',)]

### IMDB dataset

- Source: https://datasets.imdbws.com/ 
- Original dataset is too large to be analyzed using our current VM
- Schema information: https://developer.imdb.com/non-commercial-datasets/

Let's download a sampled version of the dataset.

In [None]:
!wget https://ms.sites.cs.wisc.edu/cs639/data/IMDB.zip

In [None]:
!unzip IMDB.zip

#### Populating MySQL server with tables corresponding to all tsv files

In [None]:
# Collect the TSV files in the working directory; each one becomes a MySQL table.
files = os.listdir()
# Match on the extension only (a substring test would also pick up e.g. "x.tsv.bak"),
# and sort because os.listdir() returns entries in arbitrary order.
tsv_files = sorted(f for f in files if f.endswith(".tsv"))
# Table name = file name without the trailing ".tsv", with the remaining dots turned
# into underscores (dots are not usable in an unquoted table name).
table_names = [f[: -len(".tsv")].replace(".", "_") for f in tsv_files]

In [None]:
# Load every TSV into its own MySQL table, replacing any table left over from an earlier run.
# NOTE(review): pandas documents that a Connection which is already inside a transaction
# is not committed by to_sql; `conn` has already executed a statement earlier in this
# notebook, so on SQLAlchemy 2.x an explicit conn.commit() after the loop may be needed
# for the rows to persist beyond this connection — confirm against the installed version.
for tsv_file, table_name in zip(tsv_files, table_names):
    # The literal two-character string \N is read as a missing value
    df = pd.read_csv(tsv_file, sep="\t", na_values='\\N')
    df.to_sql(table_name, conn, index=False, if_exists="replace")
    print(f"Populated {table_name}")

In [None]:
# Confirm that one table per TSV file now exists
conn.execute(text("show tables;")).fetchall()

### Explore the tables

In [None]:
# First five rows of name_basics
pd.read_sql(sql="SELECT * FROM name_basics LIMIT 5", con=conn)

In [None]:
# First five rows of title_akas
pd.read_sql(sql="SELECT * FROM title_akas LIMIT 5", con=conn)

In [None]:
# First five rows of title_basics
pd.read_sql(sql="SELECT * FROM title_basics LIMIT 5", con=conn)

In [None]:
# First five rows of title_crew
pd.read_sql(sql="SELECT * FROM title_crew LIMIT 5", con=conn)

In [None]:
# First five rows of title_episode
pd.read_sql(sql="SELECT * FROM title_episode LIMIT 5", con=conn)

In [None]:
# First five rows of title_principals
pd.read_sql(sql="SELECT * FROM title_principals LIMIT 5", con=conn)

In [None]:
# First five rows of title_ratings
pd.read_sql(sql="SELECT * FROM title_ratings LIMIT 5", con=conn)

### Data Analysis

#### Q1: What are the movies?

In [None]:
# Keep only the title_basics rows whose titleType is 'movie'
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE titleType = 'movie'
""", con=conn)

#### Q2: What are all the movie titles and their corresponding release years? Eliminate movies without release years.

In which table can we find this data?

In [None]:
# Project just the title and year; IS NOT NULL drops movies with no recorded startYear
pd.read_sql(sql="""
    SELECT primaryTitle, startYear
    FROM title_basics
    WHERE titleType = 'movie' AND startYear IS NOT NULL
""", con=conn)

#### Q3: How many movies are in this dataset?

In [None]:
# COUNT(*) counts the rows that survive the WHERE filter
pd.read_sql(sql="""
    SELECT COUNT(*) AS TotalMovies
    FROM title_basics
    WHERE titleType = 'movie'
""", con=conn)

#### Q4: What are all the types of titles in this dataset?

In [None]:
# DISTINCT returns each titleType value once
pd.read_sql(sql="""
    SELECT DISTINCT titleType
    FROM title_basics
""", con=conn)

#### Q5: How many title types are there in this dataset?

In [None]:
# COUNT(DISTINCT ...) counts the unique titleType values rather than the rows
pd.read_sql(sql="""
    SELECT COUNT(DISTINCT titleType)
    FROM title_basics
""", con=conn)

#### Q6: What are all the movies that got released in 2023?

In [None]:
# Both conditions must hold: released in 2023 and of type 'movie'
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE startYear = 2023 AND titleType = 'movie'
""", con=conn)

#### Q7: What is the average rating of all titles?

In [None]:
# Average over every row of title_ratings. There is no titleType filter here, so the
# result covers all titles, and the column alias says so.
pd.read_sql("""
    SELECT AVG(averageRating) AS avg_title_rating
    FROM title_ratings
""", conn)

#### Q8: What are all the movies that have runtime greater than 2 hours?

In [None]:
# 2 hours = 120 minutes; runtimeMinutes is stored in minutes
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE runtimeMinutes > 120 and titleType = 'movie'
""", con=conn)

#### Q9: What are all the "Comedy" movies?

In [None]:
# The question asks for Comedy *movies*, so restrict titleType as well as the genre.
# genres holds a comma-separated list, hence the LIKE pattern match instead of '='.
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE titleType = 'movie' AND genres LIKE '%Comedy%'
""", conn)

#### Q10: Find the total number of people in the dataset.

In [7]:
# One row per person in name_basics, so the row count is the number of people
pd.read_sql(sql="""
    SELECT COUNT(*) AS totalPeople
    FROM name_basics
""", con=conn)

Unnamed: 0,totalPeople
0,2762


#### Q11: What are the distinct primary professions of individuals in the dataset?

In [14]:
# DISTINCT works on the whole primaryProfession string, so each comma-separated
# combination (e.g. "actress,director,producer") is returned as its own value
pd.read_sql(sql="""
    SELECT DISTINCT primaryProfession
    FROM name_basics
    WHERE primaryProfession IS NOT NULL
""", con=conn)

Unnamed: 0,primaryProfession
0,actor
1,miscellaneous
2,"actress,director,producer"
3,art_department
4,"art_department,production_designer,set_decorator"
...,...
415,"actor,art_department"
416,"editorial_department,actor"
417,"writer,actor,editorial_department"
418,"visual_effects,director,editor"


#### Q12: What is the total number of alternate titles listed in the dataset?

In [18]:
# Count every row of title_akas
pd.read_sql(sql="""
    SELECT COUNT(*) AS totalAlternateTitles
    FROM title_akas;
""", con=conn)

Unnamed: 0,totalAlternateTitles
0,12421


#### Q13: What is the total runtime in the dataset?

In [None]:
# Add up runtimeMinutes across all titles that have a runtime recorded
pd.read_sql(sql="""
    SELECT SUM(runtimeMinutes) AS totalRuntime
    FROM title_basics
    WHERE runtimeMinutes IS NOT NULL;
""", con=conn)

#### Q14: What are the regions where alternate titles are available?

In [20]:
# Unique region codes in title_akas, skipping rows with no region
pd.read_sql(sql="""
    SELECT DISTINCT region
    FROM title_akas
    WHERE region IS NOT NULL;
""", con=conn)

Unnamed: 0,region
0,US
1,RU
2,ES
3,FR
4,XWW
...,...
75,PK
76,HT
77,MG
78,DO


#### Q15: List the titles of movies along with their runtimes converted from minutes to hours.

In [21]:
# Derive a computed column: minutes divided by 60 gives the runtime in hours
pd.read_sql(sql="""
    SELECT 
        primaryTitle, runtimeMinutes, 
        runtimeMinutes / 60 AS runTimeHours
    FROM title_basics
    WHERE titleType = "movie" AND runtimeMinutes IS NOT NULL;
""", con=conn)

Unnamed: 0,primaryTitle,runtimeMinutes,runTimeHours
0,Unseen Enemies,54.0,0.900000
1,Coming Out Party,80.0,1.333333
2,The Sheik Steps Out,65.0,1.083333
3,The Fallen Sparrow,94.0,1.566667
4,Oath of Vengeance,57.0,0.950000
...,...,...,...
117,Ordinary Gods,107.0,1.783333
118,Making Masculine,51.0,0.850000
119,A Song or Two to Make You Feel,54.0,0.900000
120,Los hombres sin rostros,59.0,0.983333


#### Q16: What are all movies that got released between 2000 and 2010, inclusive?

In [25]:
# BETWEEN includes both endpoints, so 2000 and 2010 are part of the range
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE titleType = "movie" AND startYear BETWEEN 2000 AND 2010
""", con=conn)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0128154,movie,Daybreak,Daybreak,0,2002.0,,87.0,"Crime,Mystery,Thriller"
1,tt0217355,movie,Dancing at the Blue Iguana,Dancing at the Blue Iguana,0,2000.0,,123.0,"Drama,Mystery"
2,tt0228992,movie,An Outgoing Woman,Une femme d'extérieur,0,2000.0,,118.0,Drama
3,tt0268446,movie,Mask of Desire,Mukundo,0,2000.0,,105.0,Drama
4,tt0326988,movie,I'll Sing for You,Je chanterai pour toi,0,2001.0,,76.0,"Biography,Documentary,Drama"
5,tt0337857,movie,The Beat,The Beat,0,2003.0,,85.0,"Action,Comedy,Drama"
6,tt0349688,movie,A Little Bit of Freedom,Kleine Freiheit,0,2003.0,,102.0,Drama
7,tt0354836,movie,Podium,Podium,0,2004.0,,95.0,"Comedy,Music"
8,tt0430891,movie,Aria,Aria,0,2004.0,,57.0,Drama
9,tt0446789,movie,Standalone,Standalone,0,2005.0,,108.0,"Action,Crime,Drama"


### SQL Subqueries

#### Q17: What is the shortest movie released after 2010?

In [None]:
# Step 1: the smallest runtime among movies released after 2010 (a single value)
pd.read_sql(sql="""
    SELECT MIN(runtimeMinutes)
    FROM title_basics
    WHERE startYear > 2010 AND titleType = 'movie' AND runtimeMinutes IS NOT NULL
""", con=conn)

In [None]:
# Step 2: use that minimum as a scalar subquery to fetch the matching movie rows.
# The outer query repeats the startYear > 2010 filter; without it, an older movie
# that happens to have the same runtime would be returned too.
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE titleType = 'movie' AND startYear > 2010 AND runtimeMinutes = (
        SELECT MIN(runtimeMinutes)
        FROM title_basics
        WHERE startYear > 2010 AND titleType = 'movie' AND runtimeMinutes IS NOT NULL
    )
""", conn)

#### Q18: What is the longest movie released after 2010?

In [None]:
# The scalar subquery finds the largest runtime among movies released after 2010.
# The outer query repeats the startYear > 2010 filter; without it, an older movie
# that happens to have the same runtime would be returned too.
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE titleType = 'movie' AND startYear > 2010 AND runtimeMinutes = (
        SELECT MAX(runtimeMinutes)
        FROM title_basics
        WHERE startYear > 2010 AND titleType = 'movie' AND runtimeMinutes IS NOT NULL
    )
""", conn)

#### Q19: What are the titles that have a runtime greater than the average runtime of all movies?

In [None]:
# Step 1: the average runtime of *movies*. The titleType filter is required because
# title_basics also holds other title types, which would otherwise enter the average.
pd.read_sql("""
    SELECT AVG(runtimeMinutes)
    FROM title_basics
    WHERE titleType = 'movie' AND runtimeMinutes IS NOT NULL
""", conn)

In [None]:
# Step 2: every title (any type) that runs longer than the average *movie* runtime.
# Only the subquery is restricted to movies; the outer query stays unfiltered
# because the question asks about titles in general.
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE runtimeMinutes > (
        SELECT AVG(runtimeMinutes)
        FROM title_basics
        WHERE titleType = 'movie' AND runtimeMinutes IS NOT NULL
    )
""", conn)

#### Q20: What are the most recent movies?

In [None]:
# Step 1: the latest startYear among movies
pd.read_sql(sql="""
    SELECT MAX(startYear)
    FROM title_basics
    WHERE titleType = 'movie'
""", con=conn)

In [None]:
# Step 2: movies whose startYear equals that maximum; the titleType filter appears
# in both the subquery and the outer query
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE startYear = (
        SELECT MAX(startYear)
        FROM title_basics
        WHERE titleType = 'movie'
    ) AND titleType = 'movie'
""", con=conn)

#### Q21: Find the number of movies that have more than one genre.

We can find the number of genres by counting the number of commas and adding 1 to that count. Let's first determine the length of the genres column.

In [None]:
# Show each genres string next to its length
pd.read_sql(sql="""
    SELECT genres, LENGTH(genres)
    FROM title_basics
""", con=conn)

To find the number of commas, we can replace the commas with nothing and take the difference between the length of the original string and the length of the replaced string.

In [None]:
# Length difference after stripping commas = number of commas; adding 1 gives the genre count
pd.read_sql(sql="""
    SELECT genres, LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1
    FROM title_basics
""", con=conn)

Now putting it together in a subquery.

In [None]:
# Count the movies whose genre count (commas + 1) is greater than 1.
# The titleType filter is needed because the question asks about movies and
# title_basics also contains other title types.
pd.read_sql("""
    SELECT COUNT(*)
    FROM title_basics
    WHERE titleType = 'movie' AND (
        SELECT LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1
    ) > 1;
""", conn)

#### Q22: Find the titles of movies that have the maximum number of genres.

In [None]:
# Compare each movie's genre count (commas + 1) with the largest genre count of any movie.
# Both the outer query and the MAX subquery are restricted to movies: the question asks
# about movies, and a non-movie title with more genres would otherwise set the maximum.
pd.read_sql("""
    SELECT primaryTitle, genres
    FROM title_basics
    WHERE titleType = 'movie' AND (
        SELECT LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1
    ) = (
        SELECT MAX(LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1)
        FROM title_basics
        WHERE titleType = 'movie'
    )
""", conn)

#### Q23: Find the titles of movies that belong to the same genres as those with a runtime longer than 150 minutes.

In [28]:
# Step 1: the genres strings of movies that run longer than 150 minutes
pd.read_sql(sql="""
        SELECT genres
        FROM title_basics
        WHERE titleType = "movie" AND runtimeMinutes > 150 AND genres IS NOT NULL
""", con=conn)

Unnamed: 0,genres
0,Drama


In [27]:
# Step 2: keep every title whose genres string exactly equals one returned by the
# subquery (movies longer than 150 minutes with a non-NULL genres value).
# NOTE(review): the outer query has no titleType filter, so tvMovie/tvEpisode rows are
# returned although the question asks for movies — confirm whether that is intended.
pd.read_sql(sql="""
    SELECT *
    FROM title_basics
    WHERE genres IN (
        SELECT genres
        FROM title_basics
        WHERE titleType = "movie" AND runtimeMinutes > 150 AND genres IS NOT NULL
)
""", con=conn)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0024996,movie,Coming Out Party,Coming Out Party,0,1934.0,,80.0,Drama
1,tt0084219,tvMovie,Die Komplizen,Die Komplizen,0,1985.0,,,Drama
2,tt0098516,movie,Trois pommes à côté du sommeil,Trois pommes à côté du sommeil,0,1989.0,,98.0,Drama
3,tt0101806,tvMovie,Elsa,Elsa,0,1991.0,,105.0,Drama
4,tt0173156,movie,Saajan Ka Ghar,Saajan Ka Ghar,0,1994.0,,153.0,Drama
...,...,...,...,...,...,...,...,...,...
292,tt9655972,tvEpisode,Episode #1.151,Episode #1.151,0,2017.0,,,Drama
293,tt9685774,tvMovie,The Farewell Girls,The Farewell Girls,0,2017.0,,86.0,Drama
294,tt9768578,tvEpisode,Episode #1.701,Episode #1.701,0,2012.0,,,Drama
295,tt9801116,tvEpisode,Episode #1.308,Episode #1.308,0,2010.0,,,Drama


### JOINs

#### Q24: Find all movies and their corresponding ratings.

In [37]:
# Inner join on tconst: only movies that have a row in title_ratings are returned
pd.read_sql(sql="""
    SELECT b.primaryTitle, r.averageRating
    FROM title_basics b
    JOIN title_ratings r ON b.tconst = r.tconst
    WHERE b.titleType = 'movie'
""", con=conn)

Unnamed: 0,primaryTitle,averageRating
0,Unseen Enemies,4.8
1,Coming Out Party,5.9
2,The Sheik Steps Out,6.0
3,The Fallen Sparrow,6.6
4,Oath of Vengeance,5.7
...,...,...
90,"Horror, Madness & Mayhem Vol 1 Snuff Party",7.2
91,Natha Pure Aata,4.9
92,Ordinary Gods,8.5
93,Los hombres sin rostros,6.8


#### Q25: List all movies and their associated genres.

In [41]:
# LEFT JOIN keeps every title_basics row and pairs it with each matching title_akas row.
# NOTE(review): both selected columns come from title_basics, so the join only repeats
# a title once per matching alternate-title row, and there is no titleType = 'movie'
# filter — confirm whether that is intended for this question.
pd.read_sql(sql="""
    SELECT b.primaryTitle, b.genres
    FROM title_basics b
    LEFT JOIN title_akas a ON b.tconst = a.titleId;
""", con=conn)

Unnamed: 0,primaryTitle,genres
0,The Indian Runner's Romance,"Short,Western"
1,The Indian Runner's Romance,"Short,Western"
2,The Indian Runner's Romance,"Short,Western"
3,The Cashier,"Animation,Comedy,Short"
4,The Cashier,"Animation,Comedy,Short"
...,...,...
12424,Episode #1.372,"Action,Crime,Drama"
12425,Episode #1.372,"Action,Crime,Drama"
12426,Episode #1.372,"Action,Crime,Drama"
12427,Episode #1.372,"Action,Crime,Drama"


#### Q26: Find all crew members and the movies they worked on.

In [43]:
# RIGHT JOIN keeps every title_basics row, including titles with no title_crew match.
# SELECT * returns tconst from both tables (shown as tconst and tconst.1 in the output).
# NOTE(review): there is no titleType filter, so non-movie titles are included although
# the question mentions movies — confirm whether that is intended.
pd.read_sql(sql="""
    SELECT *
    FROM title_crew c
    RIGHT JOIN title_basics b ON c.tconst = b.tconst
""", con=conn)

Unnamed: 0,tconst,directors,writers,tconst.1,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000912,nm0000428,nm0853193,tt0000912,short,The Indian Runner's Romance,The Indian Runner's Romance,0,1909.0,,11.0,"Short,Western"
1,tt0013001,nm0279404,nm0279404,tt0013001,short,The Cashier,The Cashier,0,1922.0,,,"Animation,Comedy,Short"
2,tt0016344,nm0003226,nm0793684,tt0016344,movie,Shirayuri wa nageku,Shirayuri wa nageku,0,1925.0,,,
3,tt0017504,nm0569645,,tt0017504,movie,Unseen Enemies,Unseen Enemies,0,1925.0,,54.0,Western
4,tt0024996,nm0090007,"nm0306731,nm0881148,nm0489679",tt0024996,movie,Coming Out Party,Coming Out Party,0,1934.0,,80.0,Drama
...,...,...,...,...,...,...,...,...,...,...,...,...
2769,tt9836540,"nm1516005,nm8498176,nm4443289","nm9494566,nm2761502,nm6945450",tt9836540,tvEpisode,Episode #1.48,Episode #1.48,0,2010.0,,,Drama
2770,tt9837390,,,tt9837390,tvEpisode,Customizable Emoji Shirt,Customizable Emoji Shirt,0,2019.0,,,"Family,Short"
2771,tt9847426,nm1227859,nm0341311,tt9847426,tvEpisode,All Star Games 4,All Star Games 4,0,2019.0,,30.0,Game-Show
2772,tt9854186,,,tt9854186,tvEpisode,Casino Royale Pitch Meeting: Introducing The B...,Casino Royale Pitch Meeting: Introducing The B...,0,2019.0,,,Comedy
