## SQL 1

In [1]:
# import statements
from sqlalchemy import create_engine, text
import pandas as pd
import os

#### Installation requirements

You can either run `pip3 install pandas` on your ssh session or try doing the installation from the notebook.

### Running linux commands from jupyter notebook

In [2]:
!pwd

/home/levmenzin/s25/lec/07-mysql


In [3]:
!ls

IMDB.zip	 sql2_template.ipynb  title.crew.tsv	    title.ratings.tsv
name.basics.tsv  title.akas.tsv       title.episode.tsv
sql2.ipynb	 title.basics.tsv     title.principals.tsv


In [4]:
!pip3 install pandas

Defaulting to user installation because normal site-packages is not writeable


In [5]:
engine = create_engine("mysql+mysqlconnector://root:abc@127.0.0.1:3306/cs639")
conn = engine.connect()

In [6]:
list(conn.execute(text("show tables;")))

[('name_basics',),
 ('title_akas',),
 ('title_basics',),
 ('title_crew',),
 ('title_episode',),
 ('title_principals',),
 ('title_ratings',)]

### Table creation

#### `Students` table
Let's create `Students` table with columns:
- `sid(int)` - primary key
- `name(text)` - required
- `gpa(float)`

In [None]:
list(conn.execute(text("show tables;")))

### Inserting data

Let's add a student: example: 101, "Alice", 3.2

In [None]:
conn.execute(text("""
    
"""))

In [None]:
conn.execute(text("""
    INSERT INTO Students (sid, name, gpa) 
    VALUES (123, "Bob", 3.8)
"""))

### Projection aka `SELECT` clause in SQL

Retrieving all or specific columns from a table.

### Updating data

Let's change Alice's GPA to 3.7

In [None]:
conn.execute(text("""
    
"""))

In [None]:
pd.read_sql("SELECT * FROM Students", conn)

#### `Courses` table
Let's create accounts `Courses` with columns:
- `cid(int)` - primary key
- `cname(text)` - required
- `credits(int)` - required

In [None]:
conn.execute(text("""
    create table Courses (
    cid int, 
    cname text ?, 
    credits int ?, 
    primary key(cid))
"""))

In [None]:
list(conn.execute(text("show tables;")))

### Table deletion

What if we wanted to delete a table?

In [None]:
conn.execute(text(""))

Let's recreate `Courses` table. This time, let's make `cid` type `VARCHAR(255)` instead of int.

In [None]:
conn.execute(text("""
    create table Courses (
    cid VARCHAR(255) PRIMARY KEY, 
    cname text NOT NULL, 
    credits int NOT NULL)
"""))

Let's insert the two courses from the slide example.

In [None]:
conn.execute(text("""
    INSERT INTO Courses (cid, cname, credits) 
    VALUES ("CS544", "Big Data", 3)
"""))
conn.execute(text("""
    INSERT INTO Courses (cid, cname, credits) 
    VALUES ("CS639", "Data Management", 3)
"""))

#### `Enrolled` table

Let's create `Enrolled` table with columns:
- sid(int) - foreign key
- cid(VARCHAR (255)) - foreign key
- grade(text)

In [None]:
conn.execute(text("""
    create table Enrolled (sid int, cid VARCHAR(255), grade text,
                           ?,
                           ?
"""))

In [None]:
list(conn.execute(text("show tables;")))

Let's add the erollments from the slide example.

In [None]:
conn.execute(text("""
    INSERT INTO Enrolled (sid, cid, grade) 
    VALUES (123, "CS544", "A")
"""))
conn.execute(text("""
    INSERT INTO Enrolled (sid, cid, grade) 
    VALUES (101, "CS639", "A")
"""))

In [None]:
pd.read_sql("SELECT * FROM Courses", conn)

What if we try to enroll a non-existing student?

In [None]:
# doesn't work - no foreign key mapping
conn.execute(text("""
    INSERT INTO Enrolled (sid, cid, grade) VALUES (10, "No one", "Nothing")
"""))

Commit the transaction.

In [None]:
conn.commit()

What if we try to delete Student with id 101 from Students table?

In [None]:
# doesn't work - foreign key prevents this
conn.execute(text("""
    DELETE FROM Students WHERE sid = 101
"""))

### Load CSVs to MySQL Tables

### Spotify dataset: https://ms.sites.cs.wisc.edu/cs639/data/spotify.zip

In [10]:
base_url = "https://ms.sites.cs.wisc.edu/cs639/data/"
df = pd.read_csv(base_url + "spotify.zip", compression="zip")
df

Unnamed: 0,Index,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
0,1,1,8,2021-07-23--2021-07-30,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",...,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
1,2,2,3,2021-07-23--2021-07-30,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],...,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
2,3,1,11,2021-06-25--2021-07-02,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],...,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A
3,4,3,5,2021-07-02--2021-07-09,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",...,0.808,0.897,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B
4,5,5,1,2021-07-23--2021-07-30,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",...,0.736,0.704,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,1552,195,1,2019-12-27--2020-01-03,New Rules,4630675,Dua Lipa,27167675,2ekn2ttSfGqwhhate0LSR0,"['dance pop', 'pop', 'uk pop']",...,0.762,0.7,-6.021,0.0694,0.00261,0.153,116.073,209320,0.608,A
1552,1553,196,1,2019-12-27--2020-01-03,Cheirosa - Ao Vivo,4623030,Jorge & Mateus,15019109,2PWjKmjyTZeDpmOUa3a5da,"['sertanejo', 'sertanejo universitario']",...,0.528,0.87,-3.123,0.0851,0.24,0.333,152.37,181930,0.714,B
1553,1554,197,1,2019-12-27--2020-01-03,Havana (feat. Young Thug),4620876,Camila Cabello,22698747,1rfofaqEpACxVEHIZBJe6W,"['dance pop', 'electropop', 'pop', 'post-teen ...",...,0.765,0.523,-4.333,0.03,0.184,0.132,104.988,217307,0.394,D
1554,1555,198,1,2019-12-27--2020-01-03,Surtada - Remix Brega Funk,4607385,"Dadá Boladão, Tati Zaqui, OIK",208630,5F8ffc8KWKNawllr5WsW0r,"['brega funk', 'funk carioca']",...,0.832,0.55,-7.026,0.0587,0.249,0.182,154.064,152784,0.881,F


In [11]:
pd.read_sql("SELECT * FROM songs", conn)

Unnamed: 0,Index,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord


## SQL 2: Basic SQL clauses

Let's drop all the tables that we created so far.

In [None]:
# you have to drop the table with foreign key constraint first

conn.execute(text("drop table Students"))

conn.execute(text("drop table songs"))

In [7]:
list(conn.execute(text("show tables;")))

[('name_basics',),
 ('title_akas',),
 ('title_basics',),
 ('title_crew',),
 ('title_episode',),
 ('title_principals',),
 ('title_ratings',)]

### IMDB dataset

- Source: https://datasets.imdbws.com/ 
- Original dataset is too large to be analyzed using our current VM
- Schema information: https://developer.imdb.com/non-commercial-datasets/

Let's download a sampled version of the dataset.

In [None]:
!
!
!wget https://ms.sites.cs.wisc.edu/cs639/data/IMDB.zip
!unzip IMDB.zip

#### Populating MySQL server with tables corresponding to all tsv files

In [None]:
files = os.listdir()
files

In [None]:
tsv_files = [f for f in files if ".tsv" in f]
tsv_files

In [None]:
table_names = [f.replace(".tsv", "") for f in tsv_files]
table_names = [f.replace(".", "_") for f in table_names]
table_names

In [11]:
list(conn.execute(text("show tables;")))

[('name_basics',),
 ('title_akas',),
 ('title_basics',),
 ('title_crew',),
 ('title_episode',),
 ('title_principals',),
 ('title_ratings',)]

### Explore the tables

In [12]:
# name_basics
pd.read_sql("SELECT * FROM name_basics LIMIT 5", conn)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm7644510,Valentin Malaescu,,,actor,tt5072918
1,nm4867615,Christopher Heimann,,,miscellaneous,tt2198043
2,nm0912420,Natalie Denise Sperl,,,"actress,director,producer","tt10618286,tt10750482,tt12200650,tt0369179"
3,nm6994121,Amie Stephens,,,art_department,"tt1957938,tt2945374"
4,nm0171239,Tom Coleman,1907.0,1978.0,"art_department,production_designer,set_decorator","tt0051221,tt0047879,tt0055992,tt0050000"


In [13]:
# title_akas
pd.read_sql("SELECT * FROM title_akas LIMIT 5", conn)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle


In [14]:
# title_basics
pd.read_sql("SELECT * FROM title_basics LIMIT 5", conn)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000912,short,The Indian Runner's Romance,The Indian Runner's Romance,0,1909.0,,11.0,"Short,Western"
1,tt0013001,short,The Cashier,The Cashier,0,1922.0,,,"Animation,Comedy,Short"
2,tt0016344,movie,Shirayuri wa nageku,Shirayuri wa nageku,0,1925.0,,,
3,tt0017504,movie,Unseen Enemies,Unseen Enemies,0,1925.0,,54.0,Western
4,tt0024996,movie,Coming Out Party,Coming Out Party,0,1934.0,,80.0,Drama


In [15]:
# title_crew
pd.read_sql("SELECT * FROM title_crew LIMIT 5", conn)

Unnamed: 0,tconst,directors,writers
0,tt0000912,nm0000428,nm0853193
1,tt0013001,nm0279404,nm0279404
2,tt0016344,nm0003226,nm0793684
3,tt0017504,nm0569645,
4,tt0024996,nm0090007,"nm0306731,nm0881148,nm0489679"


In [16]:
# title_episode
pd.read_sql("SELECT * FROM title_episode LIMIT 5", conn)

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0084769,tt0806910,1.0,140.0
1,tt0162689,tt0806910,1.0,203.0
2,tt0394320,tt0096542,2.0,18.0
3,tt0465362,tt0388656,,
4,tt0504908,tt0285351,3.0,16.0


In [17]:
# title_principals
pd.read_sql("SELECT * FROM title_principals LIMIT 5", conn)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000912,1,nm0601698,actor,,"[""Blue Cloud""]"
1,tt0000912,2,nm0288616,actor,,"[""The Old Prospector""]"
2,tt0000912,3,nm0681933,actress,,"[""Blue Cloud's Wife""]"
3,tt0000912,4,nm0424530,actor,,"[""Cowboy""]"
4,tt0000912,5,nm0456804,actor,,"[""Cowboy""]"


In [18]:
# title_ratings
pd.read_sql("SELECT * FROM title_ratings LIMIT 5", conn)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000912,4.3,73
1,tt0017504,4.8,27
2,tt0024996,5.9,66
3,tt0029553,6.0,45
4,tt0030476,6.2,81


### Data Analysis

#### Q1: What are the movies?

In [19]:
pd.read_sql("""
SELECT * FROM title_basics WHERE titleType = 'movie'
""", conn)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0016344,movie,Shirayuri wa nageku,Shirayuri wa nageku,0,1925.0,,,
1,tt0017504,movie,Unseen Enemies,Unseen Enemies,0,1925.0,,54.0,Western
2,tt0024996,movie,Coming Out Party,Coming Out Party,0,1934.0,,80.0,Drama
3,tt0029553,movie,The Sheik Steps Out,The Sheik Steps Out,0,1937.0,,65.0,Musical
4,tt0035860,movie,The Fallen Sparrow,The Fallen Sparrow,0,1943.0,,94.0,"Film-Noir,Mystery"
...,...,...,...,...,...,...,...,...,...
183,tt8787458,movie,Gado,Gado,0,,,,Western
184,tt8906732,movie,A Song or Two to Make You Feel,A Song or Two to Make You Feel,0,2018.0,,54.0,Music
185,tt9198442,movie,My Hero Academia,My Hero Academia,0,,,,"Action,Adventure,Animation"
186,tt9642604,movie,Los hombres sin rostros,Los hombres sin rostros,0,2016.0,,59.0,Documentary


#### Q2: What are all the movie titles and their corresponding release years? Eliminate movies without release years.

Which table can we find this data from?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
    WHERE titleType = 'movie'
""", conn)

#### Q3: How many movies are in this dataset?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
    WHERE titleType = 'movie'
""", conn)

#### Q4: What are all the types of titles in this dataset?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
""", conn)

#### Q5: How many title types are there in this dataset?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
""", conn)

#### Q6: What are all the movies that got released in 2023?

#### Q7: What is the average rating of all titles?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_ratings
""", conn)

#### Q8: What are all the movies that have runtime greater than 2 hours?

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE titleType = 'movie' and 
""", conn)

#### Q9: What are all the "Comedy" movies?

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE genres
""", conn)

#### Q10: Find the total number of people in the dataset.

In [20]:
pd.read_sql("""
    SELECT
    FROM name_basics
""", conn)

ProgrammingError: (mysql.connector.errors.ProgrammingError) 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'FROM name_basics' at line 2
[SQL: 
    SELECT
    FROM name_basics
]
(Background on this error at: https://sqlalche.me/e/20/f405)

#### Q11: What are the distinct primary professions of individuals in the dataset?

#### Q12: What are the total number of alternate titles listed in the dataset?

#### Q13: What is the total runtime in the dataset?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
    WHERE runtimeMinutes
""", conn)

#### Q14: What are the regions where alternate titles are available?

#### Q15: List the titles of movies along with their runtimes converted from minutes to hours.

In [None]:
pd.read_sql("""
    SELECT 
        primaryTitle, runtimeMinutes
    FROM title_basics
    WHERE titleType = "movie" AND runtimeMinutes IS NOT NULL;
""", conn)

#### Q16: What are all movies that got released between 2000 and 2010, inclusive?

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE titleType = "movie" AND 
""", conn)

### SQL Subqueries

#### Q17: What is the shortest movie released after 2010?

In [None]:
pd.read_sql("""
    SELECT
    FROM title_basics
    WHERE startYear > 2010 AND titleType = 'movie' AND runtimeMinutes IS NOT NULL
""", conn)

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE runtimeMinutes
""", conn)

#### Q18: What is the longest movie released after 2010?

#### Q19: What are the titles that have a runtime greater than the average runtime of all movies?

In [None]:
pd.read_sql("""
    SELECT 
    FROM title_basics
    WHERE runtimeMinutes IS NOT NULL
""", conn)

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE runtimeMinutes
""", conn)

#### Q20: What are the most recent movies?

#### Q21: Find the number of movies that have more than one genre.

We can find number of genres by simply counting number of commas and adding 1 to that count. Let's first determine length of genres column.

In [None]:
pd.read_sql("""
    SELECT genres, ?
    FROM title_basics
""", conn)

To find, number of commas, we can replace commas with nothing and find difference between original string and the replaced string.

In [None]:
pd.read_sql("""
    SELECT genres, LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1
    FROM title_basics
""", conn)

Now putting it together in a subquery.

In [None]:
pd.read_sql("""
    SELECT COUNT(*)
    FROM title_basics
    WHERE (
        SELECT LENGTH(genres) - LENGTH(REPLACE(genres, ',', '')) + 1
    ) > 1;
""", conn)

#### Q22: Find the titles of movies that have the maximum number of genres.

In [None]:
pd.read_sql("""
    SELECT primaryTitle, genres
    FROM title_basics
    WHERE 
""", conn)

#### Q23: Find the titles of movies that belong to the same genres as those with a runtime longer than 150 minutes.

In [None]:
pd.read_sql("""
    SELECT genres
    FROM title_basics
    WHERE titleType = "movie" AND runtimeMinutes > 150 AND genres IS NOT NULL
""", conn)

In [None]:
pd.read_sql("""
    SELECT *
    FROM title_basics
    WHERE genres 
""", conn)

### JOINs

#### Q24: Find all movies and their corresponding ratings.

In [None]:
pd.read_sql("""
    
""", conn)

#### Q25: List all movies and their associated genres.

In [None]:
pd.read_sql("""
    
""", conn)

#### Q26: Find all crew members and the movies they worked on.

In [None]:
pd.read_sql("""
    
""", conn)