# SQL Assignment
#### In this assignment you will be solving a series of SQL Puzzles using the sqlalchemy library

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sqlalchemy

In [2]:
# Location of the SQLite database file. Built from the current user's home
# directory rather than a hard-coded, machine-specific absolute path, so the
# notebook runs for anyone who saved "Twitter Database.db" in their Downloads folder.
# NOTE: despite its name, data_dir points at the database *file*, not a directory.
data_dir = Path.home() / "Downloads" / "Twitter Database.db"

In [3]:
# Display the resolved path to confirm it points at the expected database file.
data_dir

WindowsPath('C:/Users/rvand/Downloads/Twitter Database.db')

In [4]:
# SQLite URLs have the form sqlite:///<path to the database file>.
sqlite_url = f"sqlite:///{data_dir}"
engine = sqlalchemy.create_engine(sqlite_url)

In [5]:
# Open a connection on the engine; it is reused below to read the schema.
connection = engine.connect()

In [6]:
# List the tables in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement and also works on older releases.
sqlalchemy.inspect(engine).get_table_names()

['dataset', 'hashtag_donaldtrump', 'hashtag_joebiden']

### Description of the tables
The 'dataset' table has a row for every politician's Twitter profile (name, handle, party, etc. — not individual tweets) regarding the 2020 Election

The 'hashtag_donaldtrump' table has a row for every tweet written about Donald Trump

The 'hashtag_joebiden' table has a row for every tweet written about Joe Biden

### Here we are able to access the schema for the tables that we have in our database

In [7]:
# sqlite_master holds the CREATE statement of every object in the database.
# The SQL is wrapped in sqlalchemy.text() because SQLAlchemy 2.0 no longer
# accepts raw strings in Connection.execute(); text() also works on older releases.
schema_rows = connection.execute(sqlalchemy.text('SELECT sql FROM sqlite_master;'))
for row in schema_rows:
    print(row[0])

CREATE TABLE "dataset" (
	"Name"	TEXT,
	"Twitter_username"	TEXT,
	"Account_start_time"	TEXT,
	"Account_ID"	INTEGER,
	"Sex"	TEXT,
	"Birthplace"	TEXT,
	"Birthday"	TEXT,
	"Age"	INTEGER,
	"Instagram_username"	TEXT,
	"Political_party"	TEXT
)
CREATE TABLE "hashtag_donaldtrump" (
	"created_at"	TEXT,
	"tweet_id"	INTEGER,
	"tweet"	TEXT,
	"likes"	INTEGER,
	"retweet_count"	INTEGER,
	"source"	TEXT,
	"user_id"	INTEGER,
	"user_name"	TEXT,
	"user_screen_name"	TEXT,
	"user_description"	TEXT,
	"user_join_date"	TEXT,
	"user_followers_count"	INTEGER,
	"user_location"	TEXT,
	"lat"	INTEGER,
	"long"	INTEGER,
	"city"	TEXT,
	"country"	TEXT,
	"continent"	TEXT,
	"state"	TEXT,
	"state_code"	TEXT,
	"collected_at"	TEXT
)
CREATE TABLE "hashtag_joebiden" (
	"created_at"	TEXT,
	"tweet_id"	INTEGER,
	"tweet"	TEXT,
	"likes"	INTEGER,
	"retweet_count"	INTEGER,
	"source"	TEXT,
	"user_id"	INTEGER,
	"user_name"	TEXT,
	"user_screen_name"	TEXT,
	"user_description"	TEXT,
	"user_join_date"	TEXT,
	"user_followers_count"	INTEGER,


### Here's what our tables look like:

In [9]:
# Preview the first five rows of the politicians table.
query = "SELECT * FROM dataset LIMIT 5;"
pd.read_sql(sql=query, con=engine)

Unnamed: 0,Name,Twitter_username,Account_start_time,Account_ID,Sex,Birthplace,Birthday,Age,Instagram_username,Political_party
0,A. Donald McEachin,RepMcEachin,2017-01-03T00:00:00Z,816181091673448448,male,Germany,1961-10-10T00:00:00Z,59,repmceachin,Democratic Party
1,Aaron Michlewitz,RepMichlewitz,2010-06-27T00:00:00Z,160246973,male,United States of America,1978-01-01T00:00:00Z,42,,Democratic Party
2,Aaron Peskin,AaronPeskin,2010-11-13T00:00:00Z,215369273,male,United States of America,1964-06-17T00:00:00Z,56,apeskin52,Democratic Party
3,Aaron Peña,AaronPena,2007-10-31T00:00:00Z,9843332,male,United States of America,1959-06-08T00:00:00Z,61,,Republican Party
4,Aaron Schock,aaronschock,2009-03-12T00:00:00Z,23951197,male,United States of America,1981-05-28T00:00:00Z,39,aaronschock,Republican Party


In [10]:
# Preview the first five tweets written about Donald Trump.
query_1 = "SELECT * FROM hashtag_donaldtrump LIMIT 5;"
pd.read_sql(sql=query_1, con=engine)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,...,1860,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:01,1316529222748430336,"Usa 2020, Trump contro Facebook e Twitter: cop...",26,9,Social Mediaset,331617619,Tgcom24,MediasetTgcom24,Profilo ufficiale di Tgcom24: tutte le notizie...,...,1067661,,,,,,,,,2020-10-21 00:00:00.373216530
2,2020-10-15 00:00:02,1316529228091846912,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",...,1185,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
3,2020-10-15 00:00:02,1316529227471237120,2 hours since last tweet from #Trump! Maybe he...,0,0,Trumpytweeter,828355589206056960,Trumpytweeter,trumpytweeter,"If he doesn't tweet for some time, should we b...",...,32,,,,,,,,,2020-10-21 00:00:01.119649591
4,2020-10-15 00:00:08,1316529252301451264,You get a tie! And you get a tie! #Trump ‘s ra...,4,3,Twitter for iPhone,47413798,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",...,5393,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121


In [11]:
# Preview the first five tweets written about Joe Biden.
query_2 = "SELECT * FROM hashtag_joebiden LIMIT 5"
pd.read_sql(sql=query_2, con=engine)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,...,1860,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:18,1316529295859290112,#HunterBiden #HunterBidenEmails #JoeBiden #Joe...,0,0,Twitter for iPad,809904438,Cheri A. 🇺🇸,Biloximeemaw,"Locked and loaded Meemaw. Love God, my family ...",...,6628,,,,,,,,,2020-10-21 00:00:00.517827283
2,2020-10-15 00:00:20,1316529305006952448,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,0,0,Twitter Web App,3494182277,Flag Waver,Flag_Wavers,,...,1536,Golden Valley Arizona,46.304036,-109.171431,,United States of America,North America,Montana,MT,2020-10-21 00:00:01.035654566
3,2020-10-15 00:00:21,1316529308081557504,@chrislongview Watching and setting dvr. Let’s...,0,0,Twitter for iPhone,824259601201852416,Michelle Ferg,MichelleFerg4,,...,27,,,,,,,,,2020-10-21 00:00:01.553481849
4,2020-10-15 00:00:22,1316529312741253120,#censorship #HunterBiden #Biden #BidenEmails #...,1,0,Twitter Web App,1032806955356545024,the Gold State,theegoldstate,A Silicon Valley #independent #News #Media #St...,...,390,"California, USA",36.701463,-118.755997,,United States of America,North America,California,CA,2020-10-21 00:00:02.071309132


### Part 1: Using SQL to explore our data
SQL can be a great way to explore the tables in our database and learn a little more about them. In this section you will need to use clauses such as WHERE, ORDER BY, and LIMIT along with arithmetic operations.

i) How many records are in the 'dataset' table?

In [12]:
# COUNT(*) counts every row in the table.
beginner_query = "SELECT COUNT(*) FROM dataset; "
pd.read_sql(sql=beginner_query, con=engine)

Unnamed: 0,COUNT(*)
0,2514


ii) Find the total number of tweets about Donald Trump that had more than 20 likes

In [13]:
# Count only the Trump tweets whose like count exceeds 20.
warmup_query = "SELECT COUNT(*) FROM hashtag_donaldtrump WHERE likes > 20;"
pd.read_sql(sql=warmup_query, con=engine)

Unnamed: 0,COUNT(*)
0,35472


iii) Fill in the quotations for "avg_query" with the SQL query that would return the average number of likes for the Donald Trump tweets

In [14]:
# Mean number of likes across all Trump tweets.
avg_query = "SELECT AVG(likes) FROM hashtag_donaldtrump;"
pd.read_sql(sql=avg_query, con=engine)

Unnamed: 0,AVG(likes)
0,7.476039


iv) Repeat for the biden tweets

In [15]:
# Mean number of likes across all Biden tweets.
avg_joe = "SELECT AVG(likes) FROM hashtag_joebiden;"
pd.read_sql(sql=avg_joe, con=engine)

Unnamed: 0,AVG(likes)
0,10.160798


v) Repeat the above two queries, but for the retweet counts

In [16]:
# Mean retweet count across all Trump tweets (no outlier filtering yet).
retweet_trump = "SELECT AVG(retweet_count) FROM hashtag_donaldtrump;"
pd.read_sql(sql=retweet_trump, con=engine)

Unnamed: 0,AVG(retweet_count)
0,6950235000000.0


In [17]:
# Mean retweet count across all Biden tweets (no outlier filtering yet).
retweet_biden = "SELECT AVG(retweet_count) FROM hashtag_joebiden;"
pd.read_sql(sql=retweet_biden, con=engine)

Unnamed: 0,AVG(retweet_count)
0,1651482000000.0


vi) Hm... these average retweet counts seem a little crazy. Why do you think this is? Begin with the Donald Trump tweets and provide some queries to explain the nature of this extremely high average.

In [18]:
### Sorting with ORDER BY ... DESC brings the largest values to the top, which helps us locate outliers
### LIMIT keeps only the top rows; without it pandas would display just a truncated preview
### (the first and last few rows) of the full result
ordered_query = (
    "SELECT retweet_count FROM hashtag_donaldtrump WHERE retweet_count > 0\n"
    "ORDER BY retweet_count DESC LIMIT 10;"
)
pd.read_sql(sql=ordered_query, con=engine)

Unnamed: 0,retweet_count
0,1322588453280706560
1,1307989018173472768
2,1283028275988770816
3,1164609333247787008
4,834998734429683712
5,834998734429683712
6,126002903
7,20491
8,12181
9,9572


In [19]:
### From the above numbers we can see that the top 7 rows (126002903 and everything above it) are outliers in our dataset

vii) Explore the data and think of a way in which you could appropriately remove the outliers within this column to get a more reliable figure. Explain what you did and why (use as many cells as you need).

In [20]:
### Simplest approach: average only the tweets whose retweet_count is strictly below
### the smallest outlier found above (126002903)
viable_soln0 = "SELECT AVG(retweet_count) FROM hashtag_donaldtrump WHERE retweet_count < 126002903;"
pd.read_sql(sql=viable_soln0, con=engine)

Unnamed: 0,AVG(retweet_count)
0,1.699144


In [21]:
### Alternative approach: a quick search suggests the most-retweeted tweet got about 4.2 million retweets,
### so any retweet_count at or above that threshold can be treated as not viable
viable_soln1 = "SELECT AVG(retweet_count) FROM hashtag_donaldtrump WHERE retweet_count < 4200000;"
pd.read_sql(sql=viable_soln1, con=engine)

Unnamed: 0,AVG(retweet_count)
0,1.699144


In [22]:
### The solution should involve queries locating the outlier tweets and appropriately removing them, thus ending in a final
### answer of 1.699144

viii) Repeat your above process, but for the Biden tweets

In [23]:
# Same outlier hunt as for the Trump tweets: show the ten largest retweet counts.
ordered_query1 = (
    "SELECT retweet_count FROM hashtag_joebiden WHERE retweet_count > 0\n"
    "ORDER BY retweet_count DESC LIMIT 10;"
)
pd.read_sql(sql=ordered_query1, con=engine)

Unnamed: 0,retweet_count
0,1283028275988770816
1,376748694
2,126002903
3,63473
4,20615
5,18272
6,17893
7,17652
8,15859
9,14024


In [24]:
### An accepted answer must be less than or equal to the value this query returns
### (the average after dropping every retweet_count of 126002903 or more)
real_avg_joe = "SELECT AVG(retweet_count) FROM hashtag_joebiden WHERE retweet_count < 126002903;"
pd.read_sql(sql=real_avg_joe, con=engine)

Unnamed: 0,AVG(retweet_count)
0,2.131691


ix) Tricky puzzle: Use a SQL Query to derive the variance of the retweet_count for the Trump tweets. 

$\textbf{Hint:}$ SQLite (the database engine behind our sqlalchemy connection) does not have a built-in VARIANCE() function, therefore you will have to get a little creative.

In [25]:
# Population variance via the identity Var(X) = E[X^2] - (E[X])^2.
# No WHERE clause: the outliers are intentionally still included here.
var_query = (
    "SELECT AVG(retweet_count*retweet_count) - AVG(retweet_count)*AVG(retweet_count) "
    "FROM hashtag_donaldtrump;"
)
pd.read_sql(sql=var_query, con=engine)

Unnamed: 0,AVG(retweet_count*retweet_count) - AVG(retweet_count)*AVG(retweet_count)
0,8.092165999999999e+30


In [26]:
### Cross-check the SQL result: load the raw column and let NumPy compute the variance
all_trump = "SELECT retweet_count FROM hashtag_donaldtrump"
trump_retweets = pd.read_sql(all_trump, engine)['retweet_count']
np.var(trump_retweets)

8.092165799291619e+30

x) Final Boss Puzzle

Part 1: Find the number of tweets that are below the average retweet count that you found in part vii for the Trump tweets

In [27]:
# The scalar subquery recomputes the outlier-free average from part vii;
# the outer query counts the tweets that fall below it.
below_average = (
    "SELECT COUNT(*) FROM hashtag_donaldtrump \n"
    "WHERE retweet_count < (SELECT AVG(retweet_count) FROM hashtag_donaldtrump WHERE retweet_count < 126002903)"
)
pd.read_sql(sql=below_average, con=engine)

Unnamed: 0,COUNT(*)
0,878040


Part 2: Find the ratio between the number of tweets, excluding the outliers you found in part vii, that get below average retweet counts and the total number of tweets, excluding the outliers you found in part vii. 

$\textbf{Note:}$ In SQLite, '/' between two integers performs integer division (the ratio here would be truncated to 0), so just have your query return the numerator and denominator as separate values in a table. Make sure the ratio between the two values is equal to .904332565

In [28]:
# Two single-row subqueries are combined side by side:
#   below_average.counted -> tweets below the outlier-free average (numerator)
#   excluded.Total        -> all tweets once the outliers are excluded (denominator)
ratio_query = (
    "SELECT below_average.counted, excluded.Total FROM \n"
    "(SELECT COUNT(*) AS counted FROM hashtag_donaldtrump WHERE retweet_count < \n"
    "(SELECT AVG(retweet_count) FROM hashtag_donaldtrump WHERE retweet_count < 126002903)) AS below_average,\n"
    "(SELECT COUNT(retweet_count) AS Total FROM hashtag_donaldtrump WHERE retweet_count < 126002903) AS excluded;"
)
pd.read_sql(sql=ratio_query, con=engine)

Unnamed: 0,counted,Total
0,878040,970926


### Part 2: Joins
In this section you will need to use INNER JOIN, LEFT JOIN, RIGHT JOIN, GROUP BY, DISTINCT, ORDER BY, Aliases, as well as Subqueries.

i) Basic Inner Join: Find all of the politicians who tweeted about Donald Trump. Make sure there is only one row per politician. Answer this question first and use as many cells as needed: Which column should we be joining on and why? How can you tell?

$\textbf{Hint:}$ how can you use other fields to assess the validity of a column to join on between the two tables?

In [29]:
# Candidate join 1: match on the numeric account id. The join date and account
# start time are selected so the validity of the match can be checked by eye.
basic_join = (
    "SELECT DISTINCT Name, user_join_date, Account_start_time, tweet FROM dataset INNER JOIN hashtag_donaldtrump\n"
    "ON hashtag_donaldtrump.user_id = dataset.Account_ID;"
)
pd.read_sql(sql=basic_join, con=engine)

Unnamed: 0,Name,user_join_date,Account_start_time,tweet
0,Alan Grayson,2009-05-19 01:00:40,2009-05-18T00:00:00Z,"After giving this careful consideration, I hav..."
1,Alan Grayson,2009-05-19 01:00:40,2009-05-18T00:00:00Z,The Washington Post's Trump lie list:\nhttps:/...
2,Alan Grayson,2009-05-19 01:00:40,2009-05-18T00:00:00Z,The most corrupt President in history is makin...
3,Alan Grayson,2009-05-19 01:00:40,2009-05-18T00:00:00Z,"To avoid prosecution, Trump's next address wil..."
4,Bobby Rush,2011-05-25 20:59:25,2011-05-25T00:00:00Z,"Today, the #Trump @FCC doubled down on its dec..."
...,...,...,...,...
64,Steve Cohen,2010-07-02 15:10:21,2010-07-02T00:00:00Z,The #2020Census deadline is TODAY! \n\nIf just...
65,Steve Cohen,2010-07-02 15:10:21,2010-07-02T00:00:00Z,The #WhiteHouse ranked “ending the #COVID19 pa...
66,Steve Cohen,2010-07-02 15:10:21,2010-07-02T00:00:00Z,The Intelligence Director Who Is Undermining T...
67,Steve Cohen,2010-07-02 15:10:21,2010-07-02T00:00:00Z,Trump weighs firing FBI director after electio...


In [30]:
# Candidate join 2: match the tweet author's user_name against the politician's
# Twitter username, selecting the same columns for comparison.
basic_join2 = (
    "SELECT DISTINCT Name, user_join_date, Account_start_time, tweet FROM dataset INNER JOIN hashtag_donaldtrump\n"
    "ON hashtag_donaldtrump.user_name = dataset.twitter_username;"
)
pd.read_sql(sql=basic_join2, con=engine)

Unnamed: 0,Name,user_join_date,Account_start_time,tweet
0,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2018-12-29T00:00:00Z,"""Et si les détracteurs de #Trump, dans leurs r..."
1,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2018-12-29T00:00:00Z,Au sommaire du quotidien d'idées AOC ce jeudi ...
2,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2018-12-29T00:00:00Z,Au sommaire du quotidien d'idées AOC ce jeudi ...
3,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2018-12-29T00:00:00Z,Au sommaire du quotidien d'idées AOC ce lundi ...
4,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2018-12-29T00:00:00Z,Au sommaire du quotidien d'idées AOC ce mardi ...
5,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2010-04-28T00:00:00Z,"""Et si les détracteurs de #Trump, dans leurs r..."
6,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2010-04-28T00:00:00Z,Au sommaire du quotidien d'idées AOC ce jeudi ...
7,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2010-04-28T00:00:00Z,Au sommaire du quotidien d'idées AOC ce jeudi ...
8,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2010-04-28T00:00:00Z,Au sommaire du quotidien d'idées AOC ce lundi ...
9,Alexandria Ocasio-Cortez,2017-10-16 19:18:02,2010-04-28T00:00:00Z,Au sommaire du quotidien d'idées AOC ce mardi ...


In [31]:
### Clearly the first table provides more of the content that we are looking for.
### Any viable use of the other join backed by a reasonable explanation earns points.
### Using the other fields to check that user_join_date and Account_start_time match up, along with looking at the tweets,
### gives us enough information to go with the first option: joining user_id from the hashtag_donaldtrump table to
### Account_ID from the dataset table.

In [32]:
# DISTINCT collapses the one-row-per-tweet join result to one row per name.
solution_join = (
    "SELECT DISTINCT Name FROM dataset INNER JOIN hashtag_donaldtrump ON\n"
    "hashtag_donaldtrump.user_id = dataset.Account_ID;"
)
pd.read_sql(sql=solution_join, con=engine)

Unnamed: 0,Name
0,Alan Grayson
1,Bobby Rush
2,Brian Higgins
3,Brune Poirson
4,Daylin Leach
5,Donald Norcross
6,Donna Brazile
7,Eleanor Holmes Norton
8,Elly Schlein
9,Filemon Vela


$\textbf{True or False:}$ if we were to switch the order of the tables in the above query, the output would change. Explain your answer.

False; because we are using an inner join, we are not favoring one table over the other. In other words, no rows are kept regardless of whether the predicate is met. Rows are added to the resulting table of this query only when the predicate is met, and therefore the order of the tables does not matter.

ii) Write a query listing the names of the politicians who tweeted about Biden.

In [33]:
# Same id-based join as for the Trump tweets, now against the Biden tweets table.
biden_join = (
    "SELECT DISTINCT Name FROM dataset INNER JOIN hashtag_joebiden ON\n"
    "hashtag_joebiden.user_id = dataset.Account_ID;"
)
pd.read_sql(sql=biden_join, con=engine)

Unnamed: 0,Name
0,Abby Finkenauer
1,Alan Grayson
2,Brendan F. Boyle
3,Brian Sims
4,Brune Poirson
5,Carly Fiorina
6,Claudia Tenney
7,Craig Fitzhugh
8,Dan Lipinski
9,Daylin Leach


iii) How many records do the Biden tweets table and the politician tweets table share? Answer using a query.

In [34]:
# Every row produced by the inner join is a record shared by the two tables.
basic_count = (
    "SELECT COUNT(*) FROM dataset INNER JOIN hashtag_joebiden ON \n"
    "hashtag_joebiden.user_id = dataset.Account_ID;"
)
pd.read_sql(sql=basic_count, con=engine)

Unnamed: 0,COUNT(*)
0,84


iv) How many records do the Trump tweets table and politician tweets table share? Answer using a query.

In [35]:
# Every row produced by the inner join is a record shared by the two tables.
basic_trump_count = (
    "SELECT COUNT(*) FROM dataset INNER JOIN hashtag_donaldtrump ON \n"
    "hashtag_donaldtrump.user_id = dataset.Account_ID;"
)
pd.read_sql(sql=basic_trump_count, con=engine)

Unnamed: 0,COUNT(*)
0,85


v) Now return the number of tweets, in descending order, for each politician that tweeted about Trump.

In [36]:
# One joined row per tweet, so counting rows per Name gives tweets per politician;
# the result is sorted from most to fewest tweets.
num_tweets_join = (
    "SELECT Name, COUNT(Name) AS number_of_tweets FROM dataset INNER JOIN hashtag_donaldtrump \n"
    "ON hashtag_donaldtrump.user_id = dataset.Account_ID\n"
    "GROUP BY Name\n"
    "ORDER BY number_of_tweets DESC;"
)
pd.read_sql(sql=num_tweets_join, con=engine)

Unnamed: 0,Name,number_of_tweets
0,Steve Cohen,23
1,Norma Torres,16
2,Daylin Leach,7
3,Jack Kingston,5
4,Alan Grayson,4
5,Rory Lancman,3
6,Mark DeSaulnier,3
7,Kevin Brady,3
8,Jean Quan,3
9,Richard Painter,2


vi) Write a query that results in the average number of retweets, in ascending order, that each politician received when tweeting about Donald Trump along with their Political Party.

In [37]:
# Average retweets per politician (ascending) together with their party.
# Political_Party is included in GROUP BY: as a "bare" column outside the GROUP BY
# SQLite would take its value from an arbitrary row of each group, and most other
# SQL dialects reject such a query outright.
comp_join = """SELECT Name, AVG(retweet_count) AS avg_retweet, Political_Party FROM dataset INNER JOIN hashtag_donaldtrump
ON hashtag_donaldtrump.user_id = dataset.Account_ID
GROUP BY Name, Political_Party
ORDER BY avg_retweet;"""
pd.read_sql(comp_join, engine)

Unnamed: 0,Name,avg_retweet,Political_party
0,Rory Lancman,0.333333,Democratic Party
1,Donald Norcross,1.0,Democratic Party
2,Jim Langevin,1.0,Democratic Party
3,Jack Kingston,1.2,Republican Party
4,Jean Quan,1.666667,Democratic Party
5,Brian Higgins,3.0,Democratic Party
6,Kathy Castor,3.0,Democratic Party
7,Keith Rothfus,3.0,Republican Party
8,Alan Grayson,3.75,Democratic Party
9,Bobby Rush,4.0,Democratic Party


vii) Write a query that results in a table showing the politicians who tweeted about Trump and Biden during the election

$\textbf{Note:}$ This query can be written many different ways, but you can cross-reference your two resulting tables in parts i and ii to make sure that you have a correct output.

In [38]:
# trump_tweets / biden_tweets: politicians joined to each candidate's tweet table.
# Inner-joining the two subqueries on Account_ID keeps only the politicians who
# appear in both; DISTINCT removes the duplicates produced by multiple tweets.
both_candidate_query = (
    "SELECT DISTINCT biden_tweets.Name FROM \n"
    "(SELECT Account_ID, Name FROM dataset INNER JOIN hashtag_donaldtrump\n"
    "ON dataset.Account_ID = hashtag_donaldtrump.user_id) AS trump_tweets INNER JOIN\n"
    "(SELECT Account_ID, Name FROM dataset INNER JOIN hashtag_joebiden ON hashtag_joebiden.user_id = dataset.Account_ID)\n"
    "AS biden_tweets\n"
    "ON biden_tweets.Account_ID = trump_tweets.Account_ID;\n"
)
pd.read_sql(sql=both_candidate_query, con=engine)

Unnamed: 0,Name
0,Alan Grayson
1,Brune Poirson
2,Daylin Leach
3,Eleanor Holmes Norton
4,Elly Schlein
5,Jack Kingston
6,Jean Quan
7,Kevin Brady
8,Steve Cohen


viii) Part 1: Write a query such that it returns a table with all of the politicians who tweeted during the election along with the average retweet count and average likes of those who tweeted about Donald Trump. Order first by retweet count in descending order breaking ties by Name in descending order. Make sure there is one row per Politician.

In [40]:
# LEFT JOIN keeps every politician from 'dataset'; those without a matching
# Trump tweet get NULL averages. Sorted by average retweets, then Name, both descending.
left_join_basic = (
    "SELECT Name, AVG(retweet_count) AS AVG_RT, AVG(likes) AS AVG_likes\n"
    "FROM dataset LEFT JOIN hashtag_donaldtrump\n"
    "ON dataset.Account_ID = hashtag_donaldtrump.user_id\n"
    "GROUP BY Name\n"
    "ORDER BY AVG_RT DESC, Name DESC;"
)
pd.read_sql(sql=left_join_basic, con=engine)

Unnamed: 0,Name,AVG_RT,AVG_likes
0,Richard Painter,271.0,574.000000
1,Elly Schlein,216.0,2437.000000
2,Eleanor Holmes Norton,59.0,229.000000
3,Kevin Brady,28.0,86.666667
4,Brune Poirson,27.0,115.000000
...,...,...,...
1801,Aaron Schock,,
1802,Aaron Peña,,
1803,Aaron Peskin,,
1804,Aaron Michlewitz,,


Part 2: Write a query that can assure you that your answer to Part 1 has the correct number of records.

In [41]:
# One output row per distinct Name, so the number of rows returned here should
# equal the number of rows produced by the Part 1 query.
check_query = "SELECT COUNT(*) FROM dataset GROUP BY Name;"
pd.read_sql(sql=check_query, con=engine)

Unnamed: 0,COUNT(*)
0,1
1,1
2,1
3,1
4,1
...,...
1801,4
1802,1
1803,1
1804,1
