In [1]:
import pandas as pd
import sqlite3

# Locations of the raw CSV export and of the SQLite file the queries run against.
csv_path = './data/tips.csv'
db_path = 'tips_database.db'

# Load the tips records into a DataFrame.
data = pd.read_csv(csv_path)

# Mirror the DataFrame into the `tips` table. if_exists='replace' rebuilds the
# table on every run, and index=False keeps the DataFrame index out of the table.
conn = sqlite3.connect(db_path)
data.to_sql(name='tips', con=conn, if_exists='replace', index=False)

# Cursor shared by the query cells that follow.
cursor = conn.cursor()

In [9]:
# Mean of the per-row tip/total_bill ratio, as a percentage rounded to two
# decimals, grouped by day and ordered by the first output column (day).
sql = (
    "SELECT day, round(avg(tip/total_bill)*100,2) as tip_pr "
    "FROM tips group by day order by 1;"
)
rows = cursor.execute(sql).fetchall()

# Column headers come from the cursor metadata: first field of each entry.
column_names = [col[0] for col in cursor.description]

print("Retrieve the average tip percentage for each day of the week")
pd.DataFrame(rows, columns=column_names).head()

Retrieve the average tip percentage for each day of the week


Unnamed: 0,day,tip_pr
0,Fri,16.99
1,Sat,15.32
2,Sun,16.69
3,Thur,16.13


In [14]:
# Largest and smallest total_bill over the whole tips table (single aggregate row).
cursor.execute("SELECT max(total_bill),min(total_bill) FROM tips;")
rows = cursor.fetchall()

# Column headers come from the cursor metadata; the aggregates are not aliased,
# so the headers are the SQL expressions themselves.
column_names = [description[0] for description in cursor.description]

# Caption corrected: the original printed "total bull amounts".
print("Find the maximum and minimum total bill amounts")
pd.DataFrame(rows, columns=column_names).head()

Find the maximum and minimum total bill amounts


Unnamed: 0,max(total_bill),min(total_bill)
0,50.81,3.07


In [26]:
# Number of rows (parties) for each party size, ordered by size.
sql = "SELECT size, count(*) FROM tips group by size order by 1;"
cursor.execute(sql)
rows = cursor.fetchall()

# Take the name (first field) of every description entry as the column header.
headers = [name for name, *_ in cursor.description]

print("Count the number of parties for each size")
# head(50) caps the display at 50 rows.
pd.DataFrame(rows, columns=headers).head(50)

Count the number of parties for each size


Unnamed: 0,size,count(*)
0,1,4
1,2,156
2,3,38
3,4,37
4,5,5
5,6,4


In [16]:
# Summed total_bill and tip over the rows with size >= 4 whose tip exceeds
# 15% of the bill; the result is a single aggregate row.
sql = (
    "SELECT sum(total_bill), sum(tip) FROM tips "
    "where size>=4 and (tip/total_bill)*100>15;"
)
rows = cursor.execute(sql).fetchall()
column_names = [col[0] for col in cursor.description]

print("Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%")
pd.DataFrame(rows, columns=column_names).head()

Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%


Unnamed: 0,sum(total_bill),sum(tip)
0,471.03,90.06


In [25]:
# Per (day, time) totals. tip_pr here is sum(tip)/sum(total_bill), i.e. the
# ratio of the totals rather than the mean of per-row ratios. Rows are ordered
# by the fifth output column (tip_pr), highest first.
sql = (
    "SELECT day, time, sum(total_bill), sum(tip), "
    "round(sum(tip)/sum(total_bill)*100,2) as tip_pr "
    "FROM tips group by day, time order by 5 desc;"
)
cursor.execute(sql)
rows = cursor.fetchall()
headers = [meta[0] for meta in cursor.description]

print("Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order")
pd.DataFrame(rows, columns=headers).head(50)

Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order


Unnamed: 0,day,time,sum(total_bill),sum(tip),tip_pr
0,Fri,Lunch,89.92,16.68,18.55
1,Thur,Dinner,18.78,3.0,15.97
2,Thur,Lunch,1077.55,168.83,15.67
3,Sun,Dinner,1627.16,247.39,15.2
4,Fri,Dinner,235.96,35.28,14.95
5,Sat,Dinner,1778.4,260.4,14.64


In [24]:
# Mean per-row tip percentage (rounded to two decimals) for every
# (day, time, smoker) group; the query has no ORDER BY clause.
sql = (
    "SELECT day, time, smoker, round(avg(tip/total_bill)*100,2) as tip_pr "
    "FROM tips group by 1,2,3;"
)
rows = cursor.execute(sql).fetchall()
headers = [name for name, *_ in cursor.description]

print("Find the average tip percentage for each combination of day, time, and smoker status")
pd.DataFrame(rows, columns=headers).head(50)

Find the average tip percentage for each combination of day, time, and smoker status


Unnamed: 0,day,time,smoker,tip_pr
0,Fri,Dinner,No,13.96
1,Fri,Dinner,Yes,16.53
2,Fri,Lunch,No,18.77
3,Fri,Lunch,Yes,18.89
4,Sat,Dinner,No,15.8
5,Sat,Dinner,Yes,14.79
6,Sun,Dinner,No,16.01
7,Sun,Dinner,Yes,18.73
8,Thur,Dinner,No,15.97
9,Thur,Lunch,No,16.03


In [19]:
# Totals per sex with tip_pr computed as sum(tip)/sum(total_bill); ordered by
# the second output column (summed total_bill) descending, at most 5 groups.
sql = (
    "SELECT sex, sum(total_bill), sum(tip), "
    "round(sum(tip)/sum(total_bill)*100,2) as tip_pr "
    "FROM tips group by 1 order by 2 desc limit 5;"
)
cursor.execute(sql)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]

print("Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records")
pd.DataFrame(rows, columns=column_names).head()

Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records


Unnamed: 0,sex,sum(total_bill),sum(tip),tip_pr
0,Male,3256.82,485.07,14.89
1,Female,1570.95,246.51,15.69


In [2]:
# The inner SELECT numbers the rows of each (day, time) partition twice: once by
# tip percentage descending (rank_max) and once ascending (rank_min). The outer
# SELECT then keeps, per group, the values of the row ranked 1 in each ordering,
# so each percentage is reported with the bill and tip of the same row.
query = """
SELECT day, time, 
       MAX(CASE WHEN rank_max = 1 THEN tip_percentage END) AS max_tip_percentage,
       MAX(CASE WHEN rank_max = 1 THEN total_bill END) AS max_total_bill,
       MAX(CASE WHEN rank_max = 1 THEN tip END) AS max_tip,
       MIN(CASE WHEN rank_min = 1 THEN tip_percentage END) AS min_tip_percentage,
       MIN(CASE WHEN rank_min = 1 THEN total_bill END) AS min_total_bill,
       MIN(CASE WHEN rank_min = 1 THEN tip END) AS min_tip
FROM (
    SELECT day, time, total_bill, tip,
           (tip / total_bill) * 100 AS tip_percentage,
           ROW_NUMBER() OVER (PARTITION BY day, time ORDER BY (tip / total_bill) * 100 DESC) AS rank_max,
           ROW_NUMBER() OVER (PARTITION BY day, time ORDER BY (tip / total_bill) * 100 ASC) AS rank_min
    FROM tips
)
GROUP BY day, time;
"""
rows = cursor.execute(query).fetchall()

# Headers are the aliases declared in the outer SELECT, read from the cursor.
headers = [meta[0] for meta in cursor.description]

print("Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount")
pd.DataFrame(rows, columns=headers).head(50)


Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount


Unnamed: 0,day,time,max_tip_percentage,max_total_bill,max_tip,min_tip_percentage,min_total_bill,min_tip
0,Fri,Dinner,26.348039,16.32,4.3,10.35554,28.97,3.0
1,Fri,Lunch,25.931446,13.42,3.48,11.773472,13.42,1.58
2,Sat,Dinner,32.57329,3.07,1.0,3.563814,32.83,1.17
3,Sun,Dinner,71.034483,7.25,5.15,5.944673,16.99,1.01
4,Thur,Dinner,15.974441,18.78,3.0,15.974441,18.78,3.0
5,Thur,Lunch,26.631158,7.51,2.0,7.296137,18.64,1.36


In [3]:
# Row-level listing: parties of 4 or more, tip above 15% of the bill, and a
# total_bill between 50 and 100 (BETWEEN includes both bounds).
query = """
SELECT total_bill, tip, (tip / total_bill) * 100 AS tip_pr
FROM tips
WHERE size >= 4
  AND (tip / total_bill) * 100 > 15
  AND total_bill BETWEEN 50 AND 100;
"""
rows = cursor.execute(query).fetchall()
column_names = [name for name, *_ in cursor.description]

print("Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100")
# head() shows at most the first five matching rows.
pd.DataFrame(rows, columns=column_names).head()


Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100


Unnamed: 0,total_bill,tip,tip_pr


In [4]:
# Mean tip percentage and row count per (day, time, smoker) group; HAVING
# drops every group with five or fewer rows.
query = """
SELECT day, time, smoker, 
       AVG((tip / total_bill) * 100) AS avg_tip_percentage,
       COUNT(*) AS record_count
FROM tips
GROUP BY 1,2,3
HAVING COUNT(*) > 5;
"""
rows = cursor.execute(query).fetchall()
headers = [col[0] for col in cursor.description]

print("Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records")
pd.DataFrame(rows, columns=headers).head(50)


Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records


Unnamed: 0,day,time,smoker,avg_tip_percentage,record_count
0,Fri,Dinner,Yes,16.534736,9
1,Fri,Lunch,Yes,18.893659,6
2,Sat,Dinner,No,15.804766,45
3,Sat,Dinner,Yes,14.790607,42
4,Sun,Dinner,No,16.011294,57
5,Sun,Dinner,Yes,18.725032,19
6,Thur,Lunch,No,16.031067,44
7,Thur,Lunch,Yes,16.386327,17


In [15]:
# Add 5 queries

In [13]:
# Mark the 10th row of `tips` (in rowid, i.e. insertion, order) as a smoker.
#
# The previous version located the row with two independent
# ROW_NUMBER() OVER (ORDER BY 1) subqueries, one for total_bill and one for tip,
# and then matched rows by those two values. That had two problems:
#   * the two subqueries are numbered independently, so nothing guarantees they
#     pick the same physical row, and `ORDER BY 1` does not give a dependable
#     row order to begin with;
#   * matching on (total_bill, tip) values updates every row that happens to
#     share both values, not just the intended one.
# Selecting by rowid targets exactly one row deterministically.
# NOTE(review): assumes `tips` is an ordinary rowid table (no WITHOUT ROWID),
# as is expected for a table created by DataFrame.to_sql — confirm.
target_row = 10  # 1-based position of the row to update

update_query = """
UPDATE tips
SET smoker = 'Yes'
WHERE rowid = (
    SELECT rowid
    FROM tips
    ORDER BY rowid
    LIMIT 1 OFFSET ?
);
"""

# OFFSET is 0-based, hence target_row - 1. If the table has fewer rows than
# target_row the subquery yields NULL and no row is updated.
cursor.execute(update_query, (target_row - 1,))
conn.commit()

In [5]:
# Remove every row whose total_bill is below 10, then persist the change.
# The statement is assembled line by line; the leading and trailing empty
# entries reproduce the newlines that surround the SQL text.
delete_query = "\n".join([
    "",
    "DELETE FROM tips",
    "WHERE total_bill < 10;",
    "",
])

cursor.execute(delete_query)
conn.commit()