## TODO

average tip percentage 的计算方式：mean(tip/bill) 还是 sum(tip)/sum(bill), 这两个数值应该不一样，最好找助教或者教授确认一下。

## Helper functions

In [1]:
import json
from typing import Any


def pretty_print(data: Any):
    """
    print data in a pretty way that can be viewed more clean.
    """

    print(json.dumps(data, ensure_ascii=False, indent=4))

## CREATE

### Create Database

In [2]:
# Just in case: remove old database file
# NOTE: Maybe we can use memory database instead
!rm XKJX.db

In [3]:
import sqlite3

con = sqlite3.connect("XKJX.db")

### Create table

In [4]:
# table header of csv (for field name reference): total_bill,tip,sex,smoker,day,time,size
# data sample of csv (for value type reference): 16.99,1.01,Female,No,Sun,Dinner,2
sql = """
    CREATE TABLE tips (
        id INTEGER PRIMARY KEY,
        total_bill FLOAT,
        tip FLOAT,
        sex CHAR,
        smoker CHAR,
        day CHAR,
        time CHAR,
        size INT
    );
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't create table: ", e)
else:
    print("Success: table created!")

Success: table created!


### Insert data

Insert data item using the data from the sample .csv file.

In [5]:
import csv

sql = """
    INSERT INTO
        tips
    VALUES
        (
            NULL,
            :total_bill,
            :tip,
            :sex,
            :smoker,
            :day,
            :time,
            :size
        )
    ;
"""

with open("data/tips.csv") as fd:
    reader = csv.DictReader(fd)
    for row in reader:
        # insert into table
        try:
            with con:
                con.execute(sql, row)
        except sqlite3.Error as e:
            print("couldn't insert data item: ", e)

print("Success: all data inserted!")

Success: all data inserted!


## READ

### 1. Retrieve the average tip percentage for each day of the week 

In [6]:
sql = """
    SELECT
        AVG(tip/total_bill) * 100 AS tip_percentage,
        day
    FROM
        tips
    GROUP BY
        day
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for avg_tip_percentage, day in res.fetchall():
        data_point = {"day": day, "avg_tip_percentage": avg_tip_percentage}
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "avg_tip_percentage": 16.991302873347887
    },
    {
        "day": "Sat",
        "avg_tip_percentage": 15.31517163877781
    },
    {
        "day": "Sun",
        "avg_tip_percentage": 16.689728635113457
    },
    {
        "day": "Thur",
        "avg_tip_percentage": 16.127563396664705
    }
]


### 2. Find the maximum and minimum total bull amounts

In [7]:
sql = """
    SELECT
        MIN(total_bill) AS min_bill,
        MAX(total_bill) AS max_bill
    FROM
        tips
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    min_bill, max_bill = res.fetchone()
    data_point = {"min_bill": min_bill, "max_bill": max_bill}
    pretty_print(data_point)

{
    "min_bill": 3.07,
    "max_bill": 50.81
}


### 3. Count the number of parties for each size

In [8]:
sql = """
    SELECT
        size,
        COUNT(*) AS party_num
    FROM
        tips 
    GROUP BY
        size
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for size, num in res.fetchall():
        data_point = {"party_size": size, "num": num}
        data.append(data_point)

    pretty_print(data)

[
    {
        "party_size": 1,
        "num": 4
    },
    {
        "party_size": 2,
        "num": 156
    },
    {
        "party_size": 3,
        "num": 38
    },
    {
        "party_size": 4,
        "num": 37
    },
    {
        "party_size": 5,
        "num": 5
    },
    {
        "party_size": 6,
        "num": 4
    }
]


### 4. Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%

In [9]:
sql = """
    SELECT
        SUM(total_bill),
        SUM(tip)
    FROM
        tips 
    WHERE
        size >=4
        AND (tip / total_bill) * 100 > 15
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    bill, tip = res.fetchone()
    data_point = {"total_bill": bill, "total_tip": tip}
    pretty_print(data_point)

{
    "total_bill": 471.03000000000003,
    "total_tip": 90.06
}


### 5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order

In [10]:
sql = """
    SELECT
        day,
        time,
        SUM(total_bill),
        SUM(tip),
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        day,
        time
    ORDER BY
        AVG(tip/total_bill) DESC
    ;
"""


try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, bill, total_tip, tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "total_bill": bill,
            "total_tip": total_tip,
            "tip_precentage": tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Lunch",
        "total_bill": 89.92,
        "total_tip": 16.68,
        "tip_precentage": 18.876488829484337
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "total_bill": 1627.1600000000003,
        "total_tip": 247.39000000000007,
        "tip_precentage": 16.689728635113457
    },
    {
        "day": "Thur",
        "time": "Lunch",
        "total_bill": 1077.5499999999997,
        "total_tip": 168.83,
        "tip_precentage": 16.1300736016171
    },
    {
        "day": "Thur",
        "time": "Dinner",
        "total_bill": 18.78,
        "total_tip": 3.0,
        "tip_precentage": 15.974440894568689
    },
    {
        "day": "Fri",
        "time": "Dinner",
        "total_bill": 235.95999999999998,
        "total_tip": 35.28,
        "tip_precentage": 15.891611065601627
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "total_bill": 1778.3999999999996,
        "total_tip": 260.4,
        "t

### 6. Find the average tip percentage for each combination of day, time, and smoker status

In [11]:
sql = """
    SELECT
        day,
        time,
        smoker,
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        day,
        time,
        smoker
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, smoker, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "smoker": smoker,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 13.962236590021224
    },
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 16.534735890795094
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "No",
        "avg_tip_percentage": 18.773466833541928
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "Yes",
        "avg_tip_percentage": 18.893659162141404
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.804765754377618
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 14.790606514920887
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 16.011294305072806
    },
    {
        "day": "Sun",
        "time": "Dinner",
     

### 7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records

In [12]:
sql = """
    SELECT 
        sex,
        total_bill,
        tip,
        (tip / total_bill) * 100 AS tip_percentage
    FROM 
        tips
    WHERE
        sex=:sex
    ORDER BY 
        total_bill DESC
    LIMIT
        5
    ;
"""


for sex in ("Male", "Female"):

    try:
        with con:
            res = con.execute(sql, {"sex": sex})
    except sqlite3.Error as e:
        print("couldn't execute sql: ", e)
    else:
        data = []

        for sex, total_bill, tip, tip_percentage in res.fetchall():
            data_point = {
                "sex": sex,
                "total_bill": total_bill,
                "tip": tip,
                "tip_percentage": tip_percentage,
            }
            data.append(data_point)

        pretty_print(data)

[
    {
        "sex": "Male",
        "total_bill": 50.81,
        "tip": 10.0,
        "tip_percentage": 19.681165124975397
    },
    {
        "sex": "Male",
        "total_bill": 48.33,
        "tip": 9.0,
        "tip_percentage": 18.6219739292365
    },
    {
        "sex": "Male",
        "total_bill": 48.27,
        "tip": 6.73,
        "tip_percentage": 13.942407292314066
    },
    {
        "sex": "Male",
        "total_bill": 48.17,
        "tip": 5.0,
        "tip_percentage": 10.379904504878555
    },
    {
        "sex": "Male",
        "total_bill": 45.35,
        "tip": 3.5,
        "tip_percentage": 7.717750826901875
    }
]
[
    {
        "sex": "Female",
        "total_bill": 44.3,
        "tip": 2.5,
        "tip_percentage": 5.643340857787811
    },
    {
        "sex": "Female",
        "total_bill": 43.11,
        "tip": 5.0,
        "tip_percentage": 11.59823706796567
    },
    {
        "sex": "Female",
        "total_bill": 35.83,
        "tip": 4.67,
    

### 8. Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount

In [13]:
sql_percentage = """
    SELECT
        day,
        time,
        MIN(tip/total_bill) * 100 AS min_percentage,
        MAX(tip/total_bill) * 100 AS max_percentage
    FROM
        tips
    GROUP BY
        day,
        time
    ;
"""

sql_bill_and_tip = """
    SELECT
        total_bill,
        tip
    FROM
        tips
    WHERE
        ABS((tip / total_bill) * 100 - :target_percentage) < 0.00000001
        AND day = :day
        AND time = :time
    ;
"""

data = []

try:
    with con:
        res = con.execute(sql_percentage)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    for day, time, min_percentage, max_percentage in res.fetchall():
        try:
            with con:
                res = con.execute(
                    sql_bill_and_tip,
                    {"target_percentage": min_percentage, "day": day, "time": time},
                )

                min_total_bill, min_tip = res.fetchone()
        except sqlite3.Error as e:
            print("couldn't execute sql: ", e)
            raise e

        try:
            with con:
                res = con.execute(
                    sql_bill_and_tip,
                    {"target_percentage": max_percentage, "day": day, "time": time},
                )

            max_total_bill, max_tip = res.fetchone()
        except sqlite3.Error as e:
            print("couldn't execute sql: ", e)
            raise e

        data_point = {
            "day": day,
            "time": time,
            "min_tip_percentage": min_percentage,
            "min_corresponding_total_bill": min_total_bill,
            "min_corresponding_tip": min_tip,
            "max_tip_percentage": max_percentage,
            "max_corresponding_total_bill": max_total_bill,
            "max_corresponding_tip": max_tip,
        }

        data.append(data_point)

pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Dinner",
        "min_tip_percentage": 10.355540214014498,
        "min_corresponding_total_bill": 28.97,
        "min_corresponding_tip": 3.0,
        "max_tip_percentage": 26.348039215686274,
        "max_corresponding_total_bill": 16.32,
        "max_corresponding_tip": 4.3
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "min_tip_percentage": 11.773472429210134,
        "min_corresponding_total_bill": 13.42,
        "min_corresponding_tip": 1.58,
        "max_tip_percentage": 25.93144560357675,
        "max_corresponding_total_bill": 13.42,
        "max_corresponding_tip": 3.48
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "min_tip_percentage": 3.563813585135547,
        "min_corresponding_total_bill": 32.83,
        "min_corresponding_tip": 1.17,
        "max_tip_percentage": 32.57328990228013,
        "max_corresponding_total_bill": 3.07,
        "max_corresponding_tip": 1.0
    },
    {
    

### 9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100

In [14]:
# TODO: I don't know what exactly the question is asking about, the statement is ambiguous here.

sql = """
    SELECT
        SUM(total_bill),
        SUM(tip),
        AVG(tip / total_bill) * 100
    FROM
        tips 
    WHERE
        size >=4
        AND (tip / total_bill) * 100 > 15
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for bill, tip, tip_percentage in res.fetchall():
        data_point = {
            "bill": bill,
            "tip": tip,
            "tip_percentage": tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "bill": 471.03000000000003,
        "tip": 90.06,
        "tip_percentage": 19.254073596963856
    }
]


### 10. Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records

In [15]:
sql = """
    SELECT
        day,
        time,
        smoker,
        AVG(tip/total_bill) * 100 AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        day,
        time,
        smoker
    HAVING
        COUNT(*) > 5
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, smoker, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "smoker": smoker,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 16.534735890795094
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "Yes",
        "avg_tip_percentage": 18.893659162141404
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.804765754377618
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 14.790606514920887
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 16.011294305072806
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 18.725031625235424
    },
    {
        "day": "Thur",
        "time": "Lunch",
        "smoker": "No",
        "avg_tip_percentage": 16.031066524359844
    },
    {
        "day": "Thur",
        "time": "Lunch",
   

### 11. (additional queries) average daily bill and average tip percentage of each day

In [16]:
sql = """
    SELECT
        day,
        SUM(total_bill),
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        day
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, sum_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "sum_total_bill": sum_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "sum_total_bill": 325.87999999999994,
        "avg_tip_percentage": 16.991302873347887
    },
    {
        "day": "Sat",
        "sum_total_bill": 1778.3999999999996,
        "avg_tip_percentage": 15.31517163877781
    },
    {
        "day": "Sun",
        "sum_total_bill": 1627.1600000000003,
        "avg_tip_percentage": 16.689728635113457
    },
    {
        "day": "Thur",
        "sum_total_bill": 1096.3299999999997,
        "avg_tip_percentage": 16.127563396664705
    }
]


### 12. (additional queries) average tip percentage of bills below (exclusive) $25 and above (inclusive) $25

In [17]:
sql = """
    SELECT
        total_bill >= 25 as bill_above_25,
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        total_bill >= 25
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for bill_above_25, avg_tip_percentage in res.fetchall():
        data_point = {
            "bill_above_25": bill_above_25,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "bill_above_25": 0,
        "avg_tip_percentage": 16.93724971937115
    },
    {
        "bill_above_25": 1,
        "avg_tip_percentage": 13.203215121202474
    }
]


### 13. (additional queries) average bill and average tip percentage of smoker and non-smoker

In [18]:
sql = """
    SELECT
        smoker,
        AVG(total_bill),
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        smoker
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for smoker, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "smoker": smoker,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "smoker": "No",
        "avg_total_bill": 19.18827814569537,
        "avg_tip_percentage": 15.932846217921531
    },
    {
        "smoker": "Yes",
        "avg_total_bill": 20.756344086021507,
        "avg_tip_percentage": 16.31960446368779
    }
]


### 14. (additional queries) average bill and average tip percentage of male and female

In [19]:
sql = """
    SELECT
        sex,
        AVG(total_bill),
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        sex
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for sex, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "sex": sex,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "sex": "Female",
        "avg_total_bill": 18.056896551724137,
        "avg_tip_percentage": 16.649073632892474
    },
    {
        "sex": "Male",
        "avg_total_bill": 20.744076433121034,
        "avg_tip_percentage": 15.765054700429747
    }
]


### 15. (additional queries) average bill and average tip percentage of different party size

In [20]:
sql = """
    SELECT
        size,
        AVG(total_bill),
        AVG(tip/total_bill) * 100
    FROM
        tips
    GROUP BY
        size
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for size, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "party_size": size,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "party_size": 1,
        "avg_total_bill": 7.2425,
        "avg_tip_percentage": 21.72920154872781
    },
    {
        "party_size": 2,
        "avg_total_bill": 16.448012820512833,
        "avg_tip_percentage": 16.571919173482897
    },
    {
        "party_size": 3,
        "avg_total_bill": 23.27763157894737,
        "avg_tip_percentage": 15.21568547371183
    },
    {
        "party_size": 4,
        "avg_total_bill": 28.61351351351351,
        "avg_tip_percentage": 14.594900639351332
    },
    {
        "party_size": 5,
        "avg_total_bill": 30.068,
        "avg_tip_percentage": 14.149548965142023
    },
    {
        "party_size": 6,
        "avg_total_bill": 34.83,
        "avg_tip_percentage": 15.622920072028379
    }
]


## UPDATE

It was determined that there was an error in the database. Please update the record that corresponds to id=10 and set smoker to Yes. 

In [21]:
sql = """
    UPDATE
        tips
    SET
        smoker='Yes'
    WHERE
        id=10
    ;
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't update data: ", e)
else:
    print("Success: data updated!")

Success: data updated!


## DELETE

Delete records from the database that have a total bill that is less than $10. 

In [22]:
sql = """
    DELETE FROM
        tips
    WHERE
        total_bill < 10
    ;
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't delete data: ", e)
else:
    print("Success: data deleted!")

Success: data deleted!


## Close connection

In [23]:
# at the end, we need to close the connection
con.close()